PyBRAID 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
braid/__init__.py ADDED
File without changes
braid/cli.py ADDED
@@ -0,0 +1,156 @@
1
+ import sys
2
+ import argparse
3
+ import os
4
+ import pysam
5
+ import logging
6
+
7
+ from .utils import setup_logging, CheckpointManager
8
+ from .modifier import SequenceModifier
9
+ from .protein import ProteinAnalyzer
10
+ from .genome import GenomeProcessor
11
+ from .vcf import VCFProcessor
12
+ from .output import ResultsOutputter
13
+
14
+ try:
15
+ from importlib import resources
16
+ except ImportError:
17
+ import importlib_resources as resources
18
+
19
def get_test_data_path(filename):
    """Return the filesystem path of a bundled test data file as a string.

    The file is looked up inside the ``data`` directory that ships with the
    ``braid`` package.

    Args:
        filename: Name of a file inside the package's ``data`` directory,
            e.g. ``"test.fasta"``.

    Returns:
        The path as a ``str``. The path is not checked for existence.
    """
    try:
        # Preferred API: resources.files() returns a traversable handle for
        # the package, from which the data file is addressed.
        data_path = resources.files('braid').joinpath('data').joinpath(filename)
        return str(data_path)
    except (AttributeError, ImportError):
        # AttributeError: the resources module in use has no files().
        # ImportError: the package is not importable under the name 'braid'.
        #
        # The legacy resources.path() API is not usable here: it does not
        # accept a directory such as 'data' as the resource, and the path it
        # yields is only guaranteed to be valid inside its ``with`` block.
        # The data directory sits next to this module, so resolve it from
        # __file__ instead.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(module_dir, 'data', filename)
26
+
27
def run_test():
    """Run the pipeline on the test data bundled with the package.

    Resolves the packaged ``test.vcf.gz``, ``test.fasta`` and ``test.gff3``
    files and hands them to :func:`main` as the ``-v``, ``-r`` and ``-g``
    arguments. Every other option keeps its default value.

    Raises:
        SystemExit: With status 1 when :func:`main` raises an unexpected
            exception, so that a failed self-test is visible to the calling
            shell. A ``SystemExit`` raised inside :func:`main` is not caught
            here and propagates unchanged.
    """
    test_args = [
        "-v", get_test_data_path("test.vcf.gz"),
        "-r", get_test_data_path("test.fasta"),
        "-g", get_test_data_path("test.gff3"),
    ]
    try:
        main(test_args)
    except Exception as e:
        import traceback

        print(f"\n Failed: {e}")
        traceback.print_exc()
        # Report the failure through the exit status as well; returning
        # normally here would make a failed test run look successful.
        sys.exit(1)
42
+
43
def _build_parser():
    """Create the argument parser for the command-line interface."""
    parser = argparse.ArgumentParser(
        description="Analyzes phased VCF data to predict variant effects on protein sequences for each haplotype.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('-g','--gff', help='Input GFF3 file.')
    parser.add_argument('-r', '--reference', help='Reference genome FASTA file (indexed: .fai).')
    parser.add_argument('-v', '--vcf', help='Phased VCF file (indexed: .tbi/.csi).')
    parser.add_argument('-o', '--output', default='variant_analysis_output.tsv', help='Output TSV file name.')
    parser.add_argument('--force-unphased', action='store_true', help='Skip the phasing check and force the script to run on a potentially unphased VCF file.')
    parser.add_argument('--ignore-intron', action='store_true', help='Ignore mutations marked only as intron.')
    parser.add_argument('-s', '--sample', help='Path to file with specific sample IDs to analyze (one per line, no header).')
    parser.add_argument('--gene', help='Path to file with specific gene IDs to analyze (one per line, no header).')
    parser.add_argument('--lof-threshold', type=float, default=0.30, help="LOF classification threshold.")
    parser.add_argument('--resume', action='store_true', help='Resume from the last successfully processed gene found in the log.')

    subparsers = parser.add_subparsers(dest="command", help="Available commands")
    subparsers.add_parser("test", help="Run tests using built-in data.")
    return parser


def _load_target_gene_ids(path):
    """Read gene IDs from *path*, one per line, skipping blank lines."""
    with open(path, 'r') as f:
        return {line.strip() for line in f if line.strip()}


def main(args_list=None):
    """Command-line entry point of the variant analysis pipeline.

    Parses the arguments, derives the log, alignment and sample-matrix file
    names from the output name, builds the genome model from the GFF and
    FASTA inputs, and then processes the selected genes one by one against
    the VCF, writing results as it goes and recording each finished gene
    through the checkpoint manager.

    Args:
        args_list: Argument strings to parse. ``None`` lets ``argparse``
            read the process command line.

    Returns:
        None.

    Raises:
        SystemExit: Status 1 when ``-g``/``-r``/``-v`` are missing, when the
            output name would collide with the derived log file, when the
            ``--gene`` file does not exist, or when the pipeline raises an
            exception. Status 0 when a ``--gene`` list matches no gene in
            the annotation.
    """
    parser = _build_parser()
    args = parser.parse_args(args_list)

    if args.command == "test":
        run_test()
        return
    if not args.gff or not args.reference or not args.vcf:
        parser.print_help()
        logging.error("Error: Arguments -g, -r, and -v are required for analysis.")
        sys.exit(1)

    # All side files share the output name with its extension replaced.
    output_basename = os.path.splitext(args.output)[0]
    args.log = f"{output_basename}.log"

    # An output name ending in ".log" maps onto the log file itself; the
    # logger and the results writer would then write to the same file.
    log_target = os.path.normcase(os.path.abspath(args.log))
    output_target = os.path.normcase(os.path.abspath(args.output))
    if log_target == output_target:
        logging.error(
            f"Error: Output file '{args.output}' is also the log file name; "
            "choose an output name that does not end in '.log'."
        )
        sys.exit(1)

    setup_logging(args.log)

    args.align = f"{output_basename}.alignment.txt"
    args.sample_matrix = f"{output_basename}.sample.txt"

    ckpt_manager = CheckpointManager(args.log)
    open_mode = 'w'

    if args.resume:
        logging.info("Received (Resume Mode)...")
        ckpt_manager.load_completed_genes()
        # NOTE(review): presumably trims a partially written trailing record
        # from each file before appending - confirm in CheckpointManager.
        ckpt_manager.truncate_if_incomplete(args.output, file_type='tsv')
        ckpt_manager.truncate_if_incomplete(args.sample_matrix, file_type='tsv')
        ckpt_manager.truncate_if_incomplete(args.align, file_type='align')

        # Keep what earlier runs wrote and continue after it.
        open_mode = 'a'
    else:
        logging.info("Start new task (Start from scratch)...")

    try:
        genome = GenomeProcessor(args.gff, args.reference)
        genome.load_reference_sequences()
        genome.parse_gff()
        genome.extract_region_sequences()
        genome.assemble_cds_sequences()

        all_genes = list(genome.gene_data.items())
        genes_to_process = all_genes
        if args.gene:
            if not os.path.exists(args.gene):
                logging.error(f"Gene list file not found: {args.gene}")
                sys.exit(1)
            logging.info(f"Loading target genes from {args.gene}...")
            target_gene_ids = _load_target_gene_ids(args.gene)
            genes_to_process = [(gid, obj) for gid, obj in all_genes if gid in target_gene_ids]
            logging.info(f"Target genes loaded: {len(target_gene_ids)}. Found in GFF: {len(genes_to_process)}.")
            if not genes_to_process:
                logging.warning("No target genes found in the GFF file! Please check IDs.")
                sys.exit(0)

        vcf_processor = VCFProcessor(genome, args.vcf, args.force_unphased, args.sample, args.ignore_intron)
        sequence_modifier = SequenceModifier(genome)
        protein_analyzer = ProteinAnalyzer(genome, args.lof_threshold)

        results_outputter = ResultsOutputter(genome, vcf_processor, sequence_modifier, protein_analyzer)

        # When appending to a non-empty results file, mark the header as
        # already written before write_header() is called below.
        if args.resume and os.path.exists(args.output) and os.path.getsize(args.output) > 0:
            results_outputter.header_written = True

        # buffering=1 selects line buffering for these text-mode files.
        with pysam.VariantFile(args.vcf) as vcf_in, \
             open(args.output, open_mode, buffering=1) as out_f, \
             open(args.align, open_mode, buffering=1) as align_f, \
             open(args.sample_matrix, open_mode, buffering=1) as sample_f:

            vcf_processor.load_and_validate_samples(vcf_in)
            results_outputter.write_header(out_f, sample_f)
            logging.info(f"Starting pipeline. Total genes: {len(genes_to_process)}")

            for i, (gene_id, gene_obj) in enumerate(genes_to_process, 1):
                # Genes recorded as complete by the checkpoint manager are
                # not processed again.
                if gene_id in ckpt_manager.completed_genes:
                    continue
                logging.info(f"Processing Gene {i}/{len(genes_to_process)}: {gene_id}...")

                gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
                if gene_variant_combinations:
                    results_outputter.analyze_and_write_gene_results(gene_obj, gene_variant_combinations, out_f, align_f, sample_f)

                # Flush all outputs before the gene is recorded as complete.
                out_f.flush()
                align_f.flush()
                sample_f.flush()

                ckpt_manager.log_completion(gene_id)

        logging.info("Pipeline finished successfully.")

    except Exception as e:
        # sys.exit() calls above raise SystemExit, which is not an Exception
        # subclass and therefore passes through this handler untouched.
        logging.error(f"Pipeline failed: {e}", exc_info=True)
        sys.exit(1)
153
+
154
+ if __name__ == "__main__":
155
+ main()
156
+
braid/data/test.fasta ADDED
@@ -0,0 +1,3 @@
1
+ >1
2
+ CGTACGTAGCTATGAGCTTAGCTAGCTCAGCTAACGATGTCGTTAAGTAGATGATCGATC
3
+ GATCGATCGATCGATCGGTCGATCGATCAGATCGATCGATCGATCGATCGATCGTGAC
@@ -0,0 +1 @@
1
+ 1 118 3 60 61
braid/data/test.gff3 ADDED
@@ -0,0 +1,15 @@
1
+ 1 ensembl gene 2 117 . + . ID=gene1;biotype=protein_coding;gene_id=gene1
2
+ 1 ensembl mRNA 2 117 . + . ID=transcript1;Parent=gene1;biotype=protein_coding;transcript_id=transcript1
3
+ 1 ensembl five_prime_UTR 2 11 . + . Parent=transcript1
4
+ 1 ensembl exon 2 38 . + . Parent=transcript1;Name=exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon1;rank=1;version=1
5
+ 1 ensembl CDS 12 38 . + 0 ID=CDS:protein1;Parent=transcript1;protein_id=protein1
6
+ 1 ensembl exon 51 77 . + . Parent=transcript1;Name=exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon2;rank=2;version=1
7
+ 1 ensembl CDS 51 77 . + 0 ID=CDS:protein1;Parent=transcript1;protein_id=protein1
8
+ 1 ensembl exon 91 117 . + . Parent=transcript1;Name=exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon3;rank=3;version=1
9
+ 1 ensembl CDS 91 117 . + 0 ID=CDS:protein1;Parent=transcript1;protein_id=protein1
10
+ 1 ensembl mRNA 2 77 . + . ID=transcript2;Parent=gene1;biotype=protein_coding;transcript_id=transcript2
11
+ 1 ensembl five_prime_UTR 2 11 . + . Parent=transcript1
12
+ 1 ensembl exon 2 38 . + . Parent=transcript2;Name=exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon1;rank=1;version=1
13
+ 1 ensembl CDS 12 38 . + 0 ID=CDS:protein2;Parent=transcript2;protein_id=protein2
14
+ 1 ensembl exon 91 117 . + . Parent=transcript2;Name=exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon3;rank=2;version=1
15
+ 1 ensembl CDS 91 117 . + 0 ID=CDS:protein2;Parent=transcript2;protein_id=protein2
braid/data/test.vcf ADDED
@@ -0,0 +1,8 @@
1
+ ##fileformat=VCFv4.2
2
+ ##contig=<ID=1>
3
+ ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count">
4
+ ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles">
5
+ ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3 sample4
7
+ 1 17 . CTTAG C . PASS . GT 0|1 1|0 1|1 0|0
8
+ 1 27 . T TT . PASS . GT 0|1 1|0 1|1 0|0
braid/data/test.vcf.gz ADDED
Binary file
Binary file
@@ -0,0 +1,20 @@
1
+ Haplotype_ID: transcript1:1
2
+ Gene: gene1 | mRNA: transcript1
3
+ Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
4
+ Variant_Type: NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion||||||||||||||
5
+ Protein_Changes: Del(5)S
6
+ Alignment:
7
+ Ref: MSLASSANDMIDRSIDRSIDRSIDRS*
8
+ |||| ||||||||||||||||||||||
9
+ Alt: MSLA-SANDMIDRSIDRSIDRSIDRS*
10
+
11
+ Haplotype_ID: transcript2:1
12
+ Gene: gene1 | mRNA: transcript2
13
+ Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
14
+ Variant_Type: NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18)|||deletion||||||||||||||
15
+ Protein_Changes: Del(5)S
16
+ Alignment:
17
+ Ref: MSLASSANDIDRSIDRS*
18
+ |||| |||||||||||||
19
+ Alt: MSLA-SANDIDRSIDRS*
20
+
@@ -0,0 +1,191 @@
1
+ 2026-01-16 15:41:45,245 - INFO - Logging configured. Log file: variant_analysis_output.log
2
+ 2026-01-16 15:41:45,246 - INFO - Start new task (Start from scratch)...
3
+ 2026-01-16 15:41:45,246 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
4
+ 2026-01-16 15:41:45,246 - INFO - Reference sequences loaded.
5
+ 2026-01-16 15:41:45,246 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff
6
+ 2026-01-16 15:41:45,246 - ERROR - Pipeline failed: [Errno 2] No such file or directory: '/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff'
7
+ Traceback (most recent call last):
8
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 95, in main
9
+ genome.parse_gff()
10
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/genome.py", line 31, in parse_gff
11
+ with open(self.gff_file, 'r') as f:
12
+ FileNotFoundError: [Errno 2] No such file or directory: '/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff'
13
+ 2026-01-16 15:43:18,538 - INFO - Logging configured. Log file: variant_analysis_output.log
14
+ 2026-01-16 15:43:18,539 - INFO - Start new task (Start from scratch)...
15
+ 2026-01-16 15:43:18,539 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
16
+ 2026-01-16 15:43:18,539 - INFO - Reference sequences loaded.
17
+ 2026-01-16 15:43:18,539 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
18
+ 2026-01-16 15:43:18,539 - INFO - Detected GFF3 format.
19
+ 2026-01-16 15:43:18,540 - INFO - GFF parsing complete.
20
+ 2026-01-16 15:43:18,540 - INFO - Extracting sequences for GFF regions.
21
+ 2026-01-16 15:43:18,540 - INFO - Region sequence extraction complete.
22
+ 2026-01-16 15:43:18,540 - INFO - Assembling CDS sequences for mRNAs.
23
+ 2026-01-16 15:43:18,540 - INFO - CDS sequence assembly complete.
24
+ 2026-01-16 15:43:18,540 - ERROR - Pipeline failed: name 'PairwiseAligner' is not defined
25
+ Traceback (most recent call last):
26
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 115, in main
27
+ sequence_modifier = SequenceModifier(genome)
28
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/modifier.py", line 11, in __init__
29
+ self.aligner = PairwiseAligner()
30
+ NameError: name 'PairwiseAligner' is not defined
31
+ 2026-01-16 15:45:26,079 - INFO - Logging configured. Log file: variant_analysis_output.log
32
+ 2026-01-16 15:45:26,080 - INFO - Start new task (Start from scratch)...
33
+ 2026-01-16 15:45:26,080 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
34
+ 2026-01-16 15:45:26,080 - INFO - Reference sequences loaded.
35
+ 2026-01-16 15:45:26,080 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
36
+ 2026-01-16 15:45:26,080 - INFO - Detected GFF3 format.
37
+ 2026-01-16 15:45:26,080 - INFO - GFF parsing complete.
38
+ 2026-01-16 15:45:26,081 - INFO - Extracting sequences for GFF regions.
39
+ 2026-01-16 15:45:26,081 - INFO - Region sequence extraction complete.
40
+ 2026-01-16 15:45:26,081 - INFO - Assembling CDS sequences for mRNAs.
41
+ 2026-01-16 15:45:26,081 - INFO - CDS sequence assembly complete.
42
+ 2026-01-16 15:45:26,082 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
43
+ 2026-01-16 15:45:26,082 - ERROR - Pipeline failed: name 'pysam' is not defined
44
+ Traceback (most recent call last):
45
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 123, in main
46
+ with pysam.VariantFile(args.vcf) as vcf_in, \
47
+ NameError: name 'pysam' is not defined
48
+ 2026-01-16 15:46:22,451 - INFO - Logging configured. Log file: variant_analysis_output.log
49
+ 2026-01-16 15:46:22,451 - INFO - Start new task (Start from scratch)...
50
+ 2026-01-16 15:46:22,451 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
51
+ 2026-01-16 15:46:22,455 - INFO - Reference sequences loaded.
52
+ 2026-01-16 15:46:22,455 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
53
+ 2026-01-16 15:46:22,455 - INFO - Detected GFF3 format.
54
+ 2026-01-16 15:46:22,455 - INFO - GFF parsing complete.
55
+ 2026-01-16 15:46:22,455 - INFO - Extracting sequences for GFF regions.
56
+ 2026-01-16 15:46:22,456 - INFO - Region sequence extraction complete.
57
+ 2026-01-16 15:46:22,456 - INFO - Assembling CDS sequences for mRNAs.
58
+ 2026-01-16 15:46:22,456 - INFO - CDS sequence assembly complete.
59
+ 2026-01-16 15:46:22,457 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
60
+ 2026-01-16 15:46:22,458 - INFO - Loading and validating VCF samples...
61
+ 2026-01-16 15:46:22,459 - ERROR - Pipeline failed: name 'os' is not defined
62
+ Traceback (most recent call last):
63
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 129, in main
64
+ vcf_processor.load_and_validate_samples(vcf_in)
65
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 28, in load_and_validate_samples
66
+ if not os.path.exists(vcf_path + ".tbi") and not os.path.exists(vcf_path + ".csi"):
67
+ NameError: name 'os' is not defined
68
+ 2026-01-16 15:47:15,122 - INFO - Logging configured. Log file: variant_analysis_output.log
69
+ 2026-01-16 15:47:15,122 - INFO - Start new task (Start from scratch)...
70
+ 2026-01-16 15:47:15,123 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
71
+ 2026-01-16 15:47:15,123 - INFO - Reference sequences loaded.
72
+ 2026-01-16 15:47:15,123 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
73
+ 2026-01-16 15:47:15,123 - INFO - Detected GFF3 format.
74
+ 2026-01-16 15:47:15,123 - INFO - GFF parsing complete.
75
+ 2026-01-16 15:47:15,123 - INFO - Extracting sequences for GFF regions.
76
+ 2026-01-16 15:47:15,124 - INFO - Region sequence extraction complete.
77
+ 2026-01-16 15:47:15,124 - INFO - Assembling CDS sequences for mRNAs.
78
+ 2026-01-16 15:47:15,124 - INFO - CDS sequence assembly complete.
79
+ 2026-01-16 15:47:15,125 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
80
+ 2026-01-16 15:47:15,126 - INFO - Loading and validating VCF samples...
81
+ 2026-01-16 15:47:15,126 - ERROR - Index tbi/csi is missing, run 'bcftools index /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.vcf' to produce index
82
+ 2026-01-16 15:47:15,126 - ERROR - Pipeline failed: name 'sys' is not defined
83
+ Traceback (most recent call last):
84
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 129, in main
85
+ vcf_processor.load_and_validate_samples(vcf_in)
86
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 33, in load_and_validate_samples
87
+ sys.exit(1)
88
+ NameError: name 'sys' is not defined
89
+ 2026-01-16 15:48:10,135 - INFO - Logging configured. Log file: variant_analysis_output.log
90
+ 2026-01-16 15:48:10,135 - INFO - Start new task (Start from scratch)...
91
+ 2026-01-16 15:48:10,135 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
92
+ 2026-01-16 15:48:10,135 - INFO - Reference sequences loaded.
93
+ 2026-01-16 15:48:10,135 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
94
+ 2026-01-16 15:48:10,135 - INFO - Detected GFF3 format.
95
+ 2026-01-16 15:48:10,136 - INFO - GFF parsing complete.
96
+ 2026-01-16 15:48:10,136 - INFO - Extracting sequences for GFF regions.
97
+ 2026-01-16 15:48:10,136 - INFO - Region sequence extraction complete.
98
+ 2026-01-16 15:48:10,136 - INFO - Assembling CDS sequences for mRNAs.
99
+ 2026-01-16 15:48:10,136 - INFO - CDS sequence assembly complete.
100
+ 2026-01-16 15:48:10,137 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
101
+ 2026-01-16 15:48:10,138 - INFO - Loading and validating VCF samples...
102
+ 2026-01-16 15:48:10,138 - ERROR - Index tbi/csi is missing, run 'bcftools index /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.vcf' to produce index
103
+ 2026-01-16 15:49:38,190 - INFO - Logging configured. Log file: variant_analysis_output.log
104
+ 2026-01-16 15:49:38,190 - INFO - Start new task (Start from scratch)...
105
+ 2026-01-16 15:49:38,190 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
106
+ 2026-01-16 15:49:38,191 - INFO - Reference sequences loaded.
107
+ 2026-01-16 15:49:38,191 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
108
+ 2026-01-16 15:49:38,191 - INFO - Detected GFF3 format.
109
+ 2026-01-16 15:49:38,191 - INFO - GFF parsing complete.
110
+ 2026-01-16 15:49:38,191 - INFO - Extracting sequences for GFF regions.
111
+ 2026-01-16 15:49:38,191 - INFO - Region sequence extraction complete.
112
+ 2026-01-16 15:49:38,191 - INFO - Assembling CDS sequences for mRNAs.
113
+ 2026-01-16 15:49:38,191 - INFO - CDS sequence assembly complete.
114
+ 2026-01-16 15:49:38,193 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
115
+ 2026-01-16 15:49:38,194 - INFO - Loading and validating VCF samples...
116
+ 2026-01-16 15:49:38,194 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
117
+ 2026-01-16 15:49:38,194 - INFO - Checking VCF for phased genotype information...
118
+ 2026-01-16 15:49:38,194 - INFO - Phasing check passed. VCF appears to be phased.
119
+ 2026-01-16 15:49:38,194 - INFO - Starting pipeline. Total genes: 1
120
+ 2026-01-16 15:49:38,194 - INFO - Processing Gene 1/1: gene1...
121
+ 2026-01-16 15:49:38,195 - ERROR - Pipeline failed: name 'Mutation' is not defined
122
+ Traceback (most recent call last):
123
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 138, in main
124
+ gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
125
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 139, in process_variants_for_gene
126
+ mut_obj = self._create_mutation_from_record(record, alleles[allele_idx], labels)
127
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 92, in _create_mutation_from_record
128
+ return Mutation(mutation_id, chrom, record.pos, ref, alt, mut_type, False, labels)
129
+ NameError: name 'Mutation' is not defined
130
+ 2026-01-16 18:10:14,143 - INFO - Logging configured. Log file: variant_analysis_output.log
131
+ 2026-01-16 18:10:14,158 - INFO - Start new task (Start from scratch)...
132
+ 2026-01-16 18:10:14,158 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
133
+ 2026-01-16 18:10:14,159 - INFO - Reference sequences loaded.
134
+ 2026-01-16 18:10:14,159 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
135
+ 2026-01-16 18:10:14,159 - INFO - Detected GFF3 format.
136
+ 2026-01-16 18:10:14,160 - INFO - GFF parsing complete.
137
+ 2026-01-16 18:10:14,160 - INFO - Extracting sequences for GFF regions.
138
+ 2026-01-16 18:10:14,160 - INFO - Region sequence extraction complete.
139
+ 2026-01-16 18:10:14,160 - INFO - Assembling CDS sequences for mRNAs.
140
+ 2026-01-16 18:10:14,160 - INFO - CDS sequence assembly complete.
141
+ 2026-01-16 18:10:14,162 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
142
+ 2026-01-16 18:10:14,163 - INFO - Loading and validating VCF samples...
143
+ 2026-01-16 18:10:14,163 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
144
+ 2026-01-16 18:10:14,163 - INFO - Checking VCF for phased genotype information...
145
+ 2026-01-16 18:10:14,164 - INFO - Phasing check passed. VCF appears to be phased.
146
+ 2026-01-16 18:10:14,164 - INFO - Starting pipeline. Total genes: 1
147
+ 2026-01-16 18:10:14,164 - INFO - Processing Gene 1/1: gene1...
148
+ 2026-01-16 18:10:14,164 - ERROR - Pipeline failed: name 'Mutation' is not defined
149
+ Traceback (most recent call last):
150
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 138, in main
151
+ gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
152
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 139, in process_variants_for_gene
153
+ mut_obj = self._create_mutation_from_record(record, alleles[allele_idx], labels)
154
+ File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 92, in _create_mutation_from_record
155
+ return Mutation(mutation_id, chrom, record.pos, ref, alt, mut_type, False, labels)
156
+ NameError: name 'Mutation' is not defined
157
+ 2026-01-16 18:14:22,484 - INFO - Logging configured. Log file: variant_analysis_output.log
158
+ 2026-01-16 18:14:22,494 - INFO - Start new task (Start from scratch)...
159
+ 2026-01-16 18:14:22,494 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
160
+ 2026-01-16 18:14:22,494 - INFO - Reference sequences loaded.
161
+ 2026-01-16 18:14:22,494 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
162
+ 2026-01-16 18:14:22,495 - INFO - Detected GFF3 format.
163
+ 2026-01-16 18:14:22,495 - INFO - GFF parsing complete.
164
+ 2026-01-16 18:14:22,495 - INFO - Extracting sequences for GFF regions.
165
+ 2026-01-16 18:14:22,495 - INFO - Region sequence extraction complete.
166
+ 2026-01-16 18:14:22,495 - INFO - Assembling CDS sequences for mRNAs.
167
+ 2026-01-16 18:14:22,495 - INFO - CDS sequence assembly complete.
168
+ 2026-01-16 18:14:22,497 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
169
+ 2026-01-16 18:14:22,498 - INFO - Loading and validating VCF samples...
170
+ 2026-01-16 18:14:22,498 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
171
+ 2026-01-16 18:14:22,498 - INFO - Checking VCF for phased genotype information...
172
+ 2026-01-16 18:14:22,498 - INFO - Phasing check passed. VCF appears to be phased.
173
+ 2026-01-16 18:14:22,498 - INFO - Starting pipeline. Total genes: 1
174
+ 2026-01-16 18:14:22,498 - INFO - Processing Gene 1/1: gene1...
175
+ 2026-01-16 18:14:22,500 - INFO - pos: 27, ref_crosses_boundary: False
176
+ 2026-01-16 18:14:22,500 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['T']
177
+ 2026-01-16 18:14:22,500 - INFO - alt_for_cds: TT
178
+ 2026-01-16 18:14:22,500 - INFO - pos: 27, len(cds_seq_list_after_alt): 82
179
+ 2026-01-16 18:14:22,500 - INFO - pos: 17, ref_crosses_boundary: False
180
+ 2026-01-16 18:14:22,500 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['C', 'T', 'T', 'A', 'G']
181
+ 2026-01-16 18:14:22,500 - INFO - alt_for_cds: C
182
+ 2026-01-16 18:14:22,500 - INFO - pos: 17, len(cds_seq_list_after_alt): 78
183
+ 2026-01-16 18:14:22,501 - INFO - pos: 27, ref_crosses_boundary: False
184
+ 2026-01-16 18:14:22,501 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['T']
185
+ 2026-01-16 18:14:22,501 - INFO - alt_for_cds: TT
186
+ 2026-01-16 18:14:22,501 - INFO - pos: 27, len(cds_seq_list_after_alt): 55
187
+ 2026-01-16 18:14:22,501 - INFO - pos: 17, ref_crosses_boundary: False
188
+ 2026-01-16 18:14:22,501 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['C', 'T', 'T', 'A', 'G']
189
+ 2026-01-16 18:14:22,501 - INFO - alt_for_cds: C
190
+ 2026-01-16 18:14:22,501 - INFO - pos: 17, len(cds_seq_list_after_alt): 51
191
+ 2026-01-16 18:14:22,501 - INFO - Finished processing gene: gene1
@@ -0,0 +1,3 @@
1
+ Gene_ID mRNA_ID Ref_ID Alt_IDs sample1 sample2 sample3 sample4
2
+ gene1 transcript1 transcript1:REF transcript1:1 0|1 1|0 1|1 0|0
3
+ gene1 transcript2 transcript2:REF transcript2:1 0|1 1|0 1|1 0|0
@@ -0,0 +1,5 @@
1
+ Gene_ID Haplotype_ID mRNA Haplotype_Count Frequency Variant_Type Protein_Changes Haplotype_Mutations Sample_Sources Ref_Protein Alt_Protein Ref_CDS Alt_CDS Aligned_Ref Comparison_String Aligned_Alt
2
+ gene1 transcript1:REF transcript1 . . NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:27)||||||||||||||||| . . . MSLASSANDMIDRSIDRSIDRSIDRS* MSLASSANDMIDRSIDRSIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA MSLASSANDMIDRSIDRSIDRSIDRS* ||||||||||||||||||||||||||| MSLASSANDMIDRSIDRSIDRSIDRS*
3
+ gene1 transcript1:1 transcript1 4 0.500000 NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion|||||||||||||| Del(5)S 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON] sample1(Hap2);sample2(Hap1);sample3(Homo) MSLASSANDMIDRSIDRSIDRSIDRS* MSLASANDMIDRSIDRSIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA ATGAGCCTAGCTTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA MSLASSANDMIDRSIDRSIDRSIDRS* |||| |||||||||||||||||||||| MSLA-SANDMIDRSIDRSIDRSIDRS*
4
+ gene1 transcript2:REF transcript2 . . NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:18)||||||||||||||||| . . . MSLASSANDIDRSIDRS* MSLASSANDIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA MSLASSANDIDRSIDRS* |||||||||||||||||| MSLASSANDIDRSIDRS*
5
+ gene1 transcript2:1 transcript2 4 0.500000 NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18)|||deletion|||||||||||||| Del(5)S 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON] sample1(Hap2);sample2(Hap1);sample3(Homo) MSLASSANDIDRSIDRS* MSLASANDIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA ATGAGCCTAGCTTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA MSLASSANDIDRSIDRS* |||| ||||||||||||| MSLA-SANDIDRSIDRS*
braid/genome.py ADDED
@@ -0,0 +1,207 @@
1
+ import logging
2
+ import re
3
+ import pysam
4
+ from collections import defaultdict, namedtuple
5
+ from Bio.Seq import Seq
6
+
7
+ from .utils import reverse_complement
8
+
9
# Lightweight immutable records shared by the annotation/variant pipeline.
# Note: `mRNA.regions` (a defaultdict of lists) and `mRNA.splice_junctions`
# (a list of dicts) are mutable containers that are updated in place.
Gene = namedtuple('Gene', ['id', 'chrom', 'start', 'end', 'strand', 'mRNAs'])
mRNA = namedtuple('mRNA', ['id', 'chrom', 'start', 'end', 'strand', 'regions', 'cds_sequence', 'splice_junctions'])
Region = namedtuple('Region', ['type', 'chrom', 'start', 'end', 'strand', 'sequence'])
Mutation = namedtuple('Mutation', ['id', 'chrom', 'pos', 'ref', 'alt', 'type', 'overlapping', 'labels'])


class GenomeProcessor:
    """Load a gene annotation (GFF3 or GTF) and a reference FASTA.

    Populates:
      * ``gene_data``            gene id -> ``Gene``
      * ``mRNA_data``            transcript id -> ``mRNA``
      * ``reference_sequences``  chromosome name -> full sequence string
      * ``chrom_lengths``        chromosome name -> length

    ``parse_gff`` reads ``chrom_lengths`` to clamp splice-site windows, so
    windows are only clamped to the chromosome end when
    ``load_reference_sequences`` has been called first.
    """

    # Attribute-column patterns for the two supported annotation dialects.
    _GTF_ATTR_RE = re.compile(r'(\w+)\s+"([^"]+)"')
    _GFF_ATTR_RE = re.compile(r'(\w+)=([^;]+)')
    _TRANSCRIPT_TYPES = ('transcript', 'mRNA')
    # (feature type in the file, key used in ``mRNA.regions``, sort by start?)
    # UTRs keep file order, exons and CDS are ordered by start coordinate.
    _CHILD_FEATURES = (
        ('exon', 'EXON', True),
        ('CDS', 'CDS', True),
        ('five_prime_UTR', 'five_prime_UTR', False),
        ('three_prime_UTR', 'three_prime_UTR', False),
    )

    def __init__(self, gff_file, fasta_file):
        """Remember the input paths; no file is opened here."""
        self.gff_file = gff_file
        self.fasta_file = fasta_file
        self.gene_data = {}
        self.mRNA_data = {}
        self.reference_sequences = {}
        self.chrom_lengths = {}

    # ------------------------------------------------------------------
    # Annotation parsing
    # ------------------------------------------------------------------
    @staticmethod
    def _detect_format(attr_column):
        """Return 'gtf' or 'gff' based on the first data line's attribute column."""
        if '"' in attr_column and ' ' in attr_column:
            logging.info("Detected GTF format.")
            return 'gtf'
        if '=' in attr_column:
            logging.info("Detected GFF3 format.")
            return 'gff'
        logging.warning("Could not definitively determine format. Assuming GFF3.")
        return 'gff'

    @staticmethod
    def _split_ids(value, multi):
        """Turn an id attribute into a list of ids.

        With ``multi`` true the value is split on commas, because GFF3 allows
        ``Parent=a,b`` for a feature shared by several parents.
        """
        if not value:
            return []
        if not multi:
            return [value]
        return [part for part in (p.strip() for p in value.split(',')) if part]

    @staticmethod
    def _add_introns(mrna_obj):
        """Derive INTRON regions from the gaps between consecutive exons."""
        exons = sorted(mrna_obj.regions['EXON'], key=lambda r: r.start)
        for upstream, downstream in zip(exons, exons[1:]):
            intron_start = upstream.end + 1
            intron_end = downstream.start - 1
            if intron_start <= intron_end:
                mrna_obj.regions['INTRON'].append(
                    Region('INTRON', mrna_obj.chrom, intron_start, intron_end, mrna_obj.strand, ''))

    def _add_splice_junctions(self, mrna_obj):
        """Append donor/acceptor junction records for each usable intron.

        Each record is a dict with a 2-bp ``site`` region and a surrounding
        ``window`` region (3 bp on the exon side, 8 bp into the intron,
        counted from the intron boundary coordinate).
        """
        chrom_len = self.chrom_lengths.get(mrna_obj.chrom)

        def clamp(lo, hi):
            # Only clamp the upper bound when the chromosome length is known;
            # clamping against a missing length (0) would collapse the window.
            if chrom_len:
                hi = min(chrom_len, hi)
            return max(1, lo), hi

        for intron in sorted(mrna_obj.regions['INTRON'], key=lambda r: r.start):
            # Skip introns where end - start < 4 (too short for two 2-bp sites).
            if intron.end - intron.start < 4:
                continue
            left = ((intron.start, intron.start + 1), clamp(intron.start - 3, intron.start + 8))
            right = ((intron.end - 1, intron.end), clamp(intron.end - 8, intron.end + 3))
            # On '+' the donor is the low-coordinate intron edge; any other
            # strand value is treated as reverse, swapping the roles.
            donor, acceptor = (left, right) if mrna_obj.strand == '+' else (right, left)
            for kind, (site, window) in (('donor', donor), ('acceptor', acceptor)):
                mrna_obj.splice_junctions.append({
                    'type': kind,
                    'site': Region(f'splice_{kind}_site', mrna_obj.chrom,
                                   site[0], site[1], mrna_obj.strand, ''),
                    'window': Region(f'splice_{kind}_window', mrna_obj.chrom,
                                     window[0], window[1], mrna_obj.strand, ''),
                })

    def parse_gff(self):
        """Parse ``self.gff_file`` into ``gene_data`` and ``mRNA_data``.

        The dialect (GTF vs GFF3) is detected from the first data line and
        used for the whole file. Comment lines and lines with fewer than nine
        tab-separated columns are skipped. Sub-features (exon, CDS, UTRs) are
        collected first and attached to transcripts after the whole file has
        been read, so they may appear before their transcript line.

        Raises:
            OSError: if the annotation file cannot be opened.
            ValueError: if a start/end column is not an integer.
        """
        logging.info(f"Parsing GFF file: {self.gff_file}")
        gene_mRNAs = defaultdict(list)
        # child_spans[feature_type][transcript_id] -> [(start, end), ...]
        child_spans = {ftype: defaultdict(list) for ftype, _, _ in self._CHILD_FEATURES}
        file_format = None

        with open(self.gff_file, 'r') as handle:
            for line in handle:
                if line.startswith('#'):
                    continue
                parts = line.strip().split('\t')
                if len(parts) < 9:
                    continue
                chrom, feature_type, strand = parts[0], parts[2], parts[6]
                start, end = int(parts[3]), int(parts[4])

                if file_format is None:
                    file_format = self._detect_format(parts[8])
                is_gtf = file_format == 'gtf'
                # Parse every line with the pattern of the detected dialect so
                # the keys looked up below are the ones actually extracted.
                pattern = self._GTF_ATTR_RE if is_gtf else self._GFF_ATTR_RE
                attributes = dict(pattern.findall(parts[8]))

                if feature_type == 'gene':
                    gene_id = attributes.get('gene_id' if is_gtf else 'ID')
                    if gene_id:
                        self.gene_data[gene_id] = Gene(gene_id, chrom, start, end, strand, [])
                elif feature_type in self._TRANSCRIPT_TYPES:
                    mrna_id = attributes.get('transcript_id' if is_gtf else 'ID')
                    parent_genes = self._split_ids(
                        attributes.get('gene_id' if is_gtf else 'Parent'), multi=not is_gtf)
                    if mrna_id and parent_genes:
                        self.mRNA_data[mrna_id] = mRNA(
                            mrna_id, chrom, start, end, strand, defaultdict(list), '', [])
                        for parent_gene_id in parent_genes:
                            gene_mRNAs[parent_gene_id].append(mrna_id)
                elif feature_type in child_spans:
                    parents = self._split_ids(
                        attributes.get('transcript_id' if is_gtf else 'Parent'), multi=not is_gtf)
                    for parent_mrna_id in parents:
                        child_spans[feature_type][parent_mrna_id].append((start, end))

        for mrna_id, mrna_obj in self.mRNA_data.items():
            for feature_type, label, by_start in self._CHILD_FEATURES:
                spans = child_spans[feature_type].get(mrna_id, [])
                if by_start:
                    spans = sorted(spans, key=lambda span: span[0])
                for span_start, span_end in spans:
                    # Coordinates of the owning transcript, not of whichever
                    # line happened to be read last from the file.
                    mrna_obj.regions[label].append(
                        Region(label, mrna_obj.chrom, span_start, span_end, mrna_obj.strand, ''))
            self._add_introns(mrna_obj)
            self._add_splice_junctions(mrna_obj)

        for gene_id, mrna_ids in gene_mRNAs.items():
            if gene_id in self.gene_data:
                self.gene_data[gene_id] = self.gene_data[gene_id]._replace(
                    mRNAs=[self.mRNA_data[mid] for mid in mrna_ids if mid in self.mRNA_data])
        logging.info("GFF parsing complete.")

    # ------------------------------------------------------------------
    # Reference sequences
    # ------------------------------------------------------------------
    def load_reference_sequences(self):
        """Read every sequence of ``self.fasta_file`` into memory via pysam.

        Fills ``reference_sequences`` and ``chrom_lengths``. Any error is
        logged and re-raised unchanged.
        """
        logging.info(f"Loading reference sequences from FASTA file: {self.fasta_file}")
        try:
            with pysam.FastaFile(self.fasta_file) as fasta:
                for chrom_name in fasta.references:
                    self.reference_sequences[chrom_name] = fasta.fetch(chrom_name)
                    self.chrom_lengths[chrom_name] = fasta.get_reference_length(chrom_name)
            logging.info("Reference sequences loaded.")
        except Exception as e:
            logging.error(f"Error loading FASTA file: {e}")
            raise

    def _with_sequence(self, region):
        """Return ``region`` with its reference slice filled in.

        Coordinates are treated as 1-based inclusive. The region is returned
        unchanged when its chromosome has no loaded reference sequence.
        """
        chrom_seq = self.reference_sequences.get(region.chrom)
        if chrom_seq is None:
            return region
        return region._replace(sequence=chrom_seq[region.start - 1:region.end])

    def extract_region_sequences(self):
        """Fill the ``sequence`` field of every region and splice junction.

        The region lists and junction dicts are updated in place, so objects
        already referenced from ``gene_data`` see the new sequences too.
        """
        logging.info("Extracting sequences for GFF regions.")
        for mrna_obj in self.mRNA_data.values():
            for regions in mrna_obj.regions.values():
                for i, region in enumerate(regions):
                    regions[i] = self._with_sequence(region)
            for junction in mrna_obj.splice_junctions:
                for key in ('site', 'window'):
                    junction[key] = self._with_sequence(junction[key])
        logging.info("Region sequence extraction complete.")

    def assemble_cds_sequences(self):
        """Concatenate each transcript's CDS region sequences into ``cds_sequence``.

        Pieces are joined in ascending genomic start order and are not
        reverse-complemented here. Because namedtuples are immutable, the
        updated ``mRNA`` records replace the old ones in ``mRNA_data`` and
        the genes' ``mRNAs`` lists are rebuilt to point at them.
        """
        logging.info("Assembling CDS sequences for mRNAs.")
        for mrna_id, mrna_obj in self.mRNA_data.items():
            ordered_cds = sorted(mrna_obj.regions['CDS'], key=lambda r: r.start)
            full_cds_seq = "".join(cds.sequence for cds in ordered_cds)
            self.mRNA_data[mrna_id] = mrna_obj._replace(cds_sequence=full_cds_seq)
        for gene_id, gene_obj in self.gene_data.items():
            refreshed = [self.mRNA_data.get(old_mrna.id, old_mrna) for old_mrna in gene_obj.mRNAs]
            self.gene_data[gene_id] = gene_obj._replace(mRNAs=refreshed)
        logging.info("CDS sequence assembly complete.")
207
+