PyBRAID 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- braid/__init__.py +0 -0
- braid/cli.py +156 -0
- braid/data/test.fasta +3 -0
- braid/data/test.fasta.fai +1 -0
- braid/data/test.gff3 +15 -0
- braid/data/test.vcf +8 -0
- braid/data/test.vcf.gz +0 -0
- braid/data/test.vcf.gz.csi +0 -0
- braid/data/variant_analysis_output.alignment.txt +20 -0
- braid/data/variant_analysis_output.log +191 -0
- braid/data/variant_analysis_output.sample.txt +3 -0
- braid/data/variant_analysis_output.tsv +5 -0
- braid/genome.py +207 -0
- braid/modifier.py +500 -0
- braid/output.py +285 -0
- braid/protein.py +141 -0
- braid/utils.py +86 -0
- braid/vcf.py +180 -0
- pybraid-1.0.0.dist-info/METADATA +227 -0
- pybraid-1.0.0.dist-info/RECORD +24 -0
- pybraid-1.0.0.dist-info/WHEEL +5 -0
- pybraid-1.0.0.dist-info/entry_points.txt +2 -0
- pybraid-1.0.0.dist-info/licenses/LICENSE +21 -0
- pybraid-1.0.0.dist-info/top_level.txt +1 -0
braid/__init__.py
ADDED
|
File without changes
|
braid/cli.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import argparse
|
|
3
|
+
import os
|
|
4
|
+
import pysam
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from .utils import setup_logging, CheckpointManager
|
|
8
|
+
from .modifier import SequenceModifier
|
|
9
|
+
from .protein import ProteinAnalyzer
|
|
10
|
+
from .genome import GenomeProcessor
|
|
11
|
+
from .vcf import VCFProcessor
|
|
12
|
+
from .output import ResultsOutputter
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from importlib import resources
|
|
16
|
+
except ImportError:
|
|
17
|
+
import importlib_resources as resources
|
|
18
|
+
|
|
19
|
+
def get_test_data_path(filename):
    """Return the path of a file in the packaged ``braid/data`` directory, as a string.

    The ``files()`` API of ``resources`` is tried first. If the ``resources``
    module in use does not provide it (the lookup raises ``AttributeError``),
    the older ``resources.path`` context manager is used to locate the
    ``data`` directory and the file name is appended to it.
    """
    try:
        return str(resources.files('braid') / 'data' / filename)
    except AttributeError:
        # Older resources API: resolve the data directory, then append the name.
        with resources.path('braid', 'data') as package_data_dir:
            return str(package_data_dir.joinpath(filename))
|
|
26
|
+
|
|
27
|
+
def run_test():
    """Run the pipeline against the test data bundled with the package.

    Builds a ``-v/-r/-g`` argument list pointing at the packaged
    ``test.vcf.gz``, ``test.fasta`` and ``test.gff3`` files and hands it to
    ``main``.

    If ``main`` raises an ordinary exception, the error and its traceback are
    printed and the process exits with status 1, so that a failed self-test is
    not reported as success to the calling shell. ``SystemExit`` raised inside
    ``main`` is not intercepted here and propagates unchanged.
    """
    test_args = [
        "-v", get_test_data_path("test.vcf.gz"),
        "-r", get_test_data_path("test.fasta"),
        "-g", get_test_data_path("test.gff3"),
    ]
    try:
        main(test_args)
    except Exception as e:
        print(f"\n Failed: {e}")
        import traceback
        traceback.print_exc()
        # Previously the failure was swallowed here and the command returned
        # normally (exit status 0); signal the failure explicitly instead.
        sys.exit(1)
|
|
42
|
+
|
|
43
|
+
def _build_parser():
    """Create the argument parser: analysis options plus the ``test`` sub-command."""
    parser = argparse.ArgumentParser(
        description="Analyzes phased VCF data to predict variant effects on protein sequences for each haplotype.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('-g', '--gff', help='Input GFF3 file.')
    parser.add_argument('-r', '--reference', help='Reference genome FASTA file (indexed: .fai).')
    parser.add_argument('-v', '--vcf', help='Phased VCF file (indexed: .tbi/.csi).')
    parser.add_argument('-o', '--output', default='variant_analysis_output.tsv', help='Output TSV file name.')
    parser.add_argument('--force-unphased', action='store_true', help='Skip the phasing check and force the script to run on a potentially unphased VCF file.')
    parser.add_argument('--ignore-intron', action='store_true', help='Ignore mutations marked only as intron.')
    parser.add_argument('-s', '--sample', help='Path to file with specific sample IDs to analyze (one per line, no header).')
    parser.add_argument('--gene', help='Path to file with specific gene IDs to analyze (one per line, no header).')
    parser.add_argument('--lof-threshold', type=float, default=0.30, help="LOF classification threshold.")
    parser.add_argument('--resume', action='store_true', help='Resume from the last successfully processed gene found in the log.')

    subparsers = parser.add_subparsers(dest="command", help="Available commands")
    subparsers.add_parser("test", help="Run tests using built-in data.")
    return parser


def _select_target_genes(gene_list_path, all_genes):
    """Filter ``all_genes`` down to the IDs listed in ``gene_list_path``.

    ``all_genes`` is a list of ``(gene_id, gene_obj)`` pairs; the file holds
    one gene ID per line (blank lines are ignored). Exits with status 1 when
    the file does not exist and with status 0 when none of the listed IDs is
    present in ``all_genes``.
    """
    if not os.path.exists(gene_list_path):
        logging.error(f"Gene list file not found: {gene_list_path}")
        sys.exit(1)
    logging.info(f"Loading target genes from {gene_list_path}...")
    with open(gene_list_path, 'r') as handle:
        target_gene_ids = {line.strip() for line in handle if line.strip()}
    selected = [(gid, obj) for gid, obj in all_genes if gid in target_gene_ids]
    logging.info(f"Target genes loaded: {len(target_gene_ids)}. Found in GFF: {len(selected)}.")
    if not selected:
        logging.warning("No target genes found in the GFF file! Please check IDs.")
        sys.exit(0)
    return selected


def main(args_list=None):
    """Command-line entry point.

    Args:
        args_list: Optional list of argument strings; when ``None`` argparse
            reads the process command line.

    Behaviour:
        * ``test`` sub-command: delegates to ``run_test`` and returns.
        * Otherwise ``-g``, ``-r`` and ``-v`` are all required; if any is
          missing the help text is printed and the process exits with
          status 1.
        * The log, alignment and sample-matrix file names are derived from the
          ``--output`` name with its extension removed.
        * With ``--resume`` the output files are opened in append mode and
          genes already recorded by the checkpoint manager are skipped;
          otherwise the output files are opened in write mode.
        * Any ``Exception`` raised while running the pipeline is logged with
          its traceback and the process exits with status 1.
    """
    parser = _build_parser()
    args = parser.parse_args(args_list)

    if args.command == "test":
        run_test()
        return
    if not (args.gff and args.reference and args.vcf):
        parser.print_help()
        logging.error("Error: Arguments -g, -r, and -v are required for analysis.")
        sys.exit(1)

    # All side files share the output file's base name.
    output_basename = os.path.splitext(args.output)[0]
    args.log = f"{output_basename}.log"
    setup_logging(args.log)

    args.align = f"{output_basename}.alignment.txt"
    args.sample_matrix = f"{output_basename}.sample.txt"

    ckpt_manager = CheckpointManager(args.log)
    open_mode = 'w'

    if args.resume:
        logging.info("Received (Resume Mode)...")
        ckpt_manager.load_completed_genes()
        ckpt_manager.truncate_if_incomplete(args.output, file_type='tsv')
        ckpt_manager.truncate_if_incomplete(args.sample_matrix, file_type='tsv')
        ckpt_manager.truncate_if_incomplete(args.align, file_type='align')

        # Keep what earlier runs already wrote.
        open_mode = 'a'
    else:
        logging.info("Start new task (Start from scratch)...")

    try:
        genome = GenomeProcessor(args.gff, args.reference)
        genome.load_reference_sequences()
        genome.parse_gff()
        genome.extract_region_sequences()
        genome.assemble_cds_sequences()

        genes_to_process = list(genome.gene_data.items())
        if args.gene:
            genes_to_process = _select_target_genes(args.gene, genes_to_process)

        vcf_processor = VCFProcessor(genome, args.vcf, args.force_unphased, args.sample, args.ignore_intron)
        sequence_modifier = SequenceModifier(genome)
        protein_analyzer = ProteinAnalyzer(genome, args.lof_threshold)

        results_outputter = ResultsOutputter(genome, vcf_processor, sequence_modifier, protein_analyzer)

        # When resuming into a non-empty output file, mark the header as
        # already written.
        if args.resume and os.path.exists(args.output) and os.path.getsize(args.output) > 0:
            results_outputter.header_written = True

        with pysam.VariantFile(args.vcf) as vcf_in, \
                open(args.output, open_mode, buffering=1) as out_f, \
                open(args.align, open_mode, buffering=1) as align_f, \
                open(args.sample_matrix, open_mode, buffering=1) as sample_f:

            vcf_processor.load_and_validate_samples(vcf_in)
            results_outputter.write_header(out_f, sample_f)
            total_genes = len(genes_to_process)
            logging.info(f"Starting pipeline. Total genes: {total_genes}")

            for i, (gene_id, gene_obj) in enumerate(genes_to_process, 1):
                if gene_id in ckpt_manager.completed_genes:
                    continue
                logging.info(f"Processing Gene {i}/{total_genes}: {gene_id}...")

                gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
                if gene_variant_combinations:
                    results_outputter.analyze_and_write_gene_results(gene_obj, gene_variant_combinations, out_f, align_f, sample_f)

                # Flush every output before recording the gene as completed.
                out_f.flush()
                align_f.flush()
                sample_f.flush()

                ckpt_manager.log_completion(gene_id)

        logging.info("Pipeline finished successfully.")

    except Exception as e:
        logging.error(f"Pipeline failed: {e}", exc_info=True)
        sys.exit(1)
|
|
153
|
+
|
|
154
|
+
if __name__ == "__main__":
|
|
155
|
+
main()
|
|
156
|
+
|
braid/data/test.fasta
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
1 118 3 60 61
|
braid/data/test.gff3
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
1 ensembl gene 2 117 . + . ID=gene1;biotype=protein_coding;gene_id=gene1
|
|
2
|
+
1 ensembl mRNA 2 117 . + . ID=transcript1;Parent=gene1;biotype=protein_coding;transcript_id=transcript1
|
|
3
|
+
1 ensembl five_prime_UTR 2 11 . + . Parent=transcript1
|
|
4
|
+
1 ensembl exon 2 38 . + . Parent=transcript1;Name=exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon1;rank=1;version=1
|
|
5
|
+
1 ensembl CDS 12 38 . + 0 ID=CDS:protein1;Parent=transcript1;protein_id=protein1
|
|
6
|
+
1 ensembl exon 51 77 . + . Parent=transcript1;Name=exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon2;rank=2;version=1
|
|
7
|
+
1 ensembl CDS 51 77 . + 0 ID=CDS:protein1;Parent=transcript1;protein_id=protein1
|
|
8
|
+
1 ensembl exon 91 117 . + . Parent=transcript1;Name=exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon3;rank=3;version=1
|
|
9
|
+
1 ensembl CDS 91 117 . + 0 ID=CDS:protein1;Parent=transcript1;protein_id=protein1
|
|
10
|
+
1 ensembl mRNA 2 77 . + . ID=transcript2;Parent=gene1;biotype=protein_coding;transcript_id=transcript2
|
|
11
|
+
1 ensembl five_prime_UTR 2 11 . + . Parent=transcript1
|
|
12
|
+
1 ensembl exon 2 38 . + . Parent=transcript2;Name=exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon1;rank=1;version=1
|
|
13
|
+
1 ensembl CDS 12 38 . + 0 ID=CDS:protein2;Parent=transcript2;protein_id=protein2
|
|
14
|
+
1 ensembl exon 91 117 . + . Parent=transcript2;Name=exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon3;rank=2;version=1
|
|
15
|
+
1 ensembl CDS 91 117 . + 0 ID=CDS:protein2;Parent=transcript2;protein_id=protein2
|
braid/data/test.vcf
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
##fileformat=VCFv4.2
|
|
2
|
+
##contig=<ID=1>
|
|
3
|
+
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count">
|
|
4
|
+
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles">
|
|
5
|
+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
|
6
|
+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3 sample4
|
|
7
|
+
1 17 . CTTAG C . PASS . GT 0|1 1|0 1|1 0|0
|
|
8
|
+
1 27 . T TT . PASS . GT 0|1 1|0 1|1 0|0
|
braid/data/test.vcf.gz
ADDED
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Haplotype_ID: transcript1:1
|
|
2
|
+
Gene: gene1 | mRNA: transcript1
|
|
3
|
+
Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
|
|
4
|
+
Variant_Type: NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion||||||||||||||
|
|
5
|
+
Protein_Changes: Del(5)S
|
|
6
|
+
Alignment:
|
|
7
|
+
Ref: MSLASSANDMIDRSIDRSIDRSIDRS*
|
|
8
|
+
|||| ||||||||||||||||||||||
|
|
9
|
+
Alt: MSLA-SANDMIDRSIDRSIDRSIDRS*
|
|
10
|
+
|
|
11
|
+
Haplotype_ID: transcript2:1
|
|
12
|
+
Gene: gene1 | mRNA: transcript2
|
|
13
|
+
Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
|
|
14
|
+
Variant_Type: NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18)|||deletion||||||||||||||
|
|
15
|
+
Protein_Changes: Del(5)S
|
|
16
|
+
Alignment:
|
|
17
|
+
Ref: MSLASSANDIDRSIDRS*
|
|
18
|
+
|||| |||||||||||||
|
|
19
|
+
Alt: MSLA-SANDIDRSIDRS*
|
|
20
|
+
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
2026-01-16 15:41:45,245 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
2
|
+
2026-01-16 15:41:45,246 - INFO - Start new task (Start from scratch)...
|
|
3
|
+
2026-01-16 15:41:45,246 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
4
|
+
2026-01-16 15:41:45,246 - INFO - Reference sequences loaded.
|
|
5
|
+
2026-01-16 15:41:45,246 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff
|
|
6
|
+
2026-01-16 15:41:45,246 - ERROR - Pipeline failed: [Errno 2] No such file or directory: '/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff'
|
|
7
|
+
Traceback (most recent call last):
|
|
8
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 95, in main
|
|
9
|
+
genome.parse_gff()
|
|
10
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/genome.py", line 31, in parse_gff
|
|
11
|
+
with open(self.gff_file, 'r') as f:
|
|
12
|
+
FileNotFoundError: [Errno 2] No such file or directory: '/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff'
|
|
13
|
+
2026-01-16 15:43:18,538 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
14
|
+
2026-01-16 15:43:18,539 - INFO - Start new task (Start from scratch)...
|
|
15
|
+
2026-01-16 15:43:18,539 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
16
|
+
2026-01-16 15:43:18,539 - INFO - Reference sequences loaded.
|
|
17
|
+
2026-01-16 15:43:18,539 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
18
|
+
2026-01-16 15:43:18,539 - INFO - Detected GFF3 format.
|
|
19
|
+
2026-01-16 15:43:18,540 - INFO - GFF parsing complete.
|
|
20
|
+
2026-01-16 15:43:18,540 - INFO - Extracting sequences for GFF regions.
|
|
21
|
+
2026-01-16 15:43:18,540 - INFO - Region sequence extraction complete.
|
|
22
|
+
2026-01-16 15:43:18,540 - INFO - Assembling CDS sequences for mRNAs.
|
|
23
|
+
2026-01-16 15:43:18,540 - INFO - CDS sequence assembly complete.
|
|
24
|
+
2026-01-16 15:43:18,540 - ERROR - Pipeline failed: name 'PairwiseAligner' is not defined
|
|
25
|
+
Traceback (most recent call last):
|
|
26
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 115, in main
|
|
27
|
+
sequence_modifier = SequenceModifier(genome)
|
|
28
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/modifier.py", line 11, in __init__
|
|
29
|
+
self.aligner = PairwiseAligner()
|
|
30
|
+
NameError: name 'PairwiseAligner' is not defined
|
|
31
|
+
2026-01-16 15:45:26,079 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
32
|
+
2026-01-16 15:45:26,080 - INFO - Start new task (Start from scratch)...
|
|
33
|
+
2026-01-16 15:45:26,080 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
34
|
+
2026-01-16 15:45:26,080 - INFO - Reference sequences loaded.
|
|
35
|
+
2026-01-16 15:45:26,080 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
36
|
+
2026-01-16 15:45:26,080 - INFO - Detected GFF3 format.
|
|
37
|
+
2026-01-16 15:45:26,080 - INFO - GFF parsing complete.
|
|
38
|
+
2026-01-16 15:45:26,081 - INFO - Extracting sequences for GFF regions.
|
|
39
|
+
2026-01-16 15:45:26,081 - INFO - Region sequence extraction complete.
|
|
40
|
+
2026-01-16 15:45:26,081 - INFO - Assembling CDS sequences for mRNAs.
|
|
41
|
+
2026-01-16 15:45:26,081 - INFO - CDS sequence assembly complete.
|
|
42
|
+
2026-01-16 15:45:26,082 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
43
|
+
2026-01-16 15:45:26,082 - ERROR - Pipeline failed: name 'pysam' is not defined
|
|
44
|
+
Traceback (most recent call last):
|
|
45
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 123, in main
|
|
46
|
+
with pysam.VariantFile(args.vcf) as vcf_in, \
|
|
47
|
+
NameError: name 'pysam' is not defined
|
|
48
|
+
2026-01-16 15:46:22,451 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
49
|
+
2026-01-16 15:46:22,451 - INFO - Start new task (Start from scratch)...
|
|
50
|
+
2026-01-16 15:46:22,451 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
51
|
+
2026-01-16 15:46:22,455 - INFO - Reference sequences loaded.
|
|
52
|
+
2026-01-16 15:46:22,455 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
53
|
+
2026-01-16 15:46:22,455 - INFO - Detected GFF3 format.
|
|
54
|
+
2026-01-16 15:46:22,455 - INFO - GFF parsing complete.
|
|
55
|
+
2026-01-16 15:46:22,455 - INFO - Extracting sequences for GFF regions.
|
|
56
|
+
2026-01-16 15:46:22,456 - INFO - Region sequence extraction complete.
|
|
57
|
+
2026-01-16 15:46:22,456 - INFO - Assembling CDS sequences for mRNAs.
|
|
58
|
+
2026-01-16 15:46:22,456 - INFO - CDS sequence assembly complete.
|
|
59
|
+
2026-01-16 15:46:22,457 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
60
|
+
2026-01-16 15:46:22,458 - INFO - Loading and validating VCF samples...
|
|
61
|
+
2026-01-16 15:46:22,459 - ERROR - Pipeline failed: name 'os' is not defined
|
|
62
|
+
Traceback (most recent call last):
|
|
63
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 129, in main
|
|
64
|
+
vcf_processor.load_and_validate_samples(vcf_in)
|
|
65
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 28, in load_and_validate_samples
|
|
66
|
+
if not os.path.exists(vcf_path + ".tbi") and not os.path.exists(vcf_path + ".csi"):
|
|
67
|
+
NameError: name 'os' is not defined
|
|
68
|
+
2026-01-16 15:47:15,122 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
69
|
+
2026-01-16 15:47:15,122 - INFO - Start new task (Start from scratch)...
|
|
70
|
+
2026-01-16 15:47:15,123 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
71
|
+
2026-01-16 15:47:15,123 - INFO - Reference sequences loaded.
|
|
72
|
+
2026-01-16 15:47:15,123 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
73
|
+
2026-01-16 15:47:15,123 - INFO - Detected GFF3 format.
|
|
74
|
+
2026-01-16 15:47:15,123 - INFO - GFF parsing complete.
|
|
75
|
+
2026-01-16 15:47:15,123 - INFO - Extracting sequences for GFF regions.
|
|
76
|
+
2026-01-16 15:47:15,124 - INFO - Region sequence extraction complete.
|
|
77
|
+
2026-01-16 15:47:15,124 - INFO - Assembling CDS sequences for mRNAs.
|
|
78
|
+
2026-01-16 15:47:15,124 - INFO - CDS sequence assembly complete.
|
|
79
|
+
2026-01-16 15:47:15,125 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
80
|
+
2026-01-16 15:47:15,126 - INFO - Loading and validating VCF samples...
|
|
81
|
+
2026-01-16 15:47:15,126 - ERROR - Index tbi/csi is missing, run 'bcftools index /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.vcf' to produce index
|
|
82
|
+
2026-01-16 15:47:15,126 - ERROR - Pipeline failed: name 'sys' is not defined
|
|
83
|
+
Traceback (most recent call last):
|
|
84
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 129, in main
|
|
85
|
+
vcf_processor.load_and_validate_samples(vcf_in)
|
|
86
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 33, in load_and_validate_samples
|
|
87
|
+
sys.exit(1)
|
|
88
|
+
NameError: name 'sys' is not defined
|
|
89
|
+
2026-01-16 15:48:10,135 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
90
|
+
2026-01-16 15:48:10,135 - INFO - Start new task (Start from scratch)...
|
|
91
|
+
2026-01-16 15:48:10,135 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
92
|
+
2026-01-16 15:48:10,135 - INFO - Reference sequences loaded.
|
|
93
|
+
2026-01-16 15:48:10,135 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
94
|
+
2026-01-16 15:48:10,135 - INFO - Detected GFF3 format.
|
|
95
|
+
2026-01-16 15:48:10,136 - INFO - GFF parsing complete.
|
|
96
|
+
2026-01-16 15:48:10,136 - INFO - Extracting sequences for GFF regions.
|
|
97
|
+
2026-01-16 15:48:10,136 - INFO - Region sequence extraction complete.
|
|
98
|
+
2026-01-16 15:48:10,136 - INFO - Assembling CDS sequences for mRNAs.
|
|
99
|
+
2026-01-16 15:48:10,136 - INFO - CDS sequence assembly complete.
|
|
100
|
+
2026-01-16 15:48:10,137 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
101
|
+
2026-01-16 15:48:10,138 - INFO - Loading and validating VCF samples...
|
|
102
|
+
2026-01-16 15:48:10,138 - ERROR - Index tbi/csi is missing, run 'bcftools index /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.vcf' to produce index
|
|
103
|
+
2026-01-16 15:49:38,190 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
104
|
+
2026-01-16 15:49:38,190 - INFO - Start new task (Start from scratch)...
|
|
105
|
+
2026-01-16 15:49:38,190 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
106
|
+
2026-01-16 15:49:38,191 - INFO - Reference sequences loaded.
|
|
107
|
+
2026-01-16 15:49:38,191 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
108
|
+
2026-01-16 15:49:38,191 - INFO - Detected GFF3 format.
|
|
109
|
+
2026-01-16 15:49:38,191 - INFO - GFF parsing complete.
|
|
110
|
+
2026-01-16 15:49:38,191 - INFO - Extracting sequences for GFF regions.
|
|
111
|
+
2026-01-16 15:49:38,191 - INFO - Region sequence extraction complete.
|
|
112
|
+
2026-01-16 15:49:38,191 - INFO - Assembling CDS sequences for mRNAs.
|
|
113
|
+
2026-01-16 15:49:38,191 - INFO - CDS sequence assembly complete.
|
|
114
|
+
2026-01-16 15:49:38,193 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
115
|
+
2026-01-16 15:49:38,194 - INFO - Loading and validating VCF samples...
|
|
116
|
+
2026-01-16 15:49:38,194 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
|
|
117
|
+
2026-01-16 15:49:38,194 - INFO - Checking VCF for phased genotype information...
|
|
118
|
+
2026-01-16 15:49:38,194 - INFO - Phasing check passed. VCF appears to be phased.
|
|
119
|
+
2026-01-16 15:49:38,194 - INFO - Starting pipeline. Total genes: 1
|
|
120
|
+
2026-01-16 15:49:38,194 - INFO - Processing Gene 1/1: gene1...
|
|
121
|
+
2026-01-16 15:49:38,195 - ERROR - Pipeline failed: name 'Mutation' is not defined
|
|
122
|
+
Traceback (most recent call last):
|
|
123
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 138, in main
|
|
124
|
+
gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
|
|
125
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 139, in process_variants_for_gene
|
|
126
|
+
mut_obj = self._create_mutation_from_record(record, alleles[allele_idx], labels)
|
|
127
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 92, in _create_mutation_from_record
|
|
128
|
+
return Mutation(mutation_id, chrom, record.pos, ref, alt, mut_type, False, labels)
|
|
129
|
+
NameError: name 'Mutation' is not defined
|
|
130
|
+
2026-01-16 18:10:14,143 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
131
|
+
2026-01-16 18:10:14,158 - INFO - Start new task (Start from scratch)...
|
|
132
|
+
2026-01-16 18:10:14,158 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
133
|
+
2026-01-16 18:10:14,159 - INFO - Reference sequences loaded.
|
|
134
|
+
2026-01-16 18:10:14,159 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
135
|
+
2026-01-16 18:10:14,159 - INFO - Detected GFF3 format.
|
|
136
|
+
2026-01-16 18:10:14,160 - INFO - GFF parsing complete.
|
|
137
|
+
2026-01-16 18:10:14,160 - INFO - Extracting sequences for GFF regions.
|
|
138
|
+
2026-01-16 18:10:14,160 - INFO - Region sequence extraction complete.
|
|
139
|
+
2026-01-16 18:10:14,160 - INFO - Assembling CDS sequences for mRNAs.
|
|
140
|
+
2026-01-16 18:10:14,160 - INFO - CDS sequence assembly complete.
|
|
141
|
+
2026-01-16 18:10:14,162 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
142
|
+
2026-01-16 18:10:14,163 - INFO - Loading and validating VCF samples...
|
|
143
|
+
2026-01-16 18:10:14,163 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
|
|
144
|
+
2026-01-16 18:10:14,163 - INFO - Checking VCF for phased genotype information...
|
|
145
|
+
2026-01-16 18:10:14,164 - INFO - Phasing check passed. VCF appears to be phased.
|
|
146
|
+
2026-01-16 18:10:14,164 - INFO - Starting pipeline. Total genes: 1
|
|
147
|
+
2026-01-16 18:10:14,164 - INFO - Processing Gene 1/1: gene1...
|
|
148
|
+
2026-01-16 18:10:14,164 - ERROR - Pipeline failed: name 'Mutation' is not defined
|
|
149
|
+
Traceback (most recent call last):
|
|
150
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 138, in main
|
|
151
|
+
gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
|
|
152
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 139, in process_variants_for_gene
|
|
153
|
+
mut_obj = self._create_mutation_from_record(record, alleles[allele_idx], labels)
|
|
154
|
+
File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 92, in _create_mutation_from_record
|
|
155
|
+
return Mutation(mutation_id, chrom, record.pos, ref, alt, mut_type, False, labels)
|
|
156
|
+
NameError: name 'Mutation' is not defined
|
|
157
|
+
2026-01-16 18:14:22,484 - INFO - Logging configured. Log file: variant_analysis_output.log
|
|
158
|
+
2026-01-16 18:14:22,494 - INFO - Start new task (Start from scratch)...
|
|
159
|
+
2026-01-16 18:14:22,494 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
|
|
160
|
+
2026-01-16 18:14:22,494 - INFO - Reference sequences loaded.
|
|
161
|
+
2026-01-16 18:14:22,494 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
|
|
162
|
+
2026-01-16 18:14:22,495 - INFO - Detected GFF3 format.
|
|
163
|
+
2026-01-16 18:14:22,495 - INFO - GFF parsing complete.
|
|
164
|
+
2026-01-16 18:14:22,495 - INFO - Extracting sequences for GFF regions.
|
|
165
|
+
2026-01-16 18:14:22,495 - INFO - Region sequence extraction complete.
|
|
166
|
+
2026-01-16 18:14:22,495 - INFO - Assembling CDS sequences for mRNAs.
|
|
167
|
+
2026-01-16 18:14:22,495 - INFO - CDS sequence assembly complete.
|
|
168
|
+
2026-01-16 18:14:22,497 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
|
|
169
|
+
2026-01-16 18:14:22,498 - INFO - Loading and validating VCF samples...
|
|
170
|
+
2026-01-16 18:14:22,498 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
|
|
171
|
+
2026-01-16 18:14:22,498 - INFO - Checking VCF for phased genotype information...
|
|
172
|
+
2026-01-16 18:14:22,498 - INFO - Phasing check passed. VCF appears to be phased.
|
|
173
|
+
2026-01-16 18:14:22,498 - INFO - Starting pipeline. Total genes: 1
|
|
174
|
+
2026-01-16 18:14:22,498 - INFO - Processing Gene 1/1: gene1...
|
|
175
|
+
2026-01-16 18:14:22,500 - INFO - pos: 27, ref_crosses_boundary: False
|
|
176
|
+
2026-01-16 18:14:22,500 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['T']
|
|
177
|
+
2026-01-16 18:14:22,500 - INFO - alt_for_cds: TT
|
|
178
|
+
2026-01-16 18:14:22,500 - INFO - pos: 27, len(cds_seq_list_after_alt): 82
|
|
179
|
+
2026-01-16 18:14:22,500 - INFO - pos: 17, ref_crosses_boundary: False
|
|
180
|
+
2026-01-16 18:14:22,500 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['C', 'T', 'T', 'A', 'G']
|
|
181
|
+
2026-01-16 18:14:22,500 - INFO - alt_for_cds: C
|
|
182
|
+
2026-01-16 18:14:22,500 - INFO - pos: 17, len(cds_seq_list_after_alt): 78
|
|
183
|
+
2026-01-16 18:14:22,501 - INFO - pos: 27, ref_crosses_boundary: False
|
|
184
|
+
2026-01-16 18:14:22,501 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['T']
|
|
185
|
+
2026-01-16 18:14:22,501 - INFO - alt_for_cds: TT
|
|
186
|
+
2026-01-16 18:14:22,501 - INFO - pos: 27, len(cds_seq_list_after_alt): 55
|
|
187
|
+
2026-01-16 18:14:22,501 - INFO - pos: 17, ref_crosses_boundary: False
|
|
188
|
+
2026-01-16 18:14:22,501 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['C', 'T', 'T', 'A', 'G']
|
|
189
|
+
2026-01-16 18:14:22,501 - INFO - alt_for_cds: C
|
|
190
|
+
2026-01-16 18:14:22,501 - INFO - pos: 17, len(cds_seq_list_after_alt): 51
|
|
191
|
+
2026-01-16 18:14:22,501 - INFO - Finished processing gene: gene1
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
Gene_ID Haplotype_ID mRNA Haplotype_Count Frequency Variant_Type Protein_Changes Haplotype_Mutations Sample_Sources Ref_Protein Alt_Protein Ref_CDS Alt_CDS Aligned_Ref Comparison_String Aligned_Alt
|
|
2
|
+
gene1 transcript1:REF transcript1 . . NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:27)||||||||||||||||| . . . MSLASSANDMIDRSIDRSIDRSIDRS* MSLASSANDMIDRSIDRSIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA MSLASSANDMIDRSIDRSIDRSIDRS* ||||||||||||||||||||||||||| MSLASSANDMIDRSIDRSIDRSIDRS*
|
|
3
|
+
gene1 transcript1:1 transcript1 4 0.500000 NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion|||||||||||||| Del(5)S 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON] sample1(Hap2);sample2(Hap1);sample3(Homo) MSLASSANDMIDRSIDRSIDRSIDRS* MSLASANDMIDRSIDRSIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA ATGAGCCTAGCTTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA MSLASSANDMIDRSIDRSIDRSIDRS* |||| |||||||||||||||||||||| MSLA-SANDMIDRSIDRSIDRSIDRS*
|
|
4
|
+
gene1 transcript2:REF transcript2 . . NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:18)||||||||||||||||| . . . MSLASSANDIDRSIDRS* MSLASSANDIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA MSLASSANDIDRSIDRS* |||||||||||||||||| MSLASSANDIDRSIDRS*
|
|
5
|
+
gene1 transcript2:1 transcript2 4 0.500000 NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18)|||deletion|||||||||||||| Del(5)S 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON] sample1(Hap2);sample2(Hap1);sample3(Homo) MSLASSANDIDRSIDRS* MSLASANDIDRSIDRS* ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA ATGAGCCTAGCTTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA MSLASSANDIDRSIDRS* |||| ||||||||||||| MSLA-SANDIDRSIDRS*
|
braid/genome.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
import pysam
|
|
4
|
+
from collections import defaultdict, namedtuple
|
|
5
|
+
from Bio.Seq import Seq
|
|
6
|
+
|
|
7
|
+
from .utils import reverse_complement
|
|
8
|
+
|
|
9
|
+
# Immutable record types used by the package; positional field order is part of the interface.
# Gene: an annotated gene together with the list of its mRNAs.
Gene = namedtuple('Gene', 'id chrom start end strand mRNAs')
# mRNA: one transcript with its regions, CDS sequence and splice junctions.
mRNA = namedtuple('mRNA', 'id chrom start end strand regions cds_sequence splice_junctions')
# Region: a single annotated interval and its sequence.
Region = namedtuple('Region', 'type chrom start end strand sequence')
# Mutation: one variant allele (position, ref/alt, type, overlap flag, labels).
Mutation = namedtuple('Mutation', 'id chrom pos ref alt type overlapping labels')
|
|
13
|
+
|
|
14
|
+
class GenomeProcessor:
    """Build per-transcript genomic regions from an annotation and a FASTA.

    The annotation (GFF3 or GTF) is parsed into ``Gene`` / ``mRNA`` records
    stored in ``gene_data`` and ``mRNA_data``; the FASTA supplies the
    sequences stored in ``reference_sequences`` and the lengths stored in
    ``chrom_lengths``.  Coordinates are handled as 1-based, inclusive
    (sequence slices are taken as ``[start - 1:end]``).
    """

    # Column-9 attribute syntax: GTF is `key "value";`, GFF3 is `key=value;`.
    _GTF_ATTR_RE = re.compile(r'(\w+)\s+"([^"]+)"')
    _GFF3_ATTR_RE = re.compile(r'(\w+)=([^;]+)')

    # Transcript child feature type -> key used in ``mRNA.regions``.
    _CHILD_FEATURES = {
        'exon': 'EXON',
        'CDS': 'CDS',
        'five_prime_UTR': 'five_prime_UTR',
        'three_prime_UTR': 'three_prime_UTR',
    }
    # Child features whose regions are stored in ascending start order;
    # UTRs keep file order.
    _SORTED_CHILD_FEATURES = ('exon', 'CDS')

    def __init__(self, gff_file, fasta_file):
        """Remember the input paths and create empty result containers.

        Args:
            gff_file: Path to the GFF3/GTF annotation file.
            fasta_file: Path to the reference FASTA file.
        """
        self.gff_file = gff_file
        self.fasta_file = fasta_file
        self.gene_data = {}            # gene id -> Gene
        self.mRNA_data = {}            # transcript id -> mRNA
        self.reference_sequences = {}  # chromosome name -> full sequence
        self.chrom_lengths = {}        # chromosome name -> length

    @staticmethod
    def _detect_format(attr_field):
        """Guess 'gtf' or 'gff' from one attribute column and log the result."""
        if '"' in attr_field and ' ' in attr_field:
            logging.info("Detected GTF format.")
            return 'gtf'
        if '=' in attr_field:
            logging.info("Detected GFF3 format.")
        else:
            logging.warning("Could not definitively determine format. Assuming GFF3.")
        return 'gff'

    @classmethod
    def _parse_attributes(cls, attr_field, file_format):
        """Return the attribute column as a dict, parsed per ``file_format``."""
        pattern = cls._GTF_ATTR_RE if file_format == 'gtf' else cls._GFF3_ATTR_RE
        return dict(pattern.findall(attr_field))

    def _splice_junctions_for_intron(self, mrna_obj, intron):
        """Return the donor and acceptor junction dicts (in that order) for an intron.

        Each site is the two intron bases at one intron boundary; each window
        runs from 3 bases before to 8 bases after the low-coordinate boundary,
        or from 8 before to 3 after the high-coordinate boundary.  On the '+'
        strand the low-coordinate boundary is the donor; for any other strand
        value the two roles are swapped.
        """
        chrom_len = self.chrom_lengths.get(mrna_obj.chrom)

        def clamp(low, high):
            # Keep the window on the chromosome.  When the length is unknown
            # (FASTA not loaded yet, or chromosome absent from it) the upper
            # bound is left as is rather than collapsing the window end to 0.
            low = max(1, low)
            if chrom_len is not None:
                high = min(chrom_len, high)
            return low, high

        low_side = ((intron.start, intron.start + 1),
                    clamp(intron.start - 3, intron.start + 8))
        high_side = ((intron.end - 1, intron.end),
                     clamp(intron.end - 8, intron.end + 3))
        if mrna_obj.strand == '+':
            by_role = (('donor', low_side), ('acceptor', high_side))
        else:
            by_role = (('donor', high_side), ('acceptor', low_side))

        junctions = []
        for role, (site, window) in by_role:
            junctions.append({
                'type': role,
                'site': Region(f'splice_{role}_site', mrna_obj.chrom,
                               site[0], site[1], mrna_obj.strand, ''),
                'window': Region(f'splice_{role}_window', mrna_obj.chrom,
                                 window[0], window[1], mrna_obj.strand, ''),
            })
        return junctions

    def parse_gff(self):
        """Parse the annotation file into ``gene_data`` and ``mRNA_data``.

        The format (GTF vs. GFF3) is detected once, from the first feature
        line, and then used for every line.  Lines starting with '#' or with
        fewer than nine tab-separated columns are skipped.  A child feature
        (exon, CDS, UTR) is kept only if its parent transcript has already
        been read.  In GFF3 a comma-separated ``Parent`` attaches the feature
        to each listed transcript.

        For every transcript this also derives INTRON regions (gaps between
        consecutive exons) and, for introns with ``end - start >= 4``, the
        donor/acceptor splice junctions.

        Raises:
            OSError: If the annotation file cannot be opened.
            ValueError: If a start/end column is not an integer.
        """
        logging.info(f"Parsing GFF file: {self.gff_file}")
        gene_mRNAs = defaultdict(list)
        # feature type -> transcript id -> [(start, end), ...]
        children = {ftype: defaultdict(list) for ftype in self._CHILD_FEATURES}
        file_format = None

        with open(self.gff_file, 'r') as f:
            for line in f:
                if line.startswith('#'):
                    continue
                parts = line.strip().split('\t')
                if len(parts) < 9:
                    continue
                chrom, feature_type, strand = parts[0], parts[2], parts[6]
                start, end = int(parts[3]), int(parts[4])

                if file_format is None:
                    file_format = self._detect_format(parts[8])
                is_gtf = file_format == 'gtf'
                # Parse with the file-level format so that e.g. a GFF3 line
                # containing a quoted note is not misread as GTF and dropped.
                attributes = self._parse_attributes(parts[8], file_format)

                if feature_type == 'gene':
                    gene_id = attributes.get('gene_id' if is_gtf else 'ID')
                    if gene_id:
                        self.gene_data[gene_id] = Gene(gene_id, chrom, start, end, strand, [])
                elif feature_type in ('transcript', 'mRNA'):
                    mrna_id = attributes.get('transcript_id' if is_gtf else 'ID')
                    parent_gene_id = attributes.get('gene_id' if is_gtf else 'Parent')
                    if mrna_id and parent_gene_id:
                        self.mRNA_data[mrna_id] = mRNA(
                            mrna_id, chrom, start, end, strand, defaultdict(list), '', [])
                        gene_mRNAs[parent_gene_id].append(mrna_id)
                elif feature_type in children:
                    parent = attributes.get('transcript_id' if is_gtf else 'Parent')
                    if not parent:
                        continue
                    # GFF3 permits several comma-separated parents.
                    parent_ids = [parent] if is_gtf else parent.split(',')
                    for parent_id in parent_ids:
                        if parent_id in self.mRNA_data:
                            children[feature_type][parent_id].append((start, end))

        for mrna_id, mrna_obj in self.mRNA_data.items():
            for feature_type, region_key in self._CHILD_FEATURES.items():
                spans = children[feature_type].get(mrna_id, [])
                if feature_type in self._SORTED_CHILD_FEATURES:
                    spans = sorted(spans, key=lambda span: span[0])
                for span_start, span_end in spans:
                    # Chromosome and strand come from the owning transcript,
                    # never from whichever line happened to be read last.
                    mrna_obj.regions[region_key].append(
                        Region(region_key, mrna_obj.chrom, span_start, span_end,
                               mrna_obj.strand, ''))

            sorted_exons = sorted(mrna_obj.regions['EXON'], key=lambda r: r.start)
            for upstream, downstream in zip(sorted_exons, sorted_exons[1:]):
                intron_start = upstream.end + 1
                intron_end = downstream.start - 1
                if intron_start <= intron_end:
                    mrna_obj.regions['INTRON'].append(
                        Region('INTRON', mrna_obj.chrom, intron_start, intron_end,
                               mrna_obj.strand, ''))

            for intron in sorted(mrna_obj.regions['INTRON'], key=lambda r: r.start):
                if (intron.end - intron.start) < 4:
                    continue  # too short to carry splice junctions
                mrna_obj.splice_junctions.extend(
                    self._splice_junctions_for_intron(mrna_obj, intron))

        for gene_id, mrna_ids in gene_mRNAs.items():
            if gene_id in self.gene_data:
                self.gene_data[gene_id] = self.gene_data[gene_id]._replace(
                    mRNAs=[self.mRNA_data[mid] for mid in mrna_ids if mid in self.mRNA_data])
        logging.info("GFF parsing complete.")

    def load_reference_sequences(self):
        """Read every FASTA record into ``reference_sequences`` / ``chrom_lengths``.

        Raises:
            Exception: Any error raised while opening or reading the FASTA is
                logged and re-raised unchanged.
        """
        logging.info(f"Loading reference sequences from FASTA file: {self.fasta_file}")
        try:
            with pysam.FastaFile(self.fasta_file) as fasta:
                for chrom_name in fasta.references:
                    self.reference_sequences[chrom_name] = fasta.fetch(chrom_name)
                    self.chrom_lengths[chrom_name] = fasta.get_reference_length(chrom_name)
            logging.info("Reference sequences loaded.")
        except Exception as e:
            logging.error(f"Error loading FASTA file: {e}")
            raise

    def _with_sequence(self, region):
        """Return ``region`` with its sequence filled in from the reference.

        The region is returned unchanged when its chromosome has no loaded
        reference sequence.
        """
        reference = self.reference_sequences.get(region.chrom)
        if reference is None:
            return region
        return region._replace(sequence=reference[region.start - 1:region.end])

    def extract_region_sequences(self):
        """Fill in the sequence of every region, splice site and splice window.

        Region lists and junction dicts are updated in place, so records that
        share them (such as the transcripts held by ``gene_data``) see the
        sequences too.
        """
        logging.info("Extracting sequences for GFF regions.")
        for mrna_obj in self.mRNA_data.values():
            for regions in mrna_obj.regions.values():
                for i, region in enumerate(regions):
                    regions[i] = self._with_sequence(region)
            for junction in mrna_obj.splice_junctions:
                junction['site'] = self._with_sequence(junction['site'])
                junction['window'] = self._with_sequence(junction['window'])
        logging.info("Region sequence extraction complete.")

    def assemble_cds_sequences(self):
        """Concatenate each transcript's CDS regions into ``cds_sequence``.

        CDS segments are joined in ascending genomic start order.  The updated
        ``mRNA`` records replace the old ones in ``mRNA_data`` and in every
        gene's ``mRNAs`` list.
        """
        logging.info("Assembling CDS sequences for mRNAs.")
        for mrna_id, mrna_obj in self.mRNA_data.items():
            # NOTE(review): the joined sequence is forward-strand for every
            # transcript; nothing here reverse-complements '-' strand CDS.
            # Confirm that the consumers of cds_sequence handle strand.
            ordered_cds = sorted(mrna_obj.regions['CDS'], key=lambda r: r.start)
            full_cds_seq = "".join(cds.sequence for cds in ordered_cds)
            self.mRNA_data[mrna_id] = mrna_obj._replace(cds_sequence=full_cds_seq)
        for gene_id, gene_obj in self.gene_data.items():
            # namedtuples are immutable, so swap in the rebuilt transcripts.
            refreshed = [self.mRNA_data.get(old.id, old) for old in gene_obj.mRNAs]
            self.gene_data[gene_id] = gene_obj._replace(mRNAs=refreshed)
        logging.info("CDS sequence assembly complete.")
|
|
207
|
+
|