PyPI - PyBRAID - Versions diffs - 1.0.0__py3-none-any.whl - Mend

PyBRAID 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

braid/__init__.py +0 -0
braid/cli.py +156 -0
braid/data/test.fasta +3 -0
braid/data/test.fasta.fai +1 -0
braid/data/test.gff3 +15 -0
braid/data/test.vcf +8 -0
braid/data/test.vcf.gz +0 -0
braid/data/test.vcf.gz.csi +0 -0
braid/data/variant_analysis_output.alignment.txt +20 -0
braid/data/variant_analysis_output.log +191 -0
braid/data/variant_analysis_output.sample.txt +3 -0
braid/data/variant_analysis_output.tsv +5 -0
braid/genome.py +207 -0
braid/modifier.py +500 -0
braid/output.py +285 -0
braid/protein.py +141 -0
braid/utils.py +86 -0
braid/vcf.py +180 -0
pybraid-1.0.0.dist-info/METADATA +227 -0
pybraid-1.0.0.dist-info/RECORD +24 -0
pybraid-1.0.0.dist-info/WHEEL +5 -0
pybraid-1.0.0.dist-info/entry_points.txt +2 -0
pybraid-1.0.0.dist-info/licenses/LICENSE +21 -0
pybraid-1.0.0.dist-info/top_level.txt +1 -0

braid/__init__.py ADDED Viewed

File without changes

braid/cli.py ADDED Viewed

@@ -0,0 +1,156 @@
+import sys
+import argparse
+import os
+import pysam
+import logging
+from .utils import setup_logging, CheckpointManager
+from .modifier import SequenceModifier
+from .protein import ProteinAnalyzer
+from .genome import GenomeProcessor
+from .vcf import VCFProcessor
+from .output import ResultsOutputter
+try:
+    from importlib import resources
+except ImportError:
+    import importlib_resources as resources
+def get_test_data_path(filename):
+    try:
+        data_path = resources.files('braid').joinpath('data').joinpath(filename)
+        return str(data_path)
+    except AttributeError:
+        with resources.path('braid', 'data') as data_dir:
+            return str(data_dir / filename)
+def run_test():
+    vcf_path = get_test_data_path("test.vcf.gz")
+    fasta_path = get_test_data_path("test.fasta")
+    gff_path = get_test_data_path("test.gff3")
+    test_args = [
+        "-v", vcf_path,
+        "-r", fasta_path,
+        "-g", gff_path
+    ]
+    try:
+        main(test_args)
+    except Exception as e:
+        print(f"\n Failed: {e}")
+        import traceback
+        traceback.print_exc()
+def main(args_list=None):
+    """Parse command-line arguments and run the variant-effect pipeline.
+
+    Args:
+        args_list: Optional list of argument strings handed to argparse.
+            When None, argparse reads the process command line.
+
+    The process exits with status 1 when -g/-r/-v are missing, when the
+    --gene file does not exist, or when the pipeline raises an Exception.
+    It exits with status 0 when --gene matches no gene in the GFF.
+    """
+    parser = argparse.ArgumentParser(
+        description="Analyzes phased VCF data to predict variant effects on protein sequences for each haplotype.",
+        formatter_class=argparse.RawTextHelpFormatter
+    )
+    parser.add_argument('-g','--gff', help='Input GFF3 file.')
+    parser.add_argument('-r', '--reference', help='Reference genome FASTA file (indexed: .fai).')
+    parser.add_argument('-v', '--vcf',  help='Phased VCF file (indexed: .tbi/.csi).')
+    parser.add_argument('-o', '--output', default='variant_analysis_output.tsv', help='Output TSV file name.')
+    parser.add_argument('--force-unphased', action='store_true', help='Skip the phasing check and force the script to run on a potentially unphased VCF file.')
+    parser.add_argument('--ignore-intron', action='store_true', help='Ignore mutations marked only as intron.')
+    parser.add_argument('-s', '--sample', help='Path to file with specific sample IDs to analyze (one per line, no header).')
+    parser.add_argument('--gene', help='Path to file with specific gene IDs to analyze (one per line, no header).')
+    parser.add_argument('--lof-threshold', type=float, default=0.30, help="LOF classification threshold.")
+    parser.add_argument('--resume', action='store_true', help='Resume from the last successfully processed gene found in the log.')
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    parser_test = subparsers.add_parser("test", help="Run tests using built-in data.")
+    args = parser.parse_args(args_list)
+    # The "test" subcommand re-enters main() with the bundled data files.
+    if args.command == "test":
+        run_test()
+        return
+    # -g/-r/-v are declared optional so that "test" can run without them;
+    # enforce them here for a normal analysis run.
+    if not args.gff or not args.reference or not args.vcf:
+        parser.print_help()
+        logging.error("Error: Arguments -g, -r, and -v are required for analysis.")
+        sys.exit(1)
+    # The log, alignment and sample-matrix file names are derived from the
+    # output file name with its extension removed.
+    output_basename = os.path.splitext(args.output)[0]
+    args.log = f"{output_basename}.log"
+    setup_logging(args.log)
+    args.align = f"{output_basename}.alignment.txt"
+    args.sample_matrix = f"{output_basename}.sample.txt"
+    ckpt_manager = CheckpointManager(args.log)
+    # A fresh run overwrites the output files; resume mode appends to them.
+    open_mode = 'w'
+    if args.resume:
+        logging.info("Received (Resume Mode)...")
+        ckpt_manager.load_completed_genes()
+        # NOTE(review): presumably trims a partially written trailing record
+        # from each output file -- confirm against CheckpointManager.
+        ckpt_manager.truncate_if_incomplete(args.output, file_type='tsv')
+        ckpt_manager.truncate_if_incomplete(args.sample_matrix, file_type='tsv')
+        ckpt_manager.truncate_if_incomplete(args.align, file_type='align')
+        open_mode = 'a'
+    else:
+        logging.info("Start new task (Start from scratch)...")
+    # sys.exit() raises SystemExit, which is not an Exception subclass, so
+    # the exits inside this try block bypass the handler at the bottom.
+    try:
+        genome = GenomeProcessor(args.gff, args.reference)
+        genome.load_reference_sequences()
+        genome.parse_gff()
+        genome.extract_region_sequences()
+        genome.assemble_cds_sequences()
+        all_genes = list(genome.gene_data.items())
+        genes_to_process = all_genes
+        # Optionally restrict the run to the gene IDs listed in --gene.
+        if args.gene:
+            if not os.path.exists(args.gene):
+                logging.error(f"Gene list file not found: {args.gene}")
+                sys.exit(1)
+            logging.info(f"Loading target genes from {args.gene}...")
+            with open(args.gene, 'r') as f:
+                # Blank lines are ignored; surrounding whitespace is stripped.
+                target_gene_ids = set(line.strip() for line in f if line.strip())
+            genes_to_process = [(gid, obj) for gid, obj in all_genes if gid in target_gene_ids]
+            logging.info(f"Target genes loaded: {len(target_gene_ids)}. Found in GFF: {len(genes_to_process)}.")
+            if len(genes_to_process) == 0:
+                logging.warning("No target genes found in the GFF file! Please check IDs.")
+                sys.exit(0)
+        vcf_processor = VCFProcessor(genome, args.vcf, args.force_unphased, args.sample, args.ignore_intron)
+        sequence_modifier = SequenceModifier(genome)
+        protein_analyzer = ProteinAnalyzer(genome, args.lof_threshold)
+        results_outputter = ResultsOutputter(genome, vcf_processor, sequence_modifier, protein_analyzer)
+        # When appending to a non-empty output file, flag the header as
+        # already written (presumably makes write_header skip it -- confirm
+        # against ResultsOutputter).
+        if args.resume and os.path.exists(args.output) and os.path.getsize(args.output) > 0:
+            results_outputter.header_written = True
+        # buffering=1 requests line buffering for the three text outputs.
+        with pysam.VariantFile(args.vcf) as vcf_in, \
+             open(args.output, open_mode, buffering=1) as out_f, \
+             open(args.align, open_mode, buffering=1) as align_f, \
+             open(args.sample_matrix, open_mode, buffering=1) as sample_f:
+            vcf_processor.load_and_validate_samples(vcf_in)
+            results_outputter.write_header(out_f, sample_f)
+            logging.info(f"Starting pipeline. Total genes: {len(genes_to_process)}")
+            for i, (gene_id, gene_obj) in enumerate(genes_to_process, 1):
+                # Skip genes the checkpoint manager already lists as completed.
+                if gene_id in ckpt_manager.completed_genes:
+                    continue
+                logging.info(f"Processing Gene {i}/{len(genes_to_process)}: {gene_id}...")
+                gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
+                if gene_variant_combinations:
+                    results_outputter.analyze_and_write_gene_results(gene_obj, gene_variant_combinations, out_f, align_f, sample_f)
+                    # Flush all outputs before the gene is logged as complete.
+                    out_f.flush()
+                    align_f.flush()
+                    sample_f.flush()
+                # Logged for every processed gene, including genes that
+                # produced no variant combinations.
+                ckpt_manager.log_completion(gene_id)
+        logging.info("Pipeline finished successfully.")
+    except Exception as e:
+        logging.error(f"Pipeline failed: {e}", exc_info=True)
+        sys.exit(1)
+# Run the command-line interface when this module is executed directly.
+if __name__ == "__main__":
+    main()

braid/data/test.fasta ADDED Viewed

@@ -0,0 +1,3 @@
+>1
+CGTACGTAGCTATGAGCTTAGCTAGCTCAGCTAACGATGTCGTTAAGTAGATGATCGATC
+GATCGATCGATCGATCGGTCGATCGATCAGATCGATCGATCGATCGATCGATCGTGAC

braid/data/test.fasta.fai ADDED Viewed

@@ -0,0 +1 @@
+1	118	3	60	61

braid/data/test.gff3 ADDED Viewed

@@ -0,0 +1,15 @@
+1	ensembl	gene	2	117	.	+	.	ID=gene1;biotype=protein_coding;gene_id=gene1
+1	ensembl	mRNA	2	117	.	+	.	ID=transcript1;Parent=gene1;biotype=protein_coding;transcript_id=transcript1
+1	ensembl	five_prime_UTR	2	11	.	+	.	Parent=transcript1
+1	ensembl	exon	2	38	.	+	.	Parent=transcript1;Name=exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon1;rank=1;version=1
+1	ensembl	CDS	12	38	.	+	0	ID=CDS:protein1;Parent=transcript1;protein_id=protein1
+1	ensembl	exon	51	77	.	+	.	Parent=transcript1;Name=exon2;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon2;rank=2;version=1
+1	ensembl	CDS	51	77	.	+	0	ID=CDS:protein1;Parent=transcript1;protein_id=protein1
+1	ensembl	exon	91	117	.	+	.	Parent=transcript1;Name=exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon3;rank=3;version=1
+1	ensembl	CDS	91	117	.	+	0	ID=CDS:protein1;Parent=transcript1;protein_id=protein1
+1	ensembl	mRNA	2	77	.	+	.	ID=transcript2;Parent=gene1;biotype=protein_coding;transcript_id=transcript2
+1	ensembl	five_prime_UTR	2	11	.	+	.	Parent=transcript1
+1	ensembl	exon	2	38	.	+	.	Parent=transcript2;Name=exon1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon1;rank=1;version=1
+1	ensembl	CDS	12	38	.	+	0	ID=CDS:protein2;Parent=transcript2;protein_id=protein2
+1	ensembl	exon	91	117	.	+	.	Parent=transcript2;Name=exon3;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=exon3;rank=2;version=1
+1	ensembl	CDS	91	117	.	+	0	ID=CDS:protein2;Parent=transcript2;protein_id=protein2

braid/data/test.vcf ADDED Viewed

@@ -0,0 +1,8 @@
+##fileformat=VCFv4.2
+##contig=<ID=1>
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample1	sample2	sample3	sample4
+1	17	.	CTTAG	C	.	PASS	.	GT	0|1	1|0	1|1	0|0
+1	27	.	T	TT	.	PASS	.	GT	0|1	1|0	1|1	0|0

braid/data/test.vcf.gz ADDED Viewed

Binary file

braid/data/test.vcf.gz.csi ADDED Viewed

Binary file

braid/data/variant_analysis_output.alignment.txt ADDED Viewed

@@ -0,0 +1,20 @@
+Haplotype_ID: transcript1:1
+Gene: gene1 | mRNA: transcript1
+Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
+Variant_Type: NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion||||||||||||||
+Protein_Changes: Del(5)S
+Alignment:
+  Ref: MSLASSANDMIDRSIDRSIDRSIDRS*
+       |||| ||||||||||||||||||||||
+  Alt: MSLA-SANDMIDRSIDRSIDRSIDRS*
+Haplotype_ID: transcript2:1
+Gene: gene1 | mRNA: transcript2
+Haplotype_Mutations: 1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]
+Variant_Type: NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18)|||deletion||||||||||||||
+Protein_Changes: Del(5)S
+Alignment:
+  Ref: MSLASSANDIDRSIDRS*
+       |||| |||||||||||||
+  Alt: MSLA-SANDIDRSIDRS*

braid/data/variant_analysis_output.log ADDED Viewed

@@ -0,0 +1,191 @@
+2026-01-16 15:41:45,245 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:41:45,246 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:41:45,246 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:41:45,246 - INFO - Reference sequences loaded.
+2026-01-16 15:41:45,246 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff
+2026-01-16 15:41:45,246 - ERROR - Pipeline failed: [Errno 2] No such file or directory: '/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff'
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 95, in main
+    genome.parse_gff()
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/genome.py", line 31, in parse_gff
+    with open(self.gff_file, 'r') as f:
+FileNotFoundError: [Errno 2] No such file or directory: '/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff'
+2026-01-16 15:43:18,538 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:43:18,539 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:43:18,539 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:43:18,539 - INFO - Reference sequences loaded.
+2026-01-16 15:43:18,539 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 15:43:18,539 - INFO - Detected GFF3 format.
+2026-01-16 15:43:18,540 - INFO - GFF parsing complete.
+2026-01-16 15:43:18,540 - INFO - Extracting sequences for GFF regions.
+2026-01-16 15:43:18,540 - INFO - Region sequence extraction complete.
+2026-01-16 15:43:18,540 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 15:43:18,540 - INFO - CDS sequence assembly complete.
+2026-01-16 15:43:18,540 - ERROR - Pipeline failed: name 'PairwiseAligner' is not defined
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 115, in main
+    sequence_modifier = SequenceModifier(genome)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/modifier.py", line 11, in __init__
+    self.aligner = PairwiseAligner()
+NameError: name 'PairwiseAligner' is not defined
+2026-01-16 15:45:26,079 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:45:26,080 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:45:26,080 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:45:26,080 - INFO - Reference sequences loaded.
+2026-01-16 15:45:26,080 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 15:45:26,080 - INFO - Detected GFF3 format.
+2026-01-16 15:45:26,080 - INFO - GFF parsing complete.
+2026-01-16 15:45:26,081 - INFO - Extracting sequences for GFF regions.
+2026-01-16 15:45:26,081 - INFO - Region sequence extraction complete.
+2026-01-16 15:45:26,081 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 15:45:26,081 - INFO - CDS sequence assembly complete.
+2026-01-16 15:45:26,082 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 15:45:26,082 - ERROR - Pipeline failed: name 'pysam' is not defined
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 123, in main
+    with pysam.VariantFile(args.vcf) as vcf_in, \
+NameError: name 'pysam' is not defined
+2026-01-16 15:46:22,451 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:46:22,451 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:46:22,451 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:46:22,455 - INFO - Reference sequences loaded.
+2026-01-16 15:46:22,455 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 15:46:22,455 - INFO - Detected GFF3 format.
+2026-01-16 15:46:22,455 - INFO - GFF parsing complete.
+2026-01-16 15:46:22,455 - INFO - Extracting sequences for GFF regions.
+2026-01-16 15:46:22,456 - INFO - Region sequence extraction complete.
+2026-01-16 15:46:22,456 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 15:46:22,456 - INFO - CDS sequence assembly complete.
+2026-01-16 15:46:22,457 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 15:46:22,458 - INFO - Loading and validating VCF samples...
+2026-01-16 15:46:22,459 - ERROR - Pipeline failed: name 'os' is not defined
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 129, in main
+    vcf_processor.load_and_validate_samples(vcf_in)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 28, in load_and_validate_samples
+    if not os.path.exists(vcf_path + ".tbi") and not os.path.exists(vcf_path + ".csi"):
+NameError: name 'os' is not defined
+2026-01-16 15:47:15,122 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:47:15,122 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:47:15,123 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:47:15,123 - INFO - Reference sequences loaded.
+2026-01-16 15:47:15,123 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 15:47:15,123 - INFO - Detected GFF3 format.
+2026-01-16 15:47:15,123 - INFO - GFF parsing complete.
+2026-01-16 15:47:15,123 - INFO - Extracting sequences for GFF regions.
+2026-01-16 15:47:15,124 - INFO - Region sequence extraction complete.
+2026-01-16 15:47:15,124 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 15:47:15,124 - INFO - CDS sequence assembly complete.
+2026-01-16 15:47:15,125 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 15:47:15,126 - INFO - Loading and validating VCF samples...
+2026-01-16 15:47:15,126 - ERROR - Index tbi/csi is missing, run 'bcftools index /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.vcf' to produce index
+2026-01-16 15:47:15,126 - ERROR - Pipeline failed: name 'sys' is not defined
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 129, in main
+    vcf_processor.load_and_validate_samples(vcf_in)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 33, in load_and_validate_samples
+    sys.exit(1)
+NameError: name 'sys' is not defined
+2026-01-16 15:48:10,135 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:48:10,135 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:48:10,135 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:48:10,135 - INFO - Reference sequences loaded.
+2026-01-16 15:48:10,135 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 15:48:10,135 - INFO - Detected GFF3 format.
+2026-01-16 15:48:10,136 - INFO - GFF parsing complete.
+2026-01-16 15:48:10,136 - INFO - Extracting sequences for GFF regions.
+2026-01-16 15:48:10,136 - INFO - Region sequence extraction complete.
+2026-01-16 15:48:10,136 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 15:48:10,136 - INFO - CDS sequence assembly complete.
+2026-01-16 15:48:10,137 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 15:48:10,138 - INFO - Loading and validating VCF samples...
+2026-01-16 15:48:10,138 - ERROR - Index tbi/csi is missing, run 'bcftools index /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.vcf' to produce index
+2026-01-16 15:49:38,190 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 15:49:38,190 - INFO - Start new task (Start from scratch)...
+2026-01-16 15:49:38,190 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 15:49:38,191 - INFO - Reference sequences loaded.
+2026-01-16 15:49:38,191 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 15:49:38,191 - INFO - Detected GFF3 format.
+2026-01-16 15:49:38,191 - INFO - GFF parsing complete.
+2026-01-16 15:49:38,191 - INFO - Extracting sequences for GFF regions.
+2026-01-16 15:49:38,191 - INFO - Region sequence extraction complete.
+2026-01-16 15:49:38,191 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 15:49:38,191 - INFO - CDS sequence assembly complete.
+2026-01-16 15:49:38,193 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 15:49:38,194 - INFO - Loading and validating VCF samples...
+2026-01-16 15:49:38,194 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
+2026-01-16 15:49:38,194 - INFO - Checking VCF for phased genotype information...
+2026-01-16 15:49:38,194 - INFO - Phasing check passed. VCF appears to be phased.
+2026-01-16 15:49:38,194 - INFO - Starting pipeline. Total genes: 1
+2026-01-16 15:49:38,194 - INFO - Processing Gene 1/1: gene1...
+2026-01-16 15:49:38,195 - ERROR - Pipeline failed: name 'Mutation' is not defined
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 138, in main
+    gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 139, in process_variants_for_gene
+    mut_obj = self._create_mutation_from_record(record, alleles[allele_idx], labels)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 92, in _create_mutation_from_record
+    return Mutation(mutation_id, chrom, record.pos, ref, alt, mut_type, False, labels)
+NameError: name 'Mutation' is not defined
+2026-01-16 18:10:14,143 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 18:10:14,158 - INFO - Start new task (Start from scratch)...
+2026-01-16 18:10:14,158 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 18:10:14,159 - INFO - Reference sequences loaded.
+2026-01-16 18:10:14,159 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 18:10:14,159 - INFO - Detected GFF3 format.
+2026-01-16 18:10:14,160 - INFO - GFF parsing complete.
+2026-01-16 18:10:14,160 - INFO - Extracting sequences for GFF regions.
+2026-01-16 18:10:14,160 - INFO - Region sequence extraction complete.
+2026-01-16 18:10:14,160 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 18:10:14,160 - INFO - CDS sequence assembly complete.
+2026-01-16 18:10:14,162 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 18:10:14,163 - INFO - Loading and validating VCF samples...
+2026-01-16 18:10:14,163 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
+2026-01-16 18:10:14,163 - INFO - Checking VCF for phased genotype information...
+2026-01-16 18:10:14,164 - INFO - Phasing check passed. VCF appears to be phased.
+2026-01-16 18:10:14,164 - INFO - Starting pipeline. Total genes: 1
+2026-01-16 18:10:14,164 - INFO - Processing Gene 1/1: gene1...
+2026-01-16 18:10:14,164 - ERROR - Pipeline failed: name 'Mutation' is not defined
+Traceback (most recent call last):
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/cli.py", line 138, in main
+    gene_variant_combinations = vcf_processor.process_variants_for_gene(gene_obj, vcf_in)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 139, in process_variants_for_gene
+    mut_obj = self._create_mutation_from_record(record, alleles[allele_idx], labels)
+  File "/mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/vcf.py", line 92, in _create_mutation_from_record
+    return Mutation(mutation_id, chrom, record.pos, ref, alt, mut_type, False, labels)
+NameError: name 'Mutation' is not defined
+2026-01-16 18:14:22,484 - INFO - Logging configured. Log file: variant_analysis_output.log
+2026-01-16 18:14:22,494 - INFO - Start new task (Start from scratch)...
+2026-01-16 18:14:22,494 - INFO - Loading reference sequences from FASTA file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.fasta
+2026-01-16 18:14:22,494 - INFO - Reference sequences loaded.
+2026-01-16 18:14:22,494 - INFO - Parsing GFF file: /mnt/gs21/scratch/huang292/lab/beagle/py_haplo/BRAID/src/braid/data/test.gff3
+2026-01-16 18:14:22,495 - INFO - Detected GFF3 format.
+2026-01-16 18:14:22,495 - INFO - GFF parsing complete.
+2026-01-16 18:14:22,495 - INFO - Extracting sequences for GFF regions.
+2026-01-16 18:14:22,495 - INFO - Region sequence extraction complete.
+2026-01-16 18:14:22,495 - INFO - Assembling CDS sequences for mRNAs.
+2026-01-16 18:14:22,495 - INFO - CDS sequence assembly complete.
+2026-01-16 18:14:22,497 - INFO - ProteinAnalyzer initialized with full, integrated functionality.
+2026-01-16 18:14:22,498 - INFO - Loading and validating VCF samples...
+2026-01-16 18:14:22,498 - INFO - No sample file provided. Analyzing all 4 samples found in VCF.
+2026-01-16 18:14:22,498 - INFO - Checking VCF for phased genotype information...
+2026-01-16 18:14:22,498 - INFO - Phasing check passed. VCF appears to be phased.
+2026-01-16 18:14:22,498 - INFO - Starting pipeline. Total genes: 1
+2026-01-16 18:14:22,498 - INFO - Processing Gene 1/1: gene1...
+2026-01-16 18:14:22,500 - INFO - pos: 27, ref_crosses_boundary: False
+2026-01-16 18:14:22,500 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['T']
+2026-01-16 18:14:22,500 - INFO - alt_for_cds: TT
+2026-01-16 18:14:22,500 - INFO - pos: 27, len(cds_seq_list_after_alt): 82
+2026-01-16 18:14:22,500 - INFO - pos: 17, ref_crosses_boundary: False
+2026-01-16 18:14:22,500 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['C', 'T', 'T', 'A', 'G']
+2026-01-16 18:14:22,500 - INFO - alt_for_cds: C
+2026-01-16 18:14:22,500 - INFO - pos: 17, len(cds_seq_list_after_alt): 78
+2026-01-16 18:14:22,501 - INFO - pos: 27, ref_crosses_boundary: False
+2026-01-16 18:14:22,501 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['T']
+2026-01-16 18:14:22,501 - INFO - alt_for_cds: TT
+2026-01-16 18:14:22,501 - INFO - pos: 27, len(cds_seq_list_after_alt): 55
+2026-01-16 18:14:22,501 - INFO - pos: 17, ref_crosses_boundary: False
+2026-01-16 18:14:22,501 - INFO - cds_seq_list[dynamic_cds_pos : dynamic_cds_pos + num_bases_to_remove]: ['C', 'T', 'T', 'A', 'G']
+2026-01-16 18:14:22,501 - INFO - alt_for_cds: C
+2026-01-16 18:14:22,501 - INFO - pos: 17, len(cds_seq_list_after_alt): 51
+2026-01-16 18:14:22,501 - INFO - Finished processing gene: gene1

braid/data/variant_analysis_output.sample.txt ADDED Viewed

@@ -0,0 +1,3 @@
+Gene_ID	mRNA_ID	Ref_ID	Alt_IDs	sample1	sample2	sample3	sample4
+gene1	transcript1	transcript1:REF	transcript1:1	0|1	1|0	1|1	0|0
+gene1	transcript2	transcript2:REF	transcript2:1	0|1	1|0	1|1	0|0

braid/data/variant_analysis_output.tsv ADDED Viewed

@@ -0,0 +1,5 @@
+Gene_ID	Haplotype_ID	mRNA	Haplotype_Count	Frequency	Variant_Type	Protein_Changes	Haplotype_Mutations	Sample_Sources	Ref_Protein	Alt_Protein	Ref_CDS	Alt_CDS	Aligned_Ref	Comparison_String	Aligned_Alt
+gene1	transcript1:REF	transcript1	.	.	NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:27)|||||||||||||||||	.	.	.	MSLASSANDMIDRSIDRSIDRSIDRS*	MSLASSANDMIDRSIDRSIDRSIDRS*	ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA	ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA	MSLASSANDMIDRSIDRSIDRSIDRS*	|||||||||||||||||||||||||||	MSLASSANDMIDRSIDRSIDRSIDRS*
+gene1	transcript1:1	transcript1	4	0.500000	NoLOF(non_identity_rate:3.70%,non_identical_AAs:1,total_ref_AAs:27)|||deletion||||||||||||||	Del(5)S	1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]	sample1(Hap2);sample2(Hap1);sample3(Homo)	MSLASSANDMIDRSIDRSIDRSIDRS*	MSLASANDMIDRSIDRSIDRSIDRS*	ATGAGCTTAGCTAGCTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA	ATGAGCCTAGCTTCAGCTAACGATATGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGTGA	MSLASSANDMIDRSIDRSIDRSIDRS*	|||| ||||||||||||||||||||||	MSLA-SANDMIDRSIDRSIDRSIDRS*
+gene1	transcript2:REF	transcript2	.	.	NoLOF(non_identity_rate:0.00%,non_identical_AAs:0,total_ref_AAs:18)|||||||||||||||||	.	.	.	MSLASSANDIDRSIDRS*	MSLASSANDIDRSIDRS*	ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA	ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA	MSLASSANDIDRSIDRS*	||||||||||||||||||	MSLASSANDIDRSIDRS*
+gene1	transcript2:1	transcript2	4	0.500000	NoLOF(non_identity_rate:5.56%,non_identical_AAs:1,total_ref_AAs:18)|||deletion||||||||||||||	Del(5)S	1:17_CTTAG>C[CDS,EXON];1:27_T>TT[CDS,EXON]	sample1(Hap2);sample2(Hap1);sample3(Homo)	MSLASSANDIDRSIDRS*	MSLASANDIDRSIDRS*	ATGAGCTTAGCTAGCTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA	ATGAGCCTAGCTTCAGCTAACGATATCGATCGATCGATCGATCGATCGTGA	MSLASSANDIDRSIDRS*	|||| |||||||||||||	MSLA-SANDIDRSIDRS*

braid/genome.py ADDED Viewed

@@ -0,0 +1,207 @@
+import logging
+import re
+import pysam
+from collections import defaultdict, namedtuple
+from Bio.Seq import Seq
+from .utils import reverse_complement
+# Lightweight immutable records used by GenomeProcessor.
+# Gene: one annotated gene; `mRNAs` is the list of mRNA records linked to it.
+Gene = namedtuple('Gene', ['id', 'chrom', 'start', 'end', 'strand', 'mRNAs'])
+# mRNA: one transcript. `regions` maps a region type (e.g. 'EXON', 'CDS', 'INTRON') to a
+# list of Region records, `cds_sequence` holds the concatenated CDS sequence, and
+# `splice_junctions` is a list of dicts with the keys 'type', 'site' and 'window'.
+mRNA = namedtuple('mRNA', ['id', 'chrom', 'start', 'end', 'strand', 'regions', 'cds_sequence', 'splice_junctions'])
+# Region: a genomic interval of a given type. `start`/`end` are sliced as 1-based inclusive
+# coordinates; `sequence` stays '' until the reference sequence has been extracted.
+Region = namedtuple('Region', ['type', 'chrom', 'start', 'end', 'strand', 'sequence'])
+# Mutation: a variant record. It is not constructed in the visible part of this module;
+# NOTE(review): field meanings presumably follow VCF (pos/ref/alt) - confirm against the producer.
+Mutation = namedtuple('Mutation', ['id', 'chrom', 'pos', 'ref', 'alt', 'type', 'overlapping', 'labels'])
+class GenomeProcessor:
+    def __init__(self, gff_file, fasta_file):
+        self.gff_file = gff_file
+        self.fasta_file = fasta_file
+        self.gene_data = {}
+        self.mRNA_data = {}
+        self.reference_sequences = {}
+        self.chrom_lengths = {}
+    def parse_gff(self):
+        logging.info(f"Parsing GFF file: {self.gff_file}")
+        gene_mRNAs = defaultdict(list)
+        mrna_exons = defaultdict(list)
+        mrna_cds = defaultdict(list)
+        mrna_five_prime_utrs = defaultdict(list)
+        mrna_three_prime_utrs = defaultdict(list)
+        file_format = None
+        with open(self.gff_file, 'r') as f:
+            for line in f:
+                if line.startswith('#'): continue
+                parts = line.strip().split('\t')
+                if len(parts) < 9: continue
+                chrom = parts[0]
+                feature_type = parts[2]
+                start, end = int(parts[3]), int(parts[4])
+                strand = parts[6]
+                if '"' in parts[8] and ' ' in parts[8]:
+                    if file_format is None:
+                        logging.info("Detected GTF format.")
+                        file_format = 'gtf'
+                    attributes = dict(re.findall(r'(\w+)\s+"([^"]+)"', parts[8]))
+                elif '=' in parts[8]:
+                    if file_format is None:
+                        logging.info("Detected GFF3 format.")
+                        file_format = 'gff'
+                    attributes = dict(re.findall(r'(\w+)=([^;]+)', parts[8]))
+                else:
+                    if file_format is None:
+                        logging.warning("Could not definitively determine format. Assuming GFF3.")
+                        file_format = 'gff'
+                    attributes = dict(re.findall(r'(\w+)=([^;]+)', parts[8]))
+                if file_format == 'gtf':
+                    if feature_type == 'gene':
+                        gene_id = attributes.get('gene_id')
+                        if gene_id: self.gene_data[gene_id] = Gene(gene_id, chrom, start, end, strand, [])
+                    elif feature_type == 'transcript' or feature_type == 'mRNA':
+                        mrna_id, parent_gene_id = attributes.get('transcript_id'), attributes.get('gene_id')
+                        if mrna_id and parent_gene_id:
+                            self.mRNA_data[mrna_id] = mRNA(mrna_id, chrom, start, end, strand, defaultdict(list), '', [])
+                            gene_mRNAs[parent_gene_id].append(mrna_id)
+                    elif feature_type == 'exon':
+                        parent_mrna_id = attributes.get('transcript_id')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_exons[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                    elif feature_type == 'CDS':
+                        parent_mrna_id = attributes.get('transcript_id')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_cds[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                    elif feature_type == 'five_prime_UTR':
+                        parent_mrna_id = attributes.get('transcript_id')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_five_prime_utrs[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                    elif feature_type == 'three_prime_UTR':
+                        parent_mrna_id = attributes.get('transcript_id')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_three_prime_utrs[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                elif file_format == 'gff':
+                    if feature_type == 'gene':
+                        gene_id = attributes.get('ID')
+                        if gene_id: self.gene_data[gene_id] = Gene(gene_id, chrom, start, end, strand, [])
+                    elif feature_type == 'transcript' or feature_type == 'mRNA':
+                        mrna_id, parent_gene_id = attributes.get('ID'), attributes.get('Parent')
+                        if mrna_id and parent_gene_id:
+                            self.mRNA_data[mrna_id] = mRNA(mrna_id, chrom, start, end, strand, defaultdict(list), '', [])
+                            gene_mRNAs[parent_gene_id].append(mrna_id)
+                    elif feature_type == 'exon':
+                        parent_mrna_id = attributes.get('Parent')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_exons[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                    elif feature_type == 'CDS':
+                        parent_mrna_id = attributes.get('Parent')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_cds[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                    elif feature_type == 'five_prime_UTR':
+                        parent_mrna_id = attributes.get('Parent')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_five_prime_utrs[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+                    elif feature_type == 'three_prime_UTR':
+                        parent_mrna_id = attributes.get('Parent')
+                        if parent_mrna_id in self.mRNA_data:
+                            mrna_three_prime_utrs[parent_mrna_id].append({'chrom': chrom, 'start': start, 'end': end})
+        for mrna_id in mrna_exons: mrna_exons[mrna_id].sort(key=lambda x: x['start'])
+        for mrna_id in mrna_cds: mrna_cds[mrna_id].sort(key=lambda x: x['start'])
+        for mrna_id, mrna_obj in self.mRNA_data.items():
+            for exon in mrna_exons.get(mrna_id, []):
+                mrna_obj.regions['EXON'].append(Region('EXON', mrna_obj.chrom, exon['start'], exon['end'], mrna_obj.strand, ''))
+            for cds in mrna_cds.get(mrna_id, []):
+                 mrna_obj.regions['CDS'].append(Region('CDS', mrna_obj.chrom, cds['start'], cds['end'], mrna_obj.strand, ''))
+            for utr in mrna_five_prime_utrs.get(mrna_id, []):
+                mrna_obj.regions['five_prime_UTR'].append(Region('five_prime_UTR', chrom, utr['start'], utr['end'], strand, ''))
+            for utr in mrna_three_prime_utrs.get(mrna_id, []):
+                mrna_obj.regions['three_prime_UTR'].append(Region('three_prime_UTR', chrom, utr['start'], utr['end'], strand, ''))
+            sorted_exons = sorted(mrna_obj.regions['EXON'], key=lambda r: r.start)
+            if len(sorted_exons) > 1:
+                for i in range(len(sorted_exons) - 1):
+                    intron_start = sorted_exons[i].end + 1
+                    intron_end = sorted_exons[i+1].start - 1
+                    if intron_start <= intron_end:
+                        mrna_obj.regions['INTRON'].append(Region('INTRON', mrna_obj.chrom, intron_start, intron_end, mrna_obj.strand, ''))
+            sorted_introns = sorted(mrna_obj.regions['INTRON'], key=lambda r: r.start)
+            filtered_introns = [i for i in sorted_introns if (i.end - i.start) >= 4]
+            for intron in filtered_introns:
+                chrom_len = self.chrom_lengths.get(mrna_obj.chrom, 0)
+                if mrna_obj.strand == '+': # define donor and acceptor sites and their windows
+                    donor_start, donor_end = intron.start, intron.start + 1
+                    acceptor_start, acceptor_end = intron.end - 1, intron.end
+                    donor_window_start = max(1, intron.start - 3)
+                    donor_window_end = min(chrom_len, intron.start + 8)
+                    acceptor_window_start = max(1, intron.end - 8)
+                    acceptor_window_end = min(chrom_len, intron.end + 3)
+                else:
+                    donor_start, donor_end = intron.end - 1, intron.end
+                    acceptor_start, acceptor_end = intron.start, intron.start + 1
+                    donor_window_start = max(1, intron.end - 8)
+                    donor_window_end = min(chrom_len, intron.end + 3)
+                    acceptor_window_start = max(1, intron.start - 3)
+                    acceptor_window_end = min(chrom_len, intron.start + 8)
+                if donor_start <= donor_end:
+                    donor_junction = {'type': 'donor', 'site': Region('splice_donor_site', mrna_obj.chrom, donor_start, donor_end, mrna_obj.strand, ''), 'window': Region('splice_donor_window', mrna_obj.chrom, donor_window_start, donor_window_end, mrna_obj.strand, '')}
+                    mrna_obj.splice_junctions.append(donor_junction)
+                else:
+                    logging.warning(f"Invalid donor site coordinates for mRNA {mrna_obj.id}: start ({donor_start}) > end ({donor_end}). Skipping.")
+                if acceptor_start <= acceptor_end:
+                    acceptor_junction = {'type': 'acceptor', 'site': Region('splice_acceptor_site', mrna_obj.chrom, acceptor_start, acceptor_end, mrna_obj.strand, ''), 'window': Region('splice_acceptor_window', mrna_obj.chrom, acceptor_window_start, acceptor_window_end, mrna_obj.strand, '')}
+                    mrna_obj.splice_junctions.append(acceptor_junction)
+                else:
+                    logging.warning(f"Invalid acceptor site coordinates for mRNA {mrna_obj.id}: start ({acceptor_start}) > end ({acceptor_end}). Skipping.")
+        for gene_id, mrna_ids in gene_mRNAs.items():
+            if gene_id in self.gene_data:
+                self.gene_data[gene_id] = self.gene_data[gene_id]._replace(mRNAs=[self.mRNA_data[mid] for mid in mrna_ids if mid in self.mRNA_data])
+        logging.info("GFF parsing complete.")
+    def load_reference_sequences(self):
+        logging.info(f"Loading reference sequences from FASTA file: {self.fasta_file}")
+        try:
+            with pysam.FastaFile(self.fasta_file) as fasta:
+                for chrom_name in fasta.references:
+                   self.reference_sequences[chrom_name] = fasta.fetch(chrom_name)
+                   self.chrom_lengths[chrom_name] = fasta.get_reference_length(chrom_name)
+            logging.info("Reference sequences loaded.")
+        except Exception as e:
+            logging.error(f"Error loading FASTA file: {e}")
+            raise
+    def extract_region_sequences(self):
+        logging.info("Extracting sequences for GFF regions.")
+        for mrna_obj in self.mRNA_data.values():
+            for regions in mrna_obj.regions.values():
+                for i, region in enumerate(regions):
+                    if region.chrom not in self.reference_sequences: continue
+                    seq = self.reference_sequences[region.chrom][region.start - 1:region.end]
+                    regions[i] = region._replace(sequence=seq)
+            if hasattr(mrna_obj, 'splice_junctions'):
+                for i, junction in enumerate(mrna_obj.splice_junctions):
+                    site_region = junction['site']
+                    window_region = junction['window']
+                    if site_region.chrom in self.reference_sequences:
+                        site_seq = self.reference_sequences[site_region.chrom][site_region.start - 1:site_region.end]
+                        mrna_obj.splice_junctions[i]['site'] = site_region._replace(sequence=site_seq)
+                    if window_region.chrom in self.reference_sequences:
+                        window_seq = self.reference_sequences[window_region.chrom][window_region.start - 1:window_region.end]
+                        mrna_obj.splice_junctions[i]['window'] = window_region._replace(sequence=window_seq)
+        logging.info("Region sequence extraction complete.")
+    def assemble_cds_sequences(self):
+        logging.info("Assembling CDS sequences for mRNAs.")
+        for mrna_id, mrna_obj in self.mRNA_data.items():
+            cds_regions = mrna_obj.regions['CDS']
+            sorted_cds = sorted(cds_regions, key=lambda r: r.start)
+            full_cds_seq = "".join(cds.sequence for cds in sorted_cds)
+            self.mRNA_data[mrna_id] = mrna_obj._replace(cds_sequence=full_cds_seq)
+        for gene_id, gene_obj in self.gene_data.items():
+            updated_mRNAs = []
+            for old_mrna in gene_obj.mRNAs:
+                if old_mrna.id in self.mRNA_data:
+                    updated_mRNAs.append(self.mRNA_data[old_mrna.id])
+                else:
+                    updated_mRNAs.append(old_mrna)
+            new_gene_obj = gene_obj._replace(mRNAs=updated_mRNAs)
+            self.gene_data[gene_id] = new_gene_obj
+        logging.info("CDS sequence assembly complete.")