py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/io/output.py ADDED
@@ -0,0 +1,361 @@
+ """
+ Output Writers: Formatting results for VCF and MAF.
+ 
+ This module provides classes to write processed variants and their counts
+ to output files, handling format-specific columns and headers.
+ """
+ 
+ import csv
+ from pathlib import Path
+ from typing import Any
+ 
+ from ..models.core import Variant
+ 
+ # Import BaseCounts from Rust extension via a wrapper or direct import if possible
+ # For now, we assume we receive objects with the attributes defined in Rust
+ # We can define a Protocol for type hinting if needed.
+ 
+ 
+ class OutputWriter:
+     """Abstract base class for output writers."""
+ 
+     def write(self, variant: Variant, counts: Any):
+         raise NotImplementedError
+ 
+     def close(self):
+         pass
+ 
+ 
+ class MafWriter(OutputWriter):
+     """Writes results to a MAF-like file (Fillout format)."""
+ 
+     def __init__(self, path: Path):
+         self.path = path
+         self.file = open(path, "w")
+         self.writer: csv.DictWriter | None = None
+         self._headers_written = False
+ 
+     def _init_writer(self):
+         # Standard GDC MAF columns (plus our custom ones)
+         # Based on GDC MAF Format v1.0.0
+         self.fieldnames = [
+             "Hugo_Symbol",
+             "Entrez_Gene_Id",
+             "Center",
+             "NCBI_Build",
+             "Chromosome",
+             "Start_Position",
+             "End_Position",
+             "Strand",
+             "Variant_Classification",
+             "Variant_Type",
+             "Reference_Allele",
+             "Tumor_Seq_Allele1",
+             "Tumor_Seq_Allele2",
+             "dbSNP_RS",
+             "dbSNP_Val_Status",
+             "Tumor_Sample_Barcode",
+             "Matched_Norm_Sample_Barcode",
+             "Match_Norm_Seq_Allele1",
+             "Match_Norm_Seq_Allele2",
+             "Tumor_Validation_Allele1",
+             "Tumor_Validation_Allele2",
+             "Match_Norm_Validation_Allele1",
+             "Match_Norm_Validation_Allele2",
+             "Verification_Status",
+             "Validation_Status",
+             "Mutation_Status",
+             "Sequencing_Phase",
+             "Sequence_Source",
+             "Validation_Method",
+             "Score",
+             "BAM_File",
+             "Sequencer",
+             "Tumor_Sample_UUID",
+             "Matched_Norm_Sample_UUID",
+             "HGVSc",
+             "HGVSp",
+             "HGVSp_Short",
+             "Transcript_ID",
+             "Exon_Number",
+             "t_depth",
+             "t_ref_count",
+             "t_alt_count",
+             "n_depth",
+             "n_ref_count",
+             "n_alt_count",
+             "all_effects",
+             "Allele",
+             "Gene",
+             "Feature",
+             "Feature_type",
+             "Consequence",
+             "cDNA_position",
+             "CDS_position",
+             "Protein_position",
+             "Amino_acids",
+             "Codons",
+             "Existing_variation",
+             "DISTANCE",
+             "STRAND",
+             "FLAGS",
+             "SYMBOL",
+             "SYMBOL_SOURCE",
+             "HGNC_ID",
+             "BIOTYPE",
+             "CANONICAL",
+             "CCDS",
+             "ENSP",
+             "SWISSPROT",
+             "TREMBL",
+             "UNIPARC",
+             "RefSeq",
+             "SIFT",
+             "PolyPhen",
+             "EXON",
+             "INTRON",
+             "DOMAINS",
+             "GMAF",
+             "AFR_MAF",
+             "AMR_MAF",
+             "ASN_MAF",
+             "EUR_MAF",
+             "AA_MAF",
+             "EA_MAF",
+             "CLIN_SIG",
+             "SOMATIC",
+             "PUBMED",
+             "MOTIF_NAME",
+             "MOTIF_POS",
+             "HIGH_INF_POS",
+             "MOTIF_SCORE_CHANGE",
+             "IMPACT",
+             "PICK",
+             "VARIANT_CLASS",
+             "TSL",
+             "HGVS_OFFSET",
+             "PHENO",
+             "MINIMISED",
+             "ExAC_AF",
+             "ExAC_AF_AFR",
+             "ExAC_AF_AMR",
+             "ExAC_AF_EAS",
+             "ExAC_AF_FIN",
+             "ExAC_AF_NFE",
+             "ExAC_AF_OTH",
+             "ExAC_AF_SAS",
+             "GENE_PHENO",
+             "FILTER",
+             "flanking_bps",
+             "vcf_id",
+             "vcf_qual",
+             "gnomAD_AF",
+             "gnomAD_AFR_AF",
+             "gnomAD_AMR_AF",
+             "gnomAD_ASJ_AF",
+             "gnomAD_EAS_AF",
+             "gnomAD_FIN_AF",
+             "gnomAD_NFE_AF",
+             "gnomAD_OTH_AF",
+             "gnomAD_SAS_AF",
+             "vcf_pos",
+             "vcf_region",
+             # Custom columns
+             "t_total_count",
+             "t_vaf",
+             "t_ref_count_fragment",
+             "t_alt_count_fragment",
+             "t_total_count_fragment",
+             "t_vaf_fragment",
+             "strand_bias_p_value",
+             "strand_bias_odds_ratio",
+             "fragment_strand_bias_p_value",
+             "fragment_strand_bias_odds_ratio",
+             # Strand counts
+             "t_ref_count_forward",
+             "t_ref_count_reverse",
+             "t_alt_count_forward",
+             "t_alt_count_reverse",
+             "t_ref_count_fragment_forward",
+             "t_ref_count_fragment_reverse",
+             "t_alt_count_fragment_forward",
+             "t_alt_count_fragment_reverse",
+         ]
+         self.writer = csv.DictWriter(
+             self.file, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
+         )
+         self.writer.writeheader()
+         self._headers_written = True
+ 
+     def write(self, variant: Variant, counts: Any, sample_name: str = "TUMOR"):
+         if not self._headers_written:
+             self._init_writer()
+ 
+         assert self.writer is not None
+ 
+         # Calculate VAFs
+         total = counts.rd + counts.ad
+         vaf = counts.ad / total if total > 0 else 0.0
+ 
+         total_frag = counts.rdf + counts.adf
+         vaf_frag = counts.adf / total_frag if total_frag > 0 else 0.0
+ 
+         # MAF coordinates (1-based)
+         start_pos = variant.pos + 1
+         end_pos = start_pos
+ 
+         if variant.variant_type == "DELETION":
+             end_pos = start_pos + len(variant.ref) - 1
+         elif variant.variant_type == "INSERTION":
+             # GDC MAF convention for insertions: Start_Position is the base
+             # immediately before the inserted sequence (the anchor) and
+             # End_Position is Start_Position + 1, i.e. the two flanking bases.
+             end_pos = start_pos + 1
+ 
+         # Populate row with defaults for missing fields, starting with metadata
+         row = dict.fromkeys(self.fieldnames, "")
+         if variant.metadata:
+             row.update(variant.metadata)
+ 
+         # Fill known fields
+         row.update(
+             {
+                 "Chromosome": variant.chrom,
+                 "Start_Position": str(start_pos),
+                 "End_Position": str(end_pos),
+                 "Reference_Allele": variant.ref,
+                 "Tumor_Seq_Allele2": variant.alt,
+                 "Tumor_Sample_Barcode": sample_name,
+                 "Variant_Type": variant.variant_type,
+                 "t_ref_count": str(counts.rd),
+                 "t_alt_count": str(counts.ad),
+                 "t_total_count": str(counts.dp),
+                 "t_vaf": f"{vaf:.4f}",
+                 "t_ref_count_fragment": str(counts.rdf),
+                 "t_alt_count_fragment": str(counts.adf),
+                 "t_total_count_fragment": str(counts.dpf),
+                 "t_vaf_fragment": f"{vaf_frag:.4f}",
+                 "strand_bias_p_value": f"{counts.sb_pval:.4e}",
+                 "strand_bias_odds_ratio": f"{counts.sb_or:.4f}",
+                 "fragment_strand_bias_p_value": f"{counts.fsb_pval:.4e}",
+                 "fragment_strand_bias_odds_ratio": f"{counts.fsb_or:.4f}",
+                 "vcf_region": f"{variant.chrom}:{start_pos}-{end_pos}",  # Simple region string
+                 "vcf_pos": str(start_pos),
+                 # Strand counts
+                 "t_ref_count_forward": str(counts.rd_fwd),
+                 "t_ref_count_reverse": str(counts.rd_rev),
+                 "t_alt_count_forward": str(counts.ad_fwd),
+                 "t_alt_count_reverse": str(counts.ad_rev),
+                 "t_ref_count_fragment_forward": str(counts.rdf_fwd),
+                 "t_ref_count_fragment_reverse": str(counts.rdf_rev),
+                 "t_alt_count_fragment_forward": str(counts.adf_fwd),
+                 "t_alt_count_fragment_reverse": str(counts.adf_rev),
+             }
+         )
+ 
+         if variant.original_id:
+             row["vcf_id"] = variant.original_id
+ 
+         self.writer.writerow(row)
+ 
+     def close(self):
+         self.file.close()
+ 
+ 
+ class VcfWriter(OutputWriter):
+     """Writes results to a VCF file."""
+ 
+     def __init__(self, path: Path, sample_name: str = "SAMPLE"):
+         self.path = path
+         self.sample_name = sample_name
+         self.file = open(path, "w")
+         self._headers_written = False
+ 
+     def _write_header(self):
+         # Minimal VCF header
+         headers = [
+             "##fileformat=VCFv4.2",
+             "##source=gbcms_v2",
+             '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
+             '##INFO=<ID=SB_PVAL,Number=1,Type=Float,Description="Fisher strand bias p-value">',
+             '##INFO=<ID=SB_OR,Number=1,Type=Float,Description="Fisher strand bias odds ratio">',
+             '##INFO=<ID=FSB_PVAL,Number=1,Type=Float,Description="Fisher fragment strand bias p-value">',
+             '##INFO=<ID=FSB_OR,Number=1,Type=Float,Description="Fisher fragment strand bias odds ratio">',
+             '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+             '##FORMAT=<ID=AD,Number=2,Type=Integer,Description="Allelic depths for the ref and alt alleles (fwd,rev)">',
+             '##FORMAT=<ID=DP,Number=2,Type=Integer,Description="Approximate read depth (ref_total,alt_total)">',
+             '##FORMAT=<ID=RD,Number=2,Type=Integer,Description="Reference read depth (fwd,rev)">',
+             '##FORMAT=<ID=RDF,Number=2,Type=Integer,Description="Ref Fragment Count (fwd,rev)">',
+             '##FORMAT=<ID=ADF,Number=2,Type=Integer,Description="Alt Fragment Count (fwd,rev)">',
+             '##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Fraction (read level)">',
+             '##FORMAT=<ID=FAF,Number=1,Type=Float,Description="Variant Allele Fraction (fragment level)">',
+             f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{self.sample_name}",
+         ]
+         self.file.write("\n".join(headers) + "\n")
+         self._headers_written = True
+ 
+     def write(self, variant: Variant, counts: Any, sample_name: str = "SAMPLE"):
+         if not self._headers_written:
+             self._write_header()
+ 
+         # VCF POS is 1-based
+         pos = variant.pos + 1
+ 
+         # INFO fields
+         info = f"DP={counts.dp};SB_PVAL={counts.sb_pval:.4e};SB_OR={counts.sb_or:.4f};FSB_PVAL={counts.fsb_pval:.4e};FSB_OR={counts.fsb_or:.4f}"
+ 
+         # FORMAT fields
+         # GT: report 0/1 when any alt-supporting reads are observed, otherwise 0/0.
+         gt = "0/1" if counts.ad > 0 else "0/0"
+ 
+         # DP: ref_total,alt_total
+         dp = f"{counts.rd},{counts.ad}"
+ 
+         # RD: ref_fwd,ref_rev
+         rd = f"{counts.rd_fwd},{counts.rd_rev}"
+ 
+         # AD: alt_fwd,alt_rev
+         ad = f"{counts.ad_fwd},{counts.ad_rev}"
+ 
+         # RDF: ref_frag_fwd,ref_frag_rev
+         rdf = f"{counts.rdf_fwd},{counts.rdf_rev}"
+ 
+         # ADF: alt_frag_fwd,alt_frag_rev
+         adf = f"{counts.adf_fwd},{counts.adf_rev}"
+ 
+         # VAF calculations
+         total_reads = counts.rd + counts.ad
+         vaf = counts.ad / total_reads if total_reads > 0 else 0.0
+ 
+         total_frags = counts.rdf + counts.adf
+         faf = counts.adf / total_frags if total_frags > 0 else 0.0
+ 
+         format_str = "GT:DP:RD:AD:RDF:ADF:VAF:FAF"
+         sample_data = f"{gt}:{dp}:{rd}:{ad}:{rdf}:{adf}:{vaf:.4f}:{faf:.4f}"
+ 
+         row = [
+             variant.chrom,
+             str(pos),
+             variant.original_id or ".",
+             variant.ref,
+             variant.alt,
+             ".",  # QUAL
+             ".",  # FILTER
+             info,
+             format_str,
+             sample_data,
+         ]
+ 
+         self.file.write("\t".join(row) + "\n")
+ 
+     def close(self):
+         self.file.close()
+ 
+ 
+ # Note: VcfWriter builds a minimal VCF header from scratch rather than copying one
+ # from a template VCF. A template- or pysam-based header could be added later if
+ # richer metadata is needed; for downstream analysis the MAF/table output is often preferred.
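For orientation, here is a minimal usage sketch of the two writers added above; it is not part of the package. The counts object normally comes from the Rust-side BaseCounts, so a SimpleNamespace stands in for it here, carrying only the attributes the writers actually read (rd, ad, dp, rdf, adf, dpf, the *_fwd/*_rev strand counts, and the strand-bias statistics); all values and file names are illustrative.

    from pathlib import Path
    from types import SimpleNamespace

    from gbcms.io.output import MafWriter, VcfWriter
    from gbcms.models.core import Variant, VariantType

    # Hypothetical stand-in for the Rust BaseCounts object; only the attributes
    # accessed by the writers are supplied.
    counts = SimpleNamespace(
        rd=48, ad=12, dp=60, rdf=30, adf=8, dpf=38,
        rd_fwd=25, rd_rev=23, ad_fwd=7, ad_rev=5,
        rdf_fwd=16, rdf_rev=14, adf_fwd=5, adf_rev=3,
        sb_pval=0.62, sb_or=1.3, fsb_pval=0.55, fsb_or=1.1,
    )

    # Variant positions are 0-based internally (see gbcms/models/core.py below).
    variant = Variant(
        chrom="chr17", pos=7577120, ref="C", alt="T",
        variant_type=VariantType.SNP,
    )

    maf = MafWriter(Path("fillout.maf"))
    maf.write(variant, counts, sample_name="TUMOR_A")
    maf.close()

    vcf = VcfWriter(Path("fillout.vcf"), sample_name="TUMOR_A")
    vcf.write(variant, counts)
    vcf.close()

Note that both writers emit their headers lazily on the first write() call, so MafWriter's column order is fixed by self.fieldnames at that point.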
gbcms/models/core.py ADDED
@@ -0,0 +1,133 @@
+ """
+ Core data models for gbcms v2.
+ """
+ 
+ from enum import Enum
+ from pathlib import Path
+ 
+ from pydantic import BaseModel, Field, field_validator, model_validator
+ 
+ 
+ class VariantType(str, Enum):
+     """Type of genomic variant."""
+ 
+     SNP = "SNP"
+     INSERTION = "INSERTION"
+     DELETION = "DELETION"
+     COMPLEX = "COMPLEX"
+ 
+ 
+ class GenomicInterval(BaseModel):
+     """
+     Represents a 0-based, half-open genomic interval [start, end).
+ 
+     This is the canonical internal representation for all coordinates.
+     """
+ 
+     chrom: str
+     start: int = Field(ge=0, description="0-based start position (inclusive)")
+     end: int = Field(ge=0, description="0-based end position (exclusive)")
+ 
+     @model_validator(mode="after")
+     def validate_interval(self) -> "GenomicInterval":
+         if self.end < self.start:
+             raise ValueError(f"End position ({self.end}) must be >= start position ({self.start})")
+         return self
+ 
+ 
+ class Variant(BaseModel):
+     """
+     Normalized representation of a genomic variant.
+     """
+ 
+     chrom: str
+     pos: int = Field(ge=0, description="0-based position of the variant")
+     ref: str
+     alt: str
+     variant_type: VariantType
+ 
+     # Original input metadata (optional)
+     original_id: str | None = None
+     metadata: dict[str, str] = Field(
+         default_factory=dict, description="Original input metadata/columns"
+     )
+ 
+     @property
+     def interval(self) -> GenomicInterval:
+         """
+         Get the genomic interval covered by this variant.
+ 
+         For SNP: [pos, pos+1)
+         For Deletion: [pos, pos+len(ref)) (ref includes anchor? depends on normalization)
+         For Insertion: [pos, pos+1) (anchor base)
+         """
+         # Note: This logic depends on strict normalization rules which we will implement in the kernel.
+         # For now, a simple approximation based on ref length.
+         return GenomicInterval(chrom=self.chrom, start=self.pos, end=self.pos + len(self.ref))
+ 
+ 
+ class InputFormat(str, Enum):
+     VCF = "vcf"
+     MAF = "maf"
+ 
+ 
+ class OutputFormat(str, Enum):
+     VCF = "vcf"
+     MAF = "maf"
+ 
+ 
+ class GbcmsConfig(BaseModel):
+     """
+     Global configuration for gbcms execution.
+     """
+ 
+     # Input
+     variant_file: Path
+     bam_files: dict[str, Path]  # sample_name -> bam_path
+     reference_fasta: Path
+ 
+     # Output
+     output_dir: Path
+     output_format: OutputFormat = OutputFormat.VCF
+     output_suffix: str = ""
+ 
+     # Filters
+     min_mapping_quality: int = Field(default=20, ge=0)
+     min_base_quality: int = Field(default=0, ge=0)
+     filter_duplicates: bool = True
+     filter_secondary: bool = False
+     filter_supplementary: bool = False
+     filter_qc_failed: bool = False
+     filter_improper_pair: bool = False
+     filter_indel: bool = False
+ 
+     # Performance
+     threads: int = Field(default=1, ge=1)
+ 
+     # Advanced
+     fragment_counting: bool = False
+ 
+     @field_validator("variant_file", "reference_fasta")
+     @classmethod
+     def validate_file_exists(cls, v: Path) -> Path:
+         if not v.exists():
+             raise ValueError(f"File not found: {v}")
+         return v
+ 
+     @field_validator("output_dir")
+     @classmethod
+     def validate_output_dir(cls, v: Path) -> Path:
+         # Do not create the directory here; the pipeline is responsible for that.
+         # Validation only rejects a path that points at an existing file.
+         if v.is_file():
+             raise ValueError(f"Output path must be a directory, not a file: {v}")
+         return v
+ 
+     @model_validator(mode="after")
+     def validate_bams(self) -> "GbcmsConfig":
+         for name, path in self.bam_files.items():
+             if not path.exists():
+                 raise ValueError(f"BAM file for sample '{name}' not found: {path}")
+         return self
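
A short sketch of the coordinate convention these models encode (illustrative, not part of the package): Variant.pos is stored 0-based, and the interval property exposes the 0-based, half-open span covered by the reference allele, which the writers above convert back to 1-based output coordinates.

    from gbcms.models.core import GenomicInterval, Variant, VariantType

    # A variant whose reference allele starts at 0-based position 100 and spans 3 bases.
    deletion = Variant(
        chrom="chr1", pos=100, ref="ATG", alt="A",
        variant_type=VariantType.DELETION,
    )

    iv = deletion.interval
    assert isinstance(iv, GenomicInterval)
    assert (iv.start, iv.end) == (100, 103)  # 0-based, half-open: covers len(ref) bases

    # The writers convert back to 1-based coordinates, e.g. VCF POS = pos + 1 = 101.
    print(iv.chrom, iv.start, iv.end)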