py-gbcms 2.2.0__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/io/output.py ADDED
@@ -0,0 +1,354 @@
1
+ """
2
+ Output Writers: Formatting results for VCF and MAF.
3
+
4
+ This module provides classes to write processed variants and their counts
5
+ to output files, handling format-specific columns and headers.
6
+ """
7
+
8
+ import csv
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from ..models.core import Variant
13
+
14
+ __all__ = ["OutputWriter", "MafWriter", "VcfWriter"]
15
+
16
+
17
class OutputWriter:
    """Base class for output writers.

    Subclasses override :meth:`write` to emit one record per variant and
    :meth:`close` to release whatever resource they hold.

    Instances are context managers: leaving the ``with`` block calls
    :meth:`close`, including when the body raised, so a writer's underlying
    resource is not leaked on an error path.
    """

    def write(self, variant: "Variant", counts: Any):
        """Write one variant together with its counts.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError

    def close(self):
        """Release resources held by the writer (a no-op in the base class)."""

    def __enter__(self):
        """Return the writer itself so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the writer; return False so any exception keeps propagating."""
        self.close()
        return False
25
+
26
+
27
class MafWriter(OutputWriter):
    """Writes results to a tab-delimited, MAF-like file (Fillout format).

    The header row is emitted lazily on the first :meth:`write`, or on
    :meth:`close` if nothing was written, so the output always carries a
    header.
    """

    def __init__(self, path: Path):
        """Open ``path`` for writing, truncating any existing file.

        Args:
            path: Destination of the MAF file.
        """
        self.path = path
        # The csv module requires newline="" so that it alone controls line
        # endings; without it Windows output gets "\r\r\n". The encoding is
        # pinned so output does not depend on the process locale.
        self.file = open(path, "w", newline="", encoding="utf-8")
        self.writer: csv.DictWriter | None = None
        self._headers_written = False

    def _init_writer(self):
        """Create the ``csv.DictWriter``, write the header row, and return the writer."""
        # Standard GDC MAF columns (plus our custom ones)
        # Based on GDC MAF Format v1.0.0
        self.fieldnames = [
            "Hugo_Symbol",
            "Entrez_Gene_Id",
            "Center",
            "NCBI_Build",
            "Chromosome",
            "Start_Position",
            "End_Position",
            "Strand",
            "Variant_Classification",
            "Variant_Type",
            "Reference_Allele",
            "Tumor_Seq_Allele1",
            "Tumor_Seq_Allele2",
            "dbSNP_RS",
            "dbSNP_Val_Status",
            "Tumor_Sample_Barcode",
            "Matched_Norm_Sample_Barcode",
            "Match_Norm_Seq_Allele1",
            "Match_Norm_Seq_Allele2",
            "Tumor_Validation_Allele1",
            "Tumor_Validation_Allele2",
            "Match_Norm_Validation_Allele1",
            "Match_Norm_Validation_Allele2",
            "Verification_Status",
            "Validation_Status",
            "Mutation_Status",
            "Sequencing_Phase",
            "Sequence_Source",
            "Validation_Method",
            "Score",
            "BAM_File",
            "Sequencer",
            "Tumor_Sample_UUID",
            "Matched_Norm_Sample_UUID",
            "HGVSc",
            "HGVSp",
            "HGVSp_Short",
            "Transcript_ID",
            "Exon_Number",
            "t_depth",
            "t_ref_count",
            "t_alt_count",
            "n_depth",
            "n_ref_count",
            "n_alt_count",
            "all_effects",
            "Allele",
            "Gene",
            "Feature",
            "Feature_type",
            "Consequence",
            "cDNA_position",
            "CDS_position",
            "Protein_position",
            "Amino_acids",
            "Codons",
            "Existing_variation",
            "DISTANCE",
            "STRAND",
            "FLAGS",
            "SYMBOL",
            "SYMBOL_SOURCE",
            "HGNC_ID",
            "BIOTYPE",
            "CANONICAL",
            "CCDS",
            "ENSP",
            "SWISSPROT",
            "TREMBL",
            "UNIPARC",
            "RefSeq",
            "SIFT",
            "PolyPhen",
            "EXON",
            "INTRON",
            "DOMAINS",
            "GMAF",
            "AFR_MAF",
            "AMR_MAF",
            "ASN_MAF",
            "EUR_MAF",
            "AA_MAF",
            "EA_MAF",
            "CLIN_SIG",
            "SOMATIC",
            "PUBMED",
            "MOTIF_NAME",
            "MOTIF_POS",
            "HIGH_INF_POS",
            "MOTIF_SCORE_CHANGE",
            "IMPACT",
            "PICK",
            "VARIANT_CLASS",
            "TSL",
            "HGVS_OFFSET",
            "PHENO",
            "MINIMISED",
            "ExAC_AF",
            "ExAC_AF_AFR",
            "ExAC_AF_AMR",
            "ExAC_AF_EAS",
            "ExAC_AF_FIN",
            "ExAC_AF_NFE",
            "ExAC_AF_OTH",
            "ExAC_AF_SAS",
            "GENE_PHENO",
            "FILTER",
            "flanking_bps",
            "vcf_id",
            "vcf_qual",
            "gnomAD_AF",
            "gnomAD_AFR_AF",
            "gnomAD_AMR_AF",
            "gnomAD_ASJ_AF",
            "gnomAD_EAS_AF",
            "gnomAD_FIN_AF",
            "gnomAD_NFE_AF",
            "gnomAD_OTH_AF",
            "gnomAD_SAS_AF",
            "vcf_pos",
            "vcf_region",
            # Custom columns
            "t_total_count",
            "t_vaf",
            "t_ref_count_fragment",
            "t_alt_count_fragment",
            "t_total_count_fragment",
            "t_vaf_fragment",
            "strand_bias_p_value",
            "strand_bias_odds_ratio",
            "fragment_strand_bias_p_value",
            "fragment_strand_bias_odds_ratio",
            # Strand counts
            "t_ref_count_forward",
            "t_ref_count_reverse",
            "t_alt_count_forward",
            "t_alt_count_reverse",
            "t_ref_count_fragment_forward",
            "t_ref_count_fragment_reverse",
            "t_alt_count_fragment_forward",
            "t_alt_count_fragment_reverse",
        ]
        # extrasaction="ignore": metadata keys that are not MAF columns are
        # silently dropped instead of raising ValueError.
        writer = csv.DictWriter(
            self.file, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore"
        )
        writer.writeheader()
        self.writer = writer
        self._headers_written = True
        return writer

    def write(self, variant: Variant, counts: Any, sample_name: str = "TUMOR"):
        """Append one row for ``variant`` with the supplied counts.

        Args:
            variant: Variant to report; ``variant.pos`` is 0-based.
            counts: Object exposing the count attributes read below (``rd``,
                ``ad``, ``dp``, ``rdf``, ``adf``, ``dpf``, the ``*_fwd`` /
                ``*_rev`` strand splits, and the ``sb_*`` / ``fsb_*`` strand
                bias statistics).
            sample_name: Value written to ``Tumor_Sample_Barcode``.
        """
        writer = self.writer
        if writer is None or not self._headers_written:
            writer = self._init_writer()

        # Calculate VAFs; a zero denominator yields 0.0 rather than an error.
        total = counts.rd + counts.ad
        vaf = counts.ad / total if total > 0 else 0.0

        total_frag = counts.rdf + counts.adf
        vaf_frag = counts.adf / total_frag if total_frag > 0 else 0.0

        # MAF Coordinates (1-based)
        start_pos = variant.pos + 1
        end_pos = start_pos

        if variant.variant_type == "DELETION":
            # Inclusive end of a span covering len(ref) bases.
            end_pos = start_pos + len(variant.ref) - 1
        elif variant.variant_type == "INSERTION":
            # Reported as the two adjacent bases Start and Start + 1.
            # NOTE(review): this matches the GDC flanking-base convention only
            # if variant.pos is the anchor base preceding the inserted
            # sequence - confirm against the variant normalizer.
            end_pos = start_pos + 1

        # Populate row with defaults for missing fields, starting with metadata
        row = dict.fromkeys(self.fieldnames, "")
        if variant.metadata:
            row.update(variant.metadata)

        # Fill known fields; these override any same-named metadata column.
        row.update(
            {
                "Chromosome": variant.chrom,
                "Start_Position": str(start_pos),
                "End_Position": str(end_pos),
                "Reference_Allele": variant.ref,
                "Tumor_Seq_Allele2": variant.alt,
                "Tumor_Sample_Barcode": sample_name,
                "Variant_Type": variant.variant_type,
                "t_ref_count": str(counts.rd),
                "t_alt_count": str(counts.ad),
                # Taken from counts.dp, which is not necessarily rd + ad (the
                # VAF denominator above).
                "t_total_count": str(counts.dp),
                "t_vaf": f"{vaf:.4f}",
                "t_ref_count_fragment": str(counts.rdf),
                "t_alt_count_fragment": str(counts.adf),
                "t_total_count_fragment": str(counts.dpf),
                "t_vaf_fragment": f"{vaf_frag:.4f}",
                "strand_bias_p_value": f"{counts.sb_pval:.4e}",
                "strand_bias_odds_ratio": f"{counts.sb_or:.4f}",
                "fragment_strand_bias_p_value": f"{counts.fsb_pval:.4e}",
                "fragment_strand_bias_odds_ratio": f"{counts.fsb_or:.4f}",
                "vcf_region": f"{variant.chrom}:{start_pos}-{end_pos}",  # Simple region string
                "vcf_pos": str(start_pos),
                # Strand counts
                "t_ref_count_forward": str(counts.rd_fwd),
                "t_ref_count_reverse": str(counts.rd_rev),
                "t_alt_count_forward": str(counts.ad_fwd),
                "t_alt_count_reverse": str(counts.ad_rev),
                "t_ref_count_fragment_forward": str(counts.rdf_fwd),
                "t_ref_count_fragment_reverse": str(counts.rdf_rev),
                "t_alt_count_fragment_forward": str(counts.adf_fwd),
                "t_alt_count_fragment_reverse": str(counts.adf_rev),
            }
        )

        if variant.original_id:
            row["vcf_id"] = variant.original_id

        writer.writerow(row)

    def close(self):
        """Close the output file, writing the header first if no row was written.

        Safe to call more than once.
        """
        if self.file.closed:
            return
        try:
            # A run with zero variants still produces a header-only MAF
            # rather than a zero-byte file.
            if not self._headers_written:
                self._init_writer()
        finally:
            self.file.close()
264
+
265
+
266
class VcfWriter(OutputWriter):
    """Writes results to a single-sample VCF file.

    The header is emitted lazily on the first :meth:`write`, or on
    :meth:`close` if nothing was written, so the output always starts with
    the ``##fileformat`` line.
    """

    def __init__(self, path: Path, sample_name: str = "SAMPLE"):
        """Open ``path`` for writing, truncating any existing file.

        Args:
            path: Destination of the VCF file.
            sample_name: Name of the sample column in the ``#CHROM`` line.
        """
        self.path = path
        self.sample_name = sample_name
        # Pin the encoding so output does not depend on the process locale.
        self.file = open(path, "w", encoding="utf-8")
        self._headers_written = False

    def _write_header(self):
        """Write the meta-information lines and the ``#CHROM`` column line."""
        # Minimal VCF header
        headers = [
            "##fileformat=VCFv4.2",
            "##source=gbcms_v2",
            '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
            '##INFO=<ID=SB_PVAL,Number=1,Type=Float,Description="Fisher strand bias p-value">',
            '##INFO=<ID=SB_OR,Number=1,Type=Float,Description="Fisher strand bias odds ratio">',
            '##INFO=<ID=FSB_PVAL,Number=1,Type=Float,Description="Fisher fragment strand bias p-value">',
            '##INFO=<ID=FSB_OR,Number=1,Type=Float,Description="Fisher fragment strand bias odds ratio">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
            # AD carries the alt allele's forward/reverse counts only (see
            # write()); the description states that explicitly.
            '##FORMAT=<ID=AD,Number=2,Type=Integer,Description="Alternate read depth (fwd,rev)">',
            '##FORMAT=<ID=DP,Number=2,Type=Integer,Description="Approximate read depth (ref_total,alt_total)">',
            '##FORMAT=<ID=RD,Number=2,Type=Integer,Description="Reference read depth (fwd,rev)">',
            '##FORMAT=<ID=RDF,Number=2,Type=Integer,Description="Ref Fragment Count (fwd,rev)">',
            '##FORMAT=<ID=ADF,Number=2,Type=Integer,Description="Alt Fragment Count (fwd,rev)">',
            '##FORMAT=<ID=VAF,Number=1,Type=Float,Description="Variant Allele Fraction (read level)">',
            '##FORMAT=<ID=FAF,Number=1,Type=Float,Description="Variant Allele Fraction (fragment level)">',
            f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{self.sample_name}",
        ]
        self.file.write("\n".join(headers) + "\n")
        self._headers_written = True

    def write(self, variant: Variant, counts: Any, sample_name: str = "SAMPLE"):
        """Append one VCF record for ``variant`` with the supplied counts.

        Args:
            variant: Variant to report; ``variant.pos`` is 0-based.
            counts: Object exposing the count attributes read below (``dp``,
                ``rd``, ``ad``, ``rdf``, ``adf``, the ``*_fwd`` / ``*_rev``
                strand splits, and the ``sb_*`` / ``fsb_*`` statistics).
            sample_name: Not used by this method; the sample column name is
                the one given to the constructor.
        """
        if not self._headers_written:
            self._write_header()

        # VCF POS is 1-based
        pos = variant.pos + 1

        # INFO fields
        info = f"DP={counts.dp};SB_PVAL={counts.sb_pval:.4e};SB_OR={counts.sb_or:.4f};FSB_PVAL={counts.fsb_pval:.4e};FSB_OR={counts.fsb_or:.4f}"

        # FORMAT fields
        # GT: 0/1 when any alt-supporting read was counted, otherwise 0/0.
        gt = "0/1" if counts.ad > 0 else "0/0"

        # DP: ref_total,alt_total
        dp = f"{counts.rd},{counts.ad}"

        # RD: ref_fwd,ref_rev
        rd = f"{counts.rd_fwd},{counts.rd_rev}"

        # AD: alt_fwd,alt_rev
        ad = f"{counts.ad_fwd},{counts.ad_rev}"

        # RDF: ref_frag_fwd,ref_frag_rev
        rdf = f"{counts.rdf_fwd},{counts.rdf_rev}"

        # ADF: alt_frag_fwd,alt_frag_rev
        adf = f"{counts.adf_fwd},{counts.adf_rev}"

        # VAF calculations; a zero denominator yields 0.0 rather than an error.
        total_reads = counts.rd + counts.ad
        vaf = counts.ad / total_reads if total_reads > 0 else 0.0

        total_frags = counts.rdf + counts.adf
        faf = counts.adf / total_frags if total_frags > 0 else 0.0

        format_str = "GT:DP:RD:AD:RDF:ADF:VAF:FAF"
        sample_data = f"{gt}:{dp}:{rd}:{ad}:{rdf}:{adf}:{vaf:.4f}:{faf:.4f}"

        row = [
            variant.chrom,
            str(pos),
            variant.original_id or ".",
            variant.ref,
            variant.alt,
            ".",  # QUAL
            ".",  # FILTER
            info,
            format_str,
            sample_data,
        ]

        self.file.write("\t".join(row) + "\n")

    def close(self):
        """Close the output file, writing the header first if no record was written.

        Safe to call more than once.
        """
        if self.file.closed:
            return
        try:
            # A run with zero variants still yields a VCF with its header
            # rather than an empty file.
            if not self._headers_written:
                self._write_header()
        finally:
            self.file.close()
@@ -0,0 +1,27 @@
1
+ """
2
+ Data models for gbcms.
3
+
4
+ Provides Pydantic models for variants, configuration, and core data structures.
5
+ """
6
+
7
+ from .core import (
8
+ GbcmsConfig,
9
+ GenomicInterval,
10
+ OutputConfig,
11
+ OutputFormat,
12
+ QualityThresholds,
13
+ ReadFilters,
14
+ Variant,
15
+ VariantType,
16
+ )
17
+
18
+ __all__ = [
19
+ "GbcmsConfig",
20
+ "GenomicInterval",
21
+ "OutputConfig",
22
+ "OutputFormat",
23
+ "QualityThresholds",
24
+ "ReadFilters",
25
+ "Variant",
26
+ "VariantType",
27
+ ]
gbcms/models/core.py ADDED
@@ -0,0 +1,172 @@
1
+ """
2
+ Core data models for gbcms v2.
3
+
4
+ This module defines the data models for variants, configuration, and nested
5
+ config groups (filters, quality thresholds, output settings).
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+
11
if sys.version_info >= (3, 11):
    from enum import StrEnum
else:
    from enum import Enum

    class StrEnum(str, Enum):
        """Backport of ``enum.StrEnum`` for Python 3.10."""

        def __str__(self) -> str:
            # A plain ``(str, Enum)`` mix-in renders as "Class.MEMBER" under
            # str(); enum.StrEnum renders as the member's value. Delegate to
            # str so both interpreter branches behave the same.
            return str.__str__(self)
20
+
21
+
22
+ from pydantic import BaseModel, Field, field_validator, model_validator
23
+
24
+ __all__ = [
25
+ "VariantType",
26
+ "GenomicInterval",
27
+ "Variant",
28
+ "OutputFormat",
29
+ "ReadFilters",
30
+ "QualityThresholds",
31
+ "OutputConfig",
32
+ "GbcmsConfig",
33
+ ]
34
+
35
+
36
+ class VariantType(StrEnum):
37
+ """Type of genomic variant."""
38
+
39
+ SNP = "SNP"
40
+ INSERTION = "INSERTION"
41
+ DELETION = "DELETION"
42
+ COMPLEX = "COMPLEX"
43
+
44
+
45
class GenomicInterval(BaseModel):
    """
    A 0-based, half-open genomic interval ``[start, end)``.

    This is the canonical internal representation for all coordinates.
    """

    chrom: str
    start: int = Field(ge=0, description="0-based start position (inclusive)")
    end: int = Field(ge=0, description="0-based end position (exclusive)")

    @model_validator(mode="after")
    def validate_interval(self) -> "GenomicInterval":
        """Accept the interval only when ``start <= end`` (empty intervals are allowed)."""
        if self.start <= self.end:
            return self
        raise ValueError(f"End position ({self.end}) must be >= start position ({self.start})")
61
+
62
+
63
class Variant(BaseModel):
    """A genomic variant in normalized form, with optional input metadata."""

    chrom: str
    pos: int = Field(ge=0, description="0-based position of the variant")
    ref: str
    alt: str
    variant_type: VariantType

    # Optional details carried over from the original input record.
    original_id: str | None = None
    metadata: dict[str, str] = Field(
        default_factory=dict, description="Original input metadata/columns"
    )

    @property
    def interval(self) -> GenomicInterval:
        """Interval starting at ``pos`` and spanning ``len(ref)`` bases."""
        ref_span = len(self.ref)
        return GenomicInterval(chrom=self.chrom, start=self.pos, end=self.pos + ref_span)
82
+
83
+
84
+ class OutputFormat(StrEnum):
85
+ """Supported output formats for gbcms."""
86
+
87
+ VCF = "vcf"
88
+ MAF = "maf"
89
+
90
+
91
+ # =============================================================================
92
+ # Nested Configuration Models
93
+ # =============================================================================
94
+
95
+
96
class ReadFilters(BaseModel):
    """
    Switches selecting which reads are excluded during BAM processing.

    Each flag set to True filters out reads of the corresponding kind.
    Only duplicate filtering is enabled by default.
    """

    # Enabled by default.
    duplicates: bool = Field(default=True, description="Filter duplicate reads")

    # Disabled by default.
    secondary: bool = Field(default=False, description="Filter secondary alignments")
    supplementary: bool = Field(default=False, description="Filter supplementary alignments")
    qc_failed: bool = Field(default=False, description="Filter reads failing QC")
    improper_pair: bool = Field(default=False, description="Filter improperly paired reads")
    indel: bool = Field(default=False, description="Filter reads containing indels")
110
+
111
+
112
class QualityThresholds(BaseModel):
    """Minimum quality scores applied when filtering reads and bases.

    Both thresholds must be non-negative.
    """

    # Defaults: MAPQ >= 20, no base-quality floor.
    min_mapping_quality: int = Field(default=20, ge=0, description="Minimum mapping quality (MAPQ)")
    min_base_quality: int = Field(default=0, ge=0, description="Minimum base quality (BQ)")
117
+
118
+
119
class OutputConfig(BaseModel):
    """Settings controlling where and how output files are written."""

    directory: Path = Field(description="Directory to write output files")
    format: OutputFormat = Field(default=OutputFormat.VCF, description="Output format (vcf or maf)")
    suffix: str = Field(default="", description="Suffix to append to output filename")

    @field_validator("directory")
    @classmethod
    def validate_output_dir(cls, v: Path) -> Path:
        """Reject a path that already exists as a regular file.

        A path that does not exist yet is accepted unchanged.
        """
        points_at_file = v.exists() and v.is_file()
        if not points_at_file:
            return v
        raise ValueError(f"Output path must be a directory, not a file: {v}")
133
+
134
+
135
class GbcmsConfig(BaseModel):
    """
    Top-level configuration for a gbcms run.

    Related settings live in nested models (``output``, ``filters``,
    ``quality``); input paths are checked for existence at validation time.
    """

    # Input files
    variant_file: Path
    bam_files: dict[str, Path]  # sample_name -> bam_path
    reference_fasta: Path

    # Nested configuration groups
    output: OutputConfig
    filters: ReadFilters = Field(default_factory=ReadFilters)
    quality: QualityThresholds = Field(default_factory=QualityThresholds)

    # Performance
    threads: int = Field(default=1, ge=1, description="Number of threads")

    # Advanced
    fragment_counting: bool = Field(default=False, description="Enable fragment-based counting")

    @field_validator("variant_file", "reference_fasta")
    @classmethod
    def validate_file_exists(cls, v: Path) -> Path:
        """Return the path unchanged if it exists; raise ``ValueError`` otherwise."""
        if v.exists():
            return v
        raise ValueError(f"File not found: {v}")

    @model_validator(mode="after")
    def validate_bams(self) -> "GbcmsConfig":
        """Raise ``ValueError`` for the first BAM path, in mapping order, that does not exist."""
        first_missing = next(
            ((name, path) for name, path in self.bam_files.items() if not path.exists()),
            None,
        )
        if first_missing is not None:
            name, path = first_missing
            raise ValueError(f"BAM file for sample '{name}' not found: {path}")
        return self