py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/variant.py DELETED
@@ -1,390 +0,0 @@
1
- """Variant loading and representation."""
2
-
3
- import logging
4
- from dataclasses import dataclass, field
5
- from typing import Optional
6
-
7
- import numpy as np
8
-
9
- from .config import CountType
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
- # Try to import cyvcf2 for fast VCF parsing
14
- try:
15
- from cyvcf2 import VCF
16
-
17
- HAS_CYVCF2 = True
18
- logger.debug("cyvcf2 available - using fast VCF parsing")
19
- except ImportError:
20
- HAS_CYVCF2 = False
21
- logger.debug("cyvcf2 not available - using pure Python VCF parsing")
22
-
23
-
24
@dataclass
class VariantEntry:
    """A single variant plus the per-sample counts collected for it.

    ``pos`` and ``end_pos`` are 0-indexed. Instances order by chromosome
    (numeric contigs first, then X, Y, M/MT, then everything else by name)
    and then by ``pos``, so a list of entries can be passed to ``sorted()``.
    """

    chrom: str
    pos: int  # 0-indexed
    end_pos: int
    ref: str
    alt: str
    # Variant-type flags; the loaders derive them from the ref/alt lengths.
    snp: bool = False
    dnp: bool = False
    dnp_len: int = 0
    insertion: bool = False
    deletion: bool = False
    # Annotation columns populated when the variant comes from a MAF file.
    tumor_sample: str = ""
    normal_sample: str = ""
    gene: str = ""
    effect: str = ""
    t_ref_count: int = 0
    t_alt_count: int = 0
    n_ref_count: int = 0
    n_alt_count: int = 0
    # Coordinates/alleles as given by the MAF record, kept alongside the
    # (possibly re-anchored) pos/ref/alt above.
    maf_pos: int = 0
    maf_end_pos: int = 0
    maf_ref: str = ""
    maf_alt: str = ""
    caller: str = ""
    # sample name -> float32 array with one slot per CountType member.
    base_count: dict[str, np.ndarray] = field(default_factory=dict)
    duplicate_variant_ptr: Optional["VariantEntry"] = None
    maf_line: str = ""  # Store original MAF line for output

    def get_variant_key(self) -> tuple[str, int, str, str]:
        """Return the ``(chrom, pos, ref, alt)`` tuple identifying this variant."""
        return (self.chrom, self.pos, self.ref, self.alt)

    def initialize_counts(self, sample_names: list[str]) -> None:
        """Create a zeroed count array for every sample that has none yet.

        Samples already present in ``base_count`` are left untouched.
        """
        for sample in sample_names:
            if sample not in self.base_count:
                self.base_count[sample] = np.zeros(len(CountType), dtype=np.float32)

    def get_count(self, sample: str, count_type: "CountType") -> float:
        """Return the count stored for ``sample`` at index ``count_type``.

        Returns ``0.0`` when no counts have been recorded for ``sample``.
        """
        counts = self.base_count.get(sample)
        if counts is None:
            return 0.0
        return float(counts[count_type])

    def __lt__(self, other: "VariantEntry") -> bool:
        """Order by chromosome sort key first, then by position.

        Comparing ``(key, pos)`` tuples keeps the ordering consistent even
        when two differently spelled names (e.g. ``"chr1"`` and ``"1"``)
        share a sort key; in that case the position decides.
        """
        mine = (self._chrom_sort_key(), self.pos)
        theirs = (other._chrom_sort_key(), other.pos)
        return mine < theirs

    def _chrom_sort_key(self) -> tuple:
        """Build the chromosome sort key.

        Returns:
            ``(0, n)`` for numeric contigs, ``(1, rank)`` for X, Y and M/MT,
            and ``(2, name)`` for anything else. A leading ``"chr"`` is
            ignored; only the prefix is removed, not later occurrences.
        """
        name = self.chrom.removeprefix("chr")
        try:
            return (0, int(name))
        except ValueError:
            pass

        special_rank = {"X": 0, "Y": 1, "M": 2, "MT": 2}
        if name in special_rank:
            return (1, special_rank[name])
        return (2, name)
91
-
92
-
93
class VariantLoader:
    """Load variants from VCF or MAF files as ``VariantEntry`` objects.

    Input positions are 1-based; every position stored on the returned
    entries has been converted to 0-indexed.
    """

    # Columns that must appear in the MAF header for load_maf to proceed.
    _REQUIRED_MAF_COLUMNS = (
        "Hugo_Symbol",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Reference_Allele",
        "Tumor_Seq_Allele1",
        "Tumor_Seq_Allele2",
        "Tumor_Sample_Barcode",
        "Matched_Norm_Sample_Barcode",
        "t_ref_count",
        "t_alt_count",
        "n_ref_count",
        "n_alt_count",
        "Variant_Classification",
    )

    def __init__(self, reference_getter=None):
        """
        Initialize variant loader.

        Args:
            reference_getter: Callable that takes (chrom, pos) and returns base.
                Required to convert MAF indels; without it they are skipped.
        """
        self.reference_getter = reference_getter

    @staticmethod
    def _variant_type_flags(ref: str, alt: str) -> dict:
        """Return the snp/dnp/dnp_len/insertion/deletion kwargs for an allele pair."""
        same_length = len(alt) == len(ref)
        dnp = same_length and len(ref) > 1
        return {
            "snp": same_length and len(ref) == 1,
            "dnp": dnp,
            "dnp_len": len(ref) if dnp else 0,
            "insertion": len(alt) > len(ref),
            "deletion": len(alt) < len(ref),
        }

    def load_vcf(self, vcf_file: str) -> list[VariantEntry]:
        """
        Load variants from VCF file.

        Uses cyvcf2 for parsing if available, otherwise falls back to pure Python.

        Args:
            vcf_file: Path to VCF file (can be .vcf or .vcf.gz)

        Returns:
            List of VariantEntry objects
        """
        if HAS_CYVCF2:
            return self._load_vcf_cyvcf2(vcf_file)
        return self._load_vcf_python(vcf_file)

    def _load_vcf_cyvcf2(self, vcf_file: str) -> list[VariantEntry]:
        """
        Load variants from VCF using cyvcf2.

        Only the first ALT allele of each record is used. If cyvcf2 raises
        at any point, the error is logged and the whole file is re-read with
        the pure Python parser.
        """
        logger.info(f"Loading variants file with cyvcf2: {vcf_file}")
        variants = []

        try:
            vcf = VCF(vcf_file)
            try:
                for record in vcf:
                    pos = record.POS - 1  # Convert to 0-indexed
                    ref = record.REF
                    # Handle multiple alts - take first one
                    alt = record.ALT[0] if record.ALT else ""
                    variants.append(
                        VariantEntry(
                            chrom=record.CHROM,
                            pos=pos,
                            end_pos=pos + len(ref) - 1,
                            ref=ref,
                            alt=alt,
                            **self._variant_type_flags(ref, alt),
                        )
                    )
            finally:
                # Release the handle even when iteration fails part-way.
                vcf.close()
        except Exception as e:
            logger.error(f"Error loading VCF with cyvcf2: {e}")
            logger.info("Falling back to pure Python VCF parser")
            return self._load_vcf_python(vcf_file)

        logger.info(f"{len(variants)} variants loaded from file: {vcf_file}")
        return variants

    def _load_vcf_python(self, vcf_file: str) -> list[VariantEntry]:
        """
        Load variants from VCF using pure Python.

        This is the fallback used when cyvcf2 is unavailable or fails. Paths
        ending in ``.gz`` are read through ``gzip``. Header/blank lines are
        ignored; entries with fewer than five columns or a non-integer POS
        are logged and skipped. Only the first ALT allele is used.
        """
        import gzip  # local import: only this fallback path needs it

        logger.info(f"Loading variants file with Python parser: {vcf_file}")
        variants = []

        opener = gzip.open if str(vcf_file).endswith(".gz") else open
        with opener(vcf_file, "rt") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                fields = line.split("\t")
                if len(fields) < 5:
                    logger.error(f"Incorrectly formatted VCF entry: {line}")
                    continue

                try:
                    pos = int(fields[1]) - 1  # Convert to 0-indexed
                except ValueError:
                    logger.error(f"Incorrectly formatted VCF entry: {line}")
                    continue

                chrom = fields[0]
                ref = fields[3]
                # Handle multiple alts - take first one
                alt = fields[4].split(",")[0]

                variants.append(
                    VariantEntry(
                        chrom=chrom,
                        pos=pos,
                        end_pos=pos + len(ref) - 1,
                        ref=ref,
                        alt=alt,
                        **self._variant_type_flags(ref, alt),
                    )
                )

        logger.info(f"{len(variants)} variants loaded from file: {vcf_file}")
        return variants

    def load_maf(self, maf_file: str) -> list[VariantEntry]:
        """Load variants from MAF file.

        The first non-blank line that does not start with ``#`` is taken as
        the header. Rows that cannot be parsed are logged and skipped.

        Raises:
            ValueError: If any required MAF column is missing from the header.
        """
        logger.info(f"Loading variants file: {maf_file}")
        variants = []
        header_map = {}

        with open(maf_file) as f:
            # Find header line
            for line in f:
                if not line.strip() or line.lstrip().startswith("#"):
                    continue
                # Strip only the line terminator so empty edge columns
                # keep their place and indices stay aligned with data rows.
                headers = line.rstrip("\r\n").split("\t")
                header_map = {h.strip(): i for i, h in enumerate(headers)}
                break

            # Validate required columns
            missing_cols = [col for col in self._REQUIRED_MAF_COLUMNS if col not in header_map]
            if missing_cols:
                logger.error(f"Missing required MAF columns: {missing_cols}")
                raise ValueError("Incorrect MAF file header")

            # Load variants
            for line in f:
                record = line.rstrip("\r\n")
                if not record.strip():
                    continue

                variant = self._parse_maf_line(record.split("\t"), header_map, record)
                if variant:
                    variants.append(variant)

        logger.info(f"{len(variants)} variants loaded from file: {maf_file}")
        return variants

    def _parse_maf_line(
        self, fields: list[str], header_map: dict[str, int], original_line: str
    ) -> VariantEntry | None:
        """Parse a single MAF line into VariantEntry.

        Insertions, deletions and complex indels are re-anchored on the
        preceding reference base (fetched via ``reference_getter``) so that
        ref/alt follow the VCF convention; the MAF coordinates and alleles
        are kept in the ``maf_*`` fields.

        Args:
            fields: Tab-split columns of the row.
            header_map: Column name -> index, built from the header line.
            original_line: The row text, stored on the entry as ``maf_line``.

        Returns:
            The parsed entry, or ``None`` when the row has no usable alt
            allele, needs a reference base but no getter is configured, or
            raises while being parsed (the error is logged).
        """

        def column(name: str) -> str:
            return fields[header_map[name]]

        try:
            gene = column("Hugo_Symbol")
            chrom = column("Chromosome")
            pos = int(column("Start_Position")) - 1  # Convert to 0-indexed
            end_pos = int(column("End_Position")) - 1
            ref = column("Reference_Allele")

            # Use Tumor_Seq_Allele2 if Allele1 is empty or same as ref
            alt = column("Tumor_Seq_Allele1")
            if not alt or alt == ref:
                alt = column("Tumor_Seq_Allele2")
            if not alt or alt == ref:
                logger.warning(f"Could not find valid alt allele for variant: {chrom}:{pos + 1}")
                return None

            tumor_sample = column("Tumor_Sample_Barcode")
            normal_sample = column("Matched_Norm_Sample_Barcode")
            t_ref_count = int(column("t_ref_count"))
            t_alt_count = int(column("t_alt_count"))
            n_ref_count = int(column("n_ref_count"))
            n_alt_count = int(column("n_alt_count"))
            effect = column("Variant_Classification")

            # "Caller" is optional and may be absent from short rows.
            caller = ""
            if "Caller" in header_map and len(fields) > header_map["Caller"]:
                caller = column("Caller")

            # Store original MAF coordinates
            maf_pos, maf_end_pos, maf_ref, maf_alt = pos, end_pos, ref, alt

            # Convert MAF format to VCF format
            if ref == "-":
                indel_kind = "insertion"
            elif alt == "-":
                indel_kind = "deletion"
            elif len(alt) != len(ref):
                indel_kind = "complex indel"
            else:
                indel_kind = None

            if indel_kind is not None:
                if not self.reference_getter:
                    # Report the 1-based Start_Position as found in the file.
                    logger.warning(
                        f"Cannot convert MAF {indel_kind} without reference: {chrom}:{maf_pos + 1}"
                    )
                    return None

                if indel_kind == "insertion":
                    # Anchor on the base at Start_Position itself.
                    anchor = self.reference_getter(chrom, pos)
                    ref, alt = anchor, anchor + alt
                    end_pos -= 1
                else:
                    # Anchor on the base preceding the affected bases.
                    pos -= 1
                    anchor = self.reference_getter(chrom, pos)
                    if indel_kind == "deletion":
                        ref, alt = anchor + ref, anchor
                    else:
                        ref, alt = anchor + ref, anchor + alt

            return VariantEntry(
                chrom=chrom,
                pos=pos,
                end_pos=end_pos,
                ref=ref,
                alt=alt,
                tumor_sample=tumor_sample,
                normal_sample=normal_sample,
                gene=gene,
                effect=effect,
                t_ref_count=t_ref_count,
                t_alt_count=t_alt_count,
                n_ref_count=n_ref_count,
                n_alt_count=n_alt_count,
                maf_pos=maf_pos,
                maf_end_pos=maf_end_pos,
                maf_ref=maf_ref,
                maf_alt=maf_alt,
                caller=caller,
                maf_line=original_line,
                **self._variant_type_flags(ref, alt),
            )

        except Exception as e:
            # Best effort: one malformed row must not abort the whole load.
            logger.error(f"Error parsing MAF line: {e}")
            return None