py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +1 -13
- gbcms/cli.py +134 -716
- gbcms/core/kernel.py +126 -0
- gbcms/io/input.py +222 -0
- gbcms/io/output.py +361 -0
- gbcms/models/core.py +133 -0
- gbcms/pipeline.py +212 -0
- gbcms/py.typed +0 -0
- py_gbcms-2.1.1.dist-info/METADATA +216 -0
- py_gbcms-2.1.1.dist-info/RECORD +13 -0
- gbcms/config.py +0 -98
- gbcms/counter.py +0 -1074
- gbcms/models.py +0 -295
- gbcms/numba_counter.py +0 -394
- gbcms/output.py +0 -573
- gbcms/parallel.py +0 -129
- gbcms/processor.py +0 -293
- gbcms/reference.py +0 -86
- gbcms/variant.py +0 -390
- py_gbcms-2.0.0.dist-info/METADATA +0 -506
- py_gbcms-2.0.0.dist-info/RECORD +0 -16
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/WHEEL +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/entry_points.txt +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/licenses/LICENSE +0 -0
gbcms/variant.py
DELETED
|
@@ -1,390 +0,0 @@
|
|
|
1
|
-
"""Variant loading and representation."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from dataclasses import dataclass, field
|
|
5
|
-
from typing import Optional
|
|
6
|
-
|
|
7
|
-
import numpy as np
|
|
8
|
-
|
|
9
|
-
from .config import CountType
|
|
10
|
-
|
|
11
|
-
logger = logging.getLogger(__name__)
|
|
12
|
-
|
|
13
|
-
# Try to import cyvcf2 for fast VCF parsing
|
|
14
|
-
try:
|
|
15
|
-
from cyvcf2 import VCF
|
|
16
|
-
|
|
17
|
-
HAS_CYVCF2 = True
|
|
18
|
-
logger.debug("cyvcf2 available - using fast VCF parsing")
|
|
19
|
-
except ImportError:
|
|
20
|
-
HAS_CYVCF2 = False
|
|
21
|
-
logger.debug("cyvcf2 not available - using pure Python VCF parsing")
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
@dataclass
class VariantEntry:
    """Represents a variant with its counts across samples.

    Coordinates (``pos``/``end_pos``) are 0-indexed. The ``maf_*`` fields keep
    the values as they were read from a MAF record before any conversion, and
    ``maf_line`` keeps the source line itself so it can be written back out.

    Instances order by chromosome first (numeric chromosomes, then X, Y,
    M/MT, then everything else alphabetically) and by ``pos`` second.
    """

    chrom: str
    pos: int  # 0-indexed
    end_pos: int
    ref: str
    alt: str
    snp: bool = False
    dnp: bool = False
    dnp_len: int = 0
    insertion: bool = False
    deletion: bool = False
    tumor_sample: str = ""
    normal_sample: str = ""
    gene: str = ""
    effect: str = ""
    t_ref_count: int = 0
    t_alt_count: int = 0
    n_ref_count: int = 0
    n_alt_count: int = 0
    maf_pos: int = 0
    maf_end_pos: int = 0
    maf_ref: str = ""
    maf_alt: str = ""
    caller: str = ""
    # Per-sample count vectors, keyed by sample name and indexed by CountType.
    base_count: dict[str, np.ndarray] = field(default_factory=dict)
    duplicate_variant_ptr: Optional["VariantEntry"] = None
    maf_line: str = ""  # Store original MAF line for output

    def get_variant_key(self) -> tuple[str, int, str, str]:
        """Return unique key for variant identification."""
        return (self.chrom, self.pos, self.ref, self.alt)

    def initialize_counts(self, sample_names: list[str]) -> None:
        """Create a zeroed count array for every sample that has none yet.

        Samples that already have an array are left untouched.

        Args:
            sample_names: Names of the samples to prepare counts for.
        """
        for sample in sample_names:
            if sample not in self.base_count:
                self.base_count[sample] = np.zeros(len(CountType), dtype=np.float32)

    def get_count(self, sample: str, count_type: "CountType") -> float:
        """Get count for specific sample and type.

        Args:
            sample: Sample name to look up.
            count_type: Index into the sample's count array.

        Returns:
            The stored count as a Python float, or 0.0 when the sample has no
            count array.
        """
        counts = self.base_count.get(sample)
        if counts is None:
            return 0.0
        return float(counts[count_type])

    def __lt__(self, other: "VariantEntry") -> bool:
        """Compare variants for sorting (chromosome rank, then position)."""
        # Compare the normalised chromosome key rather than the raw string so
        # that differently spelled names of the same chromosome ("chr1"/"1",
        # "chrM"/"MT") still fall back to ordering by position.
        return (self._chrom_sort_key(), self.pos) < (other._chrom_sort_key(), other.pos)

    def _chrom_sort_key(self) -> tuple:
        """Generate sort key for chromosome.

        Returns:
            ``(0, n)`` for numeric chromosomes, ``(1, rank)`` for X, Y and
            M/MT, and ``(2, name)`` for anything else. The leading group
            number guarantees an int is never compared with a str.
        """
        # Only a leading "chr" is a naming prefix; leave inner occurrences alone.
        chrom = self.chrom.removeprefix("chr")
        try:
            return (0, int(chrom))
        except ValueError:
            pass

        special_rank = {"X": 0, "Y": 1, "M": 2, "MT": 2}
        if chrom in special_rank:
            return (1, special_rank[chrom])
        return (2, chrom)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
class VariantLoader:
    """Loads variants from VCF or MAF files into ``VariantEntry`` objects."""

    def __init__(self, reference_getter=None):
        """
        Initialize variant loader.

        Args:
            reference_getter: Callable that takes (chrom, pos) and returns base.
                It is needed to convert MAF indels to anchored VCF-style
                alleles; without it those records are skipped with a warning.
        """
        self.reference_getter = reference_getter

    @staticmethod
    def _variant_type_flags(ref: str, alt: str) -> dict:
        """Classify a ref/alt pair by allele length.

        Returns:
            Keyword arguments (``snp``, ``dnp``, ``dnp_len``, ``insertion``,
            ``deletion``) ready to be passed to ``VariantEntry``.
        """
        ref_len, alt_len = len(ref), len(alt)
        is_dnp = alt_len == ref_len > 1
        return {
            "snp": alt_len == ref_len == 1,
            "dnp": is_dnp,
            "dnp_len": ref_len if is_dnp else 0,
            "insertion": alt_len > ref_len,
            "deletion": alt_len < ref_len,
        }

    def load_vcf(self, vcf_file: str) -> list[VariantEntry]:
        """
        Load variants from VCF file.

        Uses cyvcf2 for fast parsing if available, otherwise falls back to pure Python.

        Args:
            vcf_file: Path to VCF file (can be .vcf or .vcf.gz)

        Returns:
            List of VariantEntry objects
        """
        if HAS_CYVCF2:
            return self._load_vcf_cyvcf2(vcf_file)
        return self._load_vcf_python(vcf_file)

    def _load_vcf_cyvcf2(self, vcf_file: str) -> list[VariantEntry]:
        """
        Load variants from VCF using cyvcf2 (C-based parser).

        Only the first ALT allele of a record is used. Any error raised while
        opening or reading the file is logged and the pure Python parser is
        tried instead.

        Args:
            vcf_file: Path to the VCF file.

        Returns:
            List of VariantEntry objects
        """
        logger.info(f"Loading variants file with cyvcf2: {vcf_file}")
        variants = []

        try:
            vcf = VCF(vcf_file)
            try:
                for record in vcf:
                    ref = record.REF
                    # Handle multiple alts - take first one
                    alt = record.ALT[0] if record.ALT else ""
                    pos = record.POS - 1  # Convert to 0-indexed

                    variants.append(
                        VariantEntry(
                            chrom=record.CHROM,
                            pos=pos,
                            end_pos=pos + len(ref) - 1,
                            ref=ref,
                            alt=alt,
                            **self._variant_type_flags(ref, alt),
                        )
                    )
            finally:
                # Release the file handle even when iteration fails part-way.
                vcf.close()

        except Exception as e:
            # Deliberately broad: whatever went wrong, retry with the fallback parser.
            logger.error(f"Error loading VCF with cyvcf2: {e}")
            logger.info("Falling back to pure Python VCF parser")
            return self._load_vcf_python(vcf_file)

        logger.info(f"{len(variants)} variants loaded from file: {vcf_file}")
        return variants

    def _load_vcf_python(self, vcf_file: str) -> list[VariantEntry]:
        """
        Load variants from VCF using pure Python.

        This is the fallback method when cyvcf2 is not available or fails.
        Files whose name ends in ``.gz``/``.bgz`` are read through ``gzip``.
        Only the first ALT allele of a record is used. Lines with fewer than
        five columns or a non-integer POS are logged and skipped.

        Args:
            vcf_file: Path to the VCF file.

        Returns:
            List of VariantEntry objects
        """
        import gzip

        logger.info(f"Loading variants file with Python parser: {vcf_file}")
        variants = []

        # load_vcf advertises .vcf.gz support, so the fallback must honour it too.
        opener = gzip.open if str(vcf_file).endswith((".gz", ".bgz")) else open

        with opener(vcf_file, "rt") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                fields = line.split("\t")
                if len(fields) < 5:
                    logger.error(f"Incorrectly formatted VCF entry: {line}")
                    continue

                try:
                    pos = int(fields[1]) - 1  # Convert to 0-indexed
                except ValueError:
                    # Treat a bad POS like any other malformed line.
                    logger.error(f"Incorrectly formatted VCF entry: {line}")
                    continue

                chrom = fields[0]
                ref = fields[3]
                # Handle multiple alts - take first one
                alt = fields[4].split(",")[0]

                variants.append(
                    VariantEntry(
                        chrom=chrom,
                        pos=pos,
                        end_pos=pos + len(ref) - 1,
                        ref=ref,
                        alt=alt,
                        **self._variant_type_flags(ref, alt),
                    )
                )

        logger.info(f"{len(variants)} variants loaded from file: {vcf_file}")
        return variants

    def load_maf(self, maf_file: str) -> list[VariantEntry]:
        """Load variants from MAF file.

        The first line that is neither blank nor a ``#`` comment is taken as
        the column header. Records that cannot be parsed are skipped (see
        ``_parse_maf_line``).

        Args:
            maf_file: Path to the tab-separated MAF file.

        Returns:
            List of VariantEntry objects

        Raises:
            ValueError: If any required column is missing from the header.
        """
        logger.info(f"Loading variants file: {maf_file}")
        variants = []
        header_map = {}

        with open(maf_file) as f:
            # Find header line
            for raw_line in f:
                if not raw_line.strip() or raw_line.startswith("#"):
                    continue
                # Strip only the newline so column indices match the data rows.
                headers = raw_line.rstrip("\r\n").split("\t")
                header_map = {name.strip(): idx for idx, name in enumerate(headers)}
                break

            # Validate required columns
            required_cols = [
                "Hugo_Symbol",
                "Chromosome",
                "Start_Position",
                "End_Position",
                "Reference_Allele",
                "Tumor_Seq_Allele1",
                "Tumor_Seq_Allele2",
                "Tumor_Sample_Barcode",
                "Matched_Norm_Sample_Barcode",
                "t_ref_count",
                "t_alt_count",
                "n_ref_count",
                "n_alt_count",
                "Variant_Classification",
            ]

            missing_cols = [col for col in required_cols if col not in header_map]
            if missing_cols:
                logger.error(f"Missing required MAF columns: {missing_cols}")
                raise ValueError("Incorrect MAF file header")

            # Load variants
            for raw_line in f:
                # Keep leading/trailing tabs: they delimit empty columns, and
                # stripping them would shift or drop fields.
                line = raw_line.rstrip("\r\n")
                if not line.strip():
                    continue

                variant = self._parse_maf_line(line.split("\t"), header_map, line)
                if variant:
                    variants.append(variant)

        logger.info(f"{len(variants)} variants loaded from file: {maf_file}")
        return variants

    def _parse_maf_line(
        self, fields: list[str], header_map: dict[str, int], original_line: str
    ) -> VariantEntry | None:
        """Parse a single MAF line into VariantEntry.

        Indels are rewritten to VCF-style alleles by prepending the base
        returned by ``reference_getter`` for the anchor position.

        Args:
            fields: Tab-split columns of the record.
            header_map: Column name to column index.
            original_line: The record text, stored on the entry as ``maf_line``.

        Returns:
            The parsed entry, or None when the record has no usable alt
            allele, is an indel and no ``reference_getter`` is set, or any
            error occurs while parsing (the error is logged).
        """
        try:
            gene = fields[header_map["Hugo_Symbol"]]
            chrom = fields[header_map["Chromosome"]]
            pos = int(fields[header_map["Start_Position"]]) - 1  # Convert to 0-indexed
            end_pos = int(fields[header_map["End_Position"]]) - 1
            ref = fields[header_map["Reference_Allele"]]
            alt = fields[header_map["Tumor_Seq_Allele1"]]

            # Use Tumor_Seq_Allele2 if Allele1 is empty or same as ref
            if not alt or alt == ref:
                alt = fields[header_map["Tumor_Seq_Allele2"]]

            if not alt or alt == ref:
                logger.warning(f"Could not find valid alt allele for variant: {chrom}:{pos + 1}")
                return None

            tumor_sample = fields[header_map["Tumor_Sample_Barcode"]]
            normal_sample = fields[header_map["Matched_Norm_Sample_Barcode"]]
            t_ref_count = int(fields[header_map["t_ref_count"]])
            t_alt_count = int(fields[header_map["t_alt_count"]])
            n_ref_count = int(fields[header_map["n_ref_count"]])
            n_alt_count = int(fields[header_map["n_alt_count"]])
            effect = fields[header_map["Variant_Classification"]]

            # "Caller" is optional and short rows may not reach it.
            caller = ""
            caller_idx = header_map.get("Caller")
            if caller_idx is not None and len(fields) > caller_idx:
                caller = fields[caller_idx]

            # Store original MAF coordinates
            maf_pos = pos
            maf_end_pos = end_pos
            maf_ref = ref
            maf_alt = alt

            # Convert MAF format to VCF format. Warnings report the 1-based
            # MAF start so they match the coordinate the user sees in the file.
            if ref == "-":  # Insertion in MAF format
                if not self.reference_getter:
                    logger.warning(
                        f"Cannot convert MAF insertion without reference: {chrom}:{maf_pos + 1}"
                    )
                    return None
                anchor = self.reference_getter(chrom, pos)
                ref = anchor
                alt = anchor + alt
                end_pos -= 1
            elif alt == "-":  # Deletion in MAF format
                if not self.reference_getter:
                    logger.warning(
                        f"Cannot convert MAF deletion without reference: {chrom}:{maf_pos + 1}"
                    )
                    return None
                pos -= 1  # Move onto the anchor base preceding the event
                anchor = self.reference_getter(chrom, pos)
                ref = anchor + ref
                alt = anchor
            elif len(alt) != len(ref):  # Complex indel
                if not self.reference_getter:
                    logger.warning(
                        f"Cannot convert MAF complex indel without reference: {chrom}:{maf_pos + 1}"
                    )
                    return None
                pos -= 1  # Move onto the anchor base preceding the event
                anchor = self.reference_getter(chrom, pos)
                ref = anchor + ref
                alt = anchor + alt

            return VariantEntry(
                chrom=chrom,
                pos=pos,
                end_pos=end_pos,
                ref=ref,
                alt=alt,
                tumor_sample=tumor_sample,
                normal_sample=normal_sample,
                gene=gene,
                effect=effect,
                t_ref_count=t_ref_count,
                t_alt_count=t_alt_count,
                n_ref_count=n_ref_count,
                n_alt_count=n_alt_count,
                maf_pos=maf_pos,
                maf_end_pos=maf_end_pos,
                maf_ref=maf_ref,
                maf_alt=maf_alt,
                caller=caller,
                maf_line=original_line,
                **self._variant_type_flags(ref, alt),
            )

        except Exception as e:
            # Best-effort loading: a bad record (or a failing reference
            # lookup) is reported and skipped instead of aborting the load.
            logger.error(f"Error parsing MAF line: {e}")
            return None
|