py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +1 -13
- gbcms/cli.py +134 -716
- gbcms/core/kernel.py +126 -0
- gbcms/io/input.py +222 -0
- gbcms/io/output.py +361 -0
- gbcms/models/core.py +133 -0
- gbcms/pipeline.py +212 -0
- gbcms/py.typed +0 -0
- py_gbcms-2.1.1.dist-info/METADATA +216 -0
- py_gbcms-2.1.1.dist-info/RECORD +13 -0
- gbcms/config.py +0 -98
- gbcms/counter.py +0 -1074
- gbcms/models.py +0 -295
- gbcms/numba_counter.py +0 -394
- gbcms/output.py +0 -573
- gbcms/parallel.py +0 -129
- gbcms/processor.py +0 -293
- gbcms/reference.py +0 -86
- gbcms/variant.py +0 -390
- py_gbcms-2.0.0.dist-info/METADATA +0 -506
- py_gbcms-2.0.0.dist-info/RECORD +0 -16
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/WHEEL +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/entry_points.txt +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/licenses/LICENSE +0 -0
gbcms/core/kernel.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Coordinate Kernel: The source of truth for genomic coordinate systems.
|
|
3
|
+
|
|
4
|
+
Handles conversion between:
|
|
5
|
+
- VCF (1-based)
|
|
6
|
+
- MAF (1-based)
|
|
7
|
+
- Internal (0-based, half-open [start, end))
|
|
8
|
+
|
|
9
|
+
Ensures consistent representation of variants:
|
|
10
|
+
- SNPs: 0-based index of the base.
|
|
11
|
+
- Insertions: 0-based index of the ANCHOR base (preceding the insertion).
|
|
12
|
+
- Deletions: 0-based index of the ANCHOR base (preceding the deletion).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from gbcms.models.core import Variant, VariantType
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CoordinateKernel:
|
|
19
|
+
"""
|
|
20
|
+
Stateless utility for coordinate transformations and normalization.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
@staticmethod
|
|
24
|
+
def vcf_to_internal(
|
|
25
|
+
chrom: str, pos: int, ref: str, alt: str, original_id: str | None = None
|
|
26
|
+
) -> Variant:
|
|
27
|
+
"""
|
|
28
|
+
Convert VCF coordinates (1-based) to internal normalized Variant.
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
chrom: Chromosome name
|
|
32
|
+
pos: 1-based position from VCF
|
|
33
|
+
ref: Reference allele
|
|
34
|
+
alt: Alternate allele
|
|
35
|
+
original_id: Optional VCF ID
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
Normalized Variant object
|
|
39
|
+
"""
|
|
40
|
+
norm_chrom = CoordinateKernel.normalize_chromosome(chrom)
|
|
41
|
+
|
|
42
|
+
# Determine variant type and internal position
|
|
43
|
+
if len(ref) == 1 and len(alt) == 1:
|
|
44
|
+
vtype = VariantType.SNP
|
|
45
|
+
# SNP: VCF POS is the base itself.
|
|
46
|
+
# 1-based 10 -> 0-based 9
|
|
47
|
+
internal_pos = pos - 1
|
|
48
|
+
|
|
49
|
+
elif len(ref) == 1 and len(alt) > 1:
|
|
50
|
+
vtype = VariantType.INSERTION
|
|
51
|
+
# Insertion: VCF POS is the base BEFORE the insertion (the anchor).
|
|
52
|
+
# VCF: POS=10, REF=A, ALT=AT (Insertion of T after A at 10)
|
|
53
|
+
# Internal: 0-based index of the ANCHOR base.
|
|
54
|
+
# 1-based 10 -> 0-based 9
|
|
55
|
+
internal_pos = pos - 1
|
|
56
|
+
|
|
57
|
+
elif len(ref) > 1 and len(alt) == 1:
|
|
58
|
+
vtype = VariantType.DELETION
|
|
59
|
+
# Deletion: VCF POS is the base BEFORE the deletion (the anchor).
|
|
60
|
+
# VCF: POS=10, REF=AT, ALT=A (Deletion of T after A at 10)
|
|
61
|
+
# Internal POS: 0-based index of the ANCHOR base.
|
|
62
|
+
# 1-based 10 -> 0-based 9
|
|
63
|
+
internal_pos = pos - 1
|
|
64
|
+
|
|
65
|
+
else:
|
|
66
|
+
vtype = VariantType.COMPLEX
|
|
67
|
+
# Complex: Treat start as 0-based index of first ref base
|
|
68
|
+
internal_pos = pos - 1
|
|
69
|
+
|
|
70
|
+
return Variant(
|
|
71
|
+
chrom=norm_chrom,
|
|
72
|
+
pos=internal_pos,
|
|
73
|
+
ref=ref,
|
|
74
|
+
alt=alt,
|
|
75
|
+
variant_type=vtype,
|
|
76
|
+
original_id=original_id,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
@staticmethod
|
|
80
|
+
def maf_to_internal(chrom: str, start_pos: int, end_pos: int, ref: str, alt: str) -> Variant:
|
|
81
|
+
"""
|
|
82
|
+
Convert MAF coordinates (1-based inclusive) to internal normalized Variant.
|
|
83
|
+
|
|
84
|
+
MAF coordinates are generally 1-based inclusive [start, end].
|
|
85
|
+
"""
|
|
86
|
+
norm_chrom = CoordinateKernel.normalize_chromosome(chrom)
|
|
87
|
+
|
|
88
|
+
# Handle MAF indels which often use '-'
|
|
89
|
+
if ref == "-" or alt == "-":
|
|
90
|
+
# MAF Insertion: Start_Position is the base BEFORE the insertion (anchor).
|
|
91
|
+
# ref='-', alt='T' -> VCF-like would be ref='A', alt='AT' (requires lookup)
|
|
92
|
+
# But if we just want to represent it internally:
|
|
93
|
+
if ref == "-": # Insertion
|
|
94
|
+
vtype = VariantType.INSERTION
|
|
95
|
+
# MAF Start_Position is usually the flanking base 0 or 1?
|
|
96
|
+
# Standard MAF: Start_Position is the base BEFORE the insertion.
|
|
97
|
+
internal_pos = start_pos - 1
|
|
98
|
+
else: # Deletion
|
|
99
|
+
vtype = VariantType.DELETION
|
|
100
|
+
# MAF Start_Position is the first deleted base? Or anchor?
|
|
101
|
+
# Usually first deleted base.
|
|
102
|
+
# We need to convert to anchor-based for consistency if possible,
|
|
103
|
+
# OR handle MAF-style internally.
|
|
104
|
+
# Let's assume we want VCF-style anchor-based internally.
|
|
105
|
+
# This effectively requires a reference lookup to get the anchor base.
|
|
106
|
+
# For now, we will mark it as needing normalization or handle it in the engine.
|
|
107
|
+
internal_pos = start_pos - 1
|
|
108
|
+
|
|
109
|
+
elif len(ref) == len(alt) == 1:
|
|
110
|
+
vtype = VariantType.SNP
|
|
111
|
+
internal_pos = start_pos - 1
|
|
112
|
+
|
|
113
|
+
else:
|
|
114
|
+
vtype = VariantType.COMPLEX
|
|
115
|
+
internal_pos = start_pos - 1
|
|
116
|
+
|
|
117
|
+
return Variant(chrom=norm_chrom, pos=internal_pos, ref=ref, alt=alt, variant_type=vtype)
|
|
118
|
+
|
|
119
|
+
@staticmethod
|
|
120
|
+
def normalize_chromosome(chrom: str) -> str:
|
|
121
|
+
"""
|
|
122
|
+
Normalize chromosome name (remove 'chr' prefix).
|
|
123
|
+
"""
|
|
124
|
+
if chrom.lower().startswith("chr"):
|
|
125
|
+
return chrom[3:]
|
|
126
|
+
return chrom
|
gbcms/io/input.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Input Adapters: Handling VCF and MAF inputs.
|
|
3
|
+
|
|
4
|
+
This module provides classes to read variants from VCF and MAF files,
|
|
5
|
+
converting them into the internal normalized representation using CoordinateKernel.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import csv
|
|
9
|
+
from collections.abc import Iterator
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import pysam
|
|
13
|
+
from pydantic import ValidationError
|
|
14
|
+
|
|
15
|
+
from ..core.kernel import CoordinateKernel
|
|
16
|
+
from ..models.core import Variant
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class VariantReader:
    """Base class for variant readers; concrete readers override iteration."""

    def __iter__(self) -> Iterator[Variant]:
        """Iterate over variants; not implemented on the base class."""
        raise NotImplementedError()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class VcfReader(VariantReader):
    """Reads variants from a VCF file, one Variant per ALT allele."""

    def __init__(self, path: Path):
        """Open the VCF at ``path`` with pysam."""
        self.path = path
        self._vcf = pysam.VariantFile(str(path))

    def __iter__(self) -> Iterator[Variant]:
        """
        Yield a normalized Variant for every (record, ALT) pair.

        Records without a REF allele or without any ALT allele produce
        nothing. pysam exposes ``record.pos`` as the 1-based VCF POS
        (``record.start`` is the 0-based equivalent), which is exactly what
        CoordinateKernel.vcf_to_internal expects, so it is passed through
        unchanged.
        """
        to_internal = CoordinateKernel.vcf_to_internal
        for rec in self._vcf:
            alleles = rec.alts or []
            if not alleles or not rec.ref:
                continue

            # Multi-allelic records are split into one variant per ALT.
            for allele in alleles:
                yield to_internal(
                    chrom=rec.chrom,
                    pos=rec.pos,
                    ref=rec.ref,
                    alt=allele,
                    original_id=rec.id,
                )

    def close(self):
        """Close the underlying pysam VCF handle."""
        self._vcf.close()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class MafReader(VariantReader):
    """
    Reads variants from a tab-separated MAF file.

    When a reference FASTA is supplied, '-' style indels are rewritten into
    VCF-style anchor-based alleles before being handed to CoordinateKernel.
    Without a FASTA, indels are passed to CoordinateKernel.maf_to_internal
    with their '-' alleles untouched.
    """

    def __init__(self, path: Path, fasta_path: Path | None = None):
        """
        Args:
            path: MAF file to read.
            fasta_path: Optional reference FASTA used to look up anchor bases.
        """
        self.path = path
        self.fasta = pysam.FastaFile(str(fasta_path)) if fasta_path else None

    def __iter__(self) -> Iterator[Variant]:
        """
        Yield one Variant per usable MAF row, with the raw row as metadata.

        Leading '#' comment lines are skipped. Rows that are malformed
        (missing or non-numeric columns, truncated lines, model validation
        failures) and indels whose anchor base cannot be fetched are skipped
        silently.
        """
        # newline="" is what the csv module asks for when it is given a file.
        with open(self.path, newline="") as f:
            # Skip comment lines, then rewind to the start of the header line
            # so DictReader sees it.
            while True:
                offset = f.tell()
                line = f.readline()
                if not line.startswith("#"):
                    f.seek(offset)
                    break

            for row in csv.DictReader(f, delimiter="\t"):
                try:
                    variant = self._row_to_variant(row)
                except (KeyError, ValueError, TypeError, ValidationError):
                    # TypeError covers short rows: DictReader fills missing
                    # columns with None, which int()/len() reject.
                    continue
                if variant is not None:
                    yield variant

    def _row_to_variant(self, row: dict) -> Variant | None:
        """Convert one MAF row to a Variant; None if an indel anchor is unavailable."""
        chrom = row["Chromosome"]
        start_pos = int(row["Start_Position"])
        ref = row["Reference_Allele"]
        alt = row["Tumor_Seq_Allele2"]  # Standard MAF alt column

        if self.fasta and (ref == "-" or alt == "-"):
            variant = self._normalize_indel(chrom, start_pos, ref, alt)
            if variant is None:
                return None
        elif len(ref) == len(alt) == 1 and ref != "-" and alt != "-":
            # For SNPs, MAF Start_Position == VCF POS
            variant = CoordinateKernel.vcf_to_internal(
                chrom=chrom, pos=start_pos, ref=ref, alt=alt
            )
        else:
            # Complex variants, or indels without a FASTA: no anchor available.
            variant = CoordinateKernel.maf_to_internal(
                chrom=chrom,
                start_pos=start_pos,
                end_pos=int(row["End_Position"]),
                ref=ref,
                alt=alt,
            )

        return variant.model_copy(update={"metadata": row})

    def _normalize_indel(
        self, chrom: str, start_pos: int, ref: str, alt: str
    ) -> Variant | None:
        """Rewrite a '-' style MAF indel as a VCF-style anchored Variant."""
        is_insertion = ref == "-"

        # Insertion: Start_Position is the anchor (base before the insertion).
        # Deletion: Start_Position is the first deleted base, so the anchor is
        # the base before it.
        anchor_pos_1based = start_pos if is_insertion else start_pos - 1

        anchor_base = self._fetch_anchor_base(chrom, anchor_pos_1based - 1)
        if anchor_base is None:
            return None

        if is_insertion:
            vcf_ref, vcf_alt = anchor_base, anchor_base + alt
        else:
            vcf_ref, vcf_alt = anchor_base + ref, anchor_base

        return CoordinateKernel.vcf_to_internal(
            chrom=chrom, pos=anchor_pos_1based, ref=vcf_ref, alt=vcf_alt
        )

    def _fetch_anchor_base(self, chrom: str, pos_0based: int) -> str | None:
        """Fetch one upper-cased reference base, trying normalized then original contig name."""
        for contig in (CoordinateKernel.normalize_chromosome(chrom), chrom):
            try:
                return self.fasta.fetch(contig, pos_0based, pos_0based + 1).upper()
            except (KeyError, ValueError):
                continue
        return None

    def close(self):
        """Close the reference FASTA handle, if one was opened."""
        if self.fasta:
            self.fasta.close()
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class ReferenceChecker:
    """
    Utility to check variants against a reference FASTA.
    Ensures that the REF allele matches the genome.
    """

    def __init__(self, fasta_path: Path):
        """Open the reference FASTA at ``fasta_path`` with pysam."""
        self.fasta = pysam.FastaFile(str(fasta_path))

    def validate(self, variant: Variant) -> bool:
        """
        Check if variant REF matches reference genome.

        The reference is read at ``[variant.pos, variant.pos + len(variant.ref))``
        (0-based, half-open) and compared case-insensitively with
        ``variant.ref``. The contig is looked up first under ``variant.chrom``
        and then with a 'chr' prefix added.

        Returns:
            True when the fetched sequence equals REF; False when it differs,
            when neither contig name can be fetched, or when any other error
            occurs. Failures are reported through ``logging`` at DEBUG level
            instead of being printed to stdout.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        logger = logging.getLogger(__name__)

        try:
            chrom = variant.chrom
            start = variant.pos
            end = start + len(variant.ref)

            ref_seq = None
            for contig in (chrom, f"chr{chrom}"):
                try:
                    ref_seq = self.fasta.fetch(contig, start, end)
                except (ValueError, KeyError) as exc:
                    logger.debug("Failed to fetch %s:%s-%s: %s", contig, start, end, exc)
                    continue
                break
            else:
                # Neither contig name could be fetched.
                return False

            if ref_seq is None:
                return False

            return ref_seq.upper() == variant.ref.upper()

        except Exception:
            # Best-effort check: unexpected failures count as "not validated".
            logger.debug("Reference validation failed unexpectedly", exc_info=True)
            return False

    def close(self):
        """Close the reference FASTA handle."""
        self.fasta.close()
|