py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/core/kernel.py ADDED
@@ -0,0 +1,126 @@
1
+ """
2
+ Coordinate Kernel: The source of truth for genomic coordinate systems.
3
+
4
+ Handles conversion between:
5
+ - VCF (1-based)
6
+ - MAF (1-based)
7
+ - Internal (0-based, half-open [start, end))
8
+
9
+ Ensures consistent representation of variants:
10
+ - SNPs: 0-based index of the base.
11
+ - Insertions: 0-based index of the ANCHOR base (preceding the insertion).
12
+ - Deletions: 0-based index of the ANCHOR base (preceding the deletion).
13
+ """
14
+
15
+ from gbcms.models.core import Variant, VariantType
16
+
17
+
18
+ class CoordinateKernel:
19
+ """
20
+ Stateless utility for coordinate transformations and normalization.
21
+ """
22
+
23
+ @staticmethod
24
+ def vcf_to_internal(
25
+ chrom: str, pos: int, ref: str, alt: str, original_id: str | None = None
26
+ ) -> Variant:
27
+ """
28
+ Convert VCF coordinates (1-based) to internal normalized Variant.
29
+
30
+ Args:
31
+ chrom: Chromosome name
32
+ pos: 1-based position from VCF
33
+ ref: Reference allele
34
+ alt: Alternate allele
35
+ original_id: Optional VCF ID
36
+
37
+ Returns:
38
+ Normalized Variant object
39
+ """
40
+ norm_chrom = CoordinateKernel.normalize_chromosome(chrom)
41
+
42
+ # Determine variant type and internal position
43
+ if len(ref) == 1 and len(alt) == 1:
44
+ vtype = VariantType.SNP
45
+ # SNP: VCF POS is the base itself.
46
+ # 1-based 10 -> 0-based 9
47
+ internal_pos = pos - 1
48
+
49
+ elif len(ref) == 1 and len(alt) > 1:
50
+ vtype = VariantType.INSERTION
51
+ # Insertion: VCF POS is the base BEFORE the insertion (the anchor).
52
+ # VCF: POS=10, REF=A, ALT=AT (Insertion of T after A at 10)
53
+ # Internal: 0-based index of the ANCHOR base.
54
+ # 1-based 10 -> 0-based 9
55
+ internal_pos = pos - 1
56
+
57
+ elif len(ref) > 1 and len(alt) == 1:
58
+ vtype = VariantType.DELETION
59
+ # Deletion: VCF POS is the base BEFORE the deletion (the anchor).
60
+ # VCF: POS=10, REF=AT, ALT=A (Deletion of T after A at 10)
61
+ # Internal POS: 0-based index of the ANCHOR base.
62
+ # 1-based 10 -> 0-based 9
63
+ internal_pos = pos - 1
64
+
65
+ else:
66
+ vtype = VariantType.COMPLEX
67
+ # Complex: Treat start as 0-based index of first ref base
68
+ internal_pos = pos - 1
69
+
70
+ return Variant(
71
+ chrom=norm_chrom,
72
+ pos=internal_pos,
73
+ ref=ref,
74
+ alt=alt,
75
+ variant_type=vtype,
76
+ original_id=original_id,
77
+ )
78
+
79
+ @staticmethod
80
+ def maf_to_internal(chrom: str, start_pos: int, end_pos: int, ref: str, alt: str) -> Variant:
81
+ """
82
+ Convert MAF coordinates (1-based inclusive) to internal normalized Variant.
83
+
84
+ MAF coordinates are generally 1-based inclusive [start, end].
85
+ """
86
+ norm_chrom = CoordinateKernel.normalize_chromosome(chrom)
87
+
88
+ # Handle MAF indels which often use '-'
89
+ if ref == "-" or alt == "-":
90
+ # MAF Insertion: Start_Position is the base BEFORE the insertion (anchor).
91
+ # ref='-', alt='T' -> VCF-like would be ref='A', alt='AT' (requires lookup)
92
+ # But if we just want to represent it internally:
93
+ if ref == "-": # Insertion
94
+ vtype = VariantType.INSERTION
95
+ # MAF Start_Position is usually the flanking base 0 or 1?
96
+ # Standard MAF: Start_Position is the base BEFORE the insertion.
97
+ internal_pos = start_pos - 1
98
+ else: # Deletion
99
+ vtype = VariantType.DELETION
100
+ # MAF Start_Position is the first deleted base? Or anchor?
101
+ # Usually first deleted base.
102
+ # We need to convert to anchor-based for consistency if possible,
103
+ # OR handle MAF-style internally.
104
+ # Let's assume we want VCF-style anchor-based internally.
105
+ # This effectively requires a reference lookup to get the anchor base.
106
+ # For now, we will mark it as needing normalization or handle it in the engine.
107
+ internal_pos = start_pos - 1
108
+
109
+ elif len(ref) == len(alt) == 1:
110
+ vtype = VariantType.SNP
111
+ internal_pos = start_pos - 1
112
+
113
+ else:
114
+ vtype = VariantType.COMPLEX
115
+ internal_pos = start_pos - 1
116
+
117
+ return Variant(chrom=norm_chrom, pos=internal_pos, ref=ref, alt=alt, variant_type=vtype)
118
+
119
+ @staticmethod
120
+ def normalize_chromosome(chrom: str) -> str:
121
+ """
122
+ Normalize chromosome name (remove 'chr' prefix).
123
+ """
124
+ if chrom.lower().startswith("chr"):
125
+ return chrom[3:]
126
+ return chrom
gbcms/io/input.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ Input Adapters: Handling VCF and MAF inputs.
3
+
4
+ This module provides classes to read variants from VCF and MAF files,
5
+ converting them into the internal normalized representation using CoordinateKernel.
6
+ """
7
+
8
import csv
import logging
from collections.abc import Iterator
from pathlib import Path

import pysam
from pydantic import ValidationError

from ..core.kernel import CoordinateKernel
from ..models.core import Variant
17
+
18
+
19
class VariantReader:
    """
    Base class for variant readers.

    Subclasses override ``__iter__`` to yield ``Variant`` objects. The base
    implementation only raises ``NotImplementedError``.
    """

    def __iter__(self) -> Iterator["Variant"]:
        raise NotImplementedError
24
+
25
+
26
class VcfReader(VariantReader):
    """
    Reads variants from a VCF file.

    The file is opened with ``pysam.VariantFile`` when the reader is created
    and stays open until ``close()`` is called. The reader can also be used
    as a context manager, which closes the file on exit.
    """

    def __init__(self, path: Path):
        self.path = path
        self._vcf = pysam.VariantFile(str(path))

    def __enter__(self) -> "VcfReader":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def __iter__(self) -> Iterator[Variant]:
        """
        Yield one ``Variant`` per (record, ALT allele) pair.

        Records without a REF allele, and records without any ALT allele,
        produce nothing.
        """
        for record in self._vcf:
            # REF does not depend on the ALT being processed, so check it once.
            if not record.ref:
                continue

            # Multi-allelic records are split into one variant per ALT.
            for alt in record.alts or ():
                yield CoordinateKernel.vcf_to_internal(
                    chrom=record.chrom,
                    # pysam's ``record.pos`` is the 1-based VCF POS (the
                    # 0-based equivalent is ``record.start``), which is what
                    # vcf_to_internal expects. Do not add or subtract 1 here.
                    pos=record.pos,
                    ref=record.ref,
                    alt=alt,
                    original_id=record.id,
                )

    def close(self) -> None:
        """Close the underlying VCF file handle."""
        self._vcf.close()
60
+
61
+
62
class MafReader(VariantReader):
    """
    Reads variants from a tab-separated MAF file.

    When a reference FASTA is supplied, MAF-style indels (``-`` in the
    reference or alternate allele) are rewritten into VCF-style anchor-based
    alleles before conversion. Without a FASTA they are passed through
    ``CoordinateKernel.maf_to_internal`` unchanged.

    The reader can be used as a context manager, which closes the FASTA
    handle (if any) on exit.
    """

    def __init__(self, path: Path, fasta_path: Path | None = None):
        self.path = path
        self.fasta = pysam.FastaFile(str(fasta_path)) if fasta_path else None

    def __enter__(self) -> "MafReader":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def __iter__(self) -> Iterator[Variant]:
        """
        Yield one ``Variant`` per usable MAF row.

        Leading ``#`` comment lines are skipped; the first non-comment line is
        the column header. Rows that are malformed (missing columns, truncated
        rows, non-integer positions, model validation failures) or whose
        anchor base cannot be fetched are skipped with a logged warning
        instead of aborting the iteration.
        """
        log = logging.getLogger(__name__)

        # newline="" lets the csv module do its own newline handling, as the
        # csv documentation recommends.
        with open(self.path, newline="") as f:
            # Skip leading comment lines, then rewind to the start of the
            # first non-comment line so DictReader sees it as the header.
            while True:
                offset = f.tell()
                line = f.readline()
                if not line.startswith("#"):
                    f.seek(offset)
                    break

            for row in csv.DictReader(f, delimiter="\t"):
                try:
                    variant = self._row_to_variant(row)
                except (KeyError, TypeError, ValueError, ValidationError) as exc:
                    # TypeError covers truncated rows: DictReader fills the
                    # missing fields with None, which int()/len() reject.
                    log.warning("Skipping malformed MAF row in %s: %r", self.path, exc)
                    continue

                if variant is None:
                    log.warning(
                        "Skipping MAF indel at %s:%s in %s: anchor base not found "
                        "in reference",
                        row.get("Chromosome"),
                        row.get("Start_Position"),
                        self.path,
                    )
                    continue

                yield variant

    def _row_to_variant(self, row: dict) -> Variant | None:
        """Convert one MAF row; return None if a needed anchor base is unavailable."""
        chrom = row["Chromosome"]
        start_pos = int(row["Start_Position"])
        ref = row["Reference_Allele"]
        alt = row["Tumor_Seq_Allele2"]  # Standard MAF alt column

        if self.fasta and (ref == "-" or alt == "-"):
            is_insertion = ref == "-"
            # Insertion: Start_Position is already the anchor (the base before
            # the inserted sequence). Deletion: Start_Position is the first
            # deleted base, so the anchor is one base to the left.
            anchor_pos_1based = start_pos if is_insertion else start_pos - 1

            anchor_base = self._fetch_anchor_base(chrom, anchor_pos_1based - 1)
            if anchor_base is None:
                return None

            # VCF-style alleles: both start with the anchor base.
            if is_insertion:
                vcf_ref, vcf_alt = anchor_base, anchor_base + alt
            else:
                vcf_ref, vcf_alt = anchor_base + ref, anchor_base

            variant = CoordinateKernel.vcf_to_internal(
                chrom=chrom, pos=anchor_pos_1based, ref=vcf_ref, alt=vcf_alt
            )
        elif len(ref) == len(alt) == 1 and ref != "-" and alt != "-":
            # SNP: MAF Start_Position equals VCF POS.
            variant = CoordinateKernel.vcf_to_internal(
                chrom=chrom, pos=start_pos, ref=ref, alt=alt
            )
        else:
            # Complex variants, or indels without a FASTA: no anchor can be
            # derived, so the MAF representation is kept as-is.
            variant = CoordinateKernel.maf_to_internal(
                chrom=chrom,
                start_pos=start_pos,
                end_pos=int(row["End_Position"]),
                ref=ref,
                alt=alt,
            )

        return variant.model_copy(update={"metadata": row})

    def _fetch_anchor_base(self, chrom: str, pos_0based: int) -> str | None:
        """Fetch the upper-cased reference base at a 0-based position, or None."""
        if pos_0based < 0:
            # A deletion starting at the first base of a contig has no anchor.
            return None

        # The FASTA may name contigs with or without the "chr" prefix; try the
        # normalized name first, then the name exactly as given in the MAF.
        for name in (CoordinateKernel.normalize_chromosome(chrom), chrom):
            try:
                base = self.fasta.fetch(name, pos_0based, pos_0based + 1)
            except (KeyError, ValueError):
                continue
            if base:
                return base.upper()
            # An empty result means the position is past the contig end; treat
            # it like a failed lookup rather than building an anchorless allele.

        return None

    def close(self) -> None:
        """Close the reference FASTA handle if one was opened."""
        if self.fasta:
            self.fasta.close()
178
+
179
+
180
class ReferenceChecker:
    """
    Utility to check variants against a reference FASTA.

    Ensures that the REF allele of a variant matches the genome sequence at
    the variant's position. The checker can be used as a context manager,
    which closes the FASTA handle on exit.
    """

    def __init__(self, fasta_path: Path):
        self.fasta = pysam.FastaFile(str(fasta_path))

    def __enter__(self) -> "ReferenceChecker":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def validate(self, variant: "Variant") -> bool:
        """
        Check if variant REF matches the reference genome.

        The reference is read at ``[variant.pos, variant.pos + len(variant.ref))``
        (0-based, half-open) and compared case-insensitively with
        ``variant.ref``. The contig is looked up first under ``variant.chrom``
        and, if that fails, under ``"chr" + variant.chrom``.

        Args:
            variant: Variant with ``chrom``, 0-based ``pos`` and ``ref``.

        Returns:
            True if the fetched sequence equals the REF allele; False on a
            mismatch, when the region cannot be fetched under either contig
            name, or when any other error occurs (this check is best-effort
            and never raises).
        """
        log = logging.getLogger(__name__)

        try:
            chrom = variant.chrom
            start = variant.pos
            end = start + len(variant.ref)

            try:
                ref_seq = self.fasta.fetch(chrom, start, end)
            except (ValueError, KeyError):
                try:
                    # The FASTA may use 'chr'-prefixed contig names.
                    ref_seq = self.fasta.fetch(f"chr{chrom}", start, end)
                except (ValueError, KeyError) as e:
                    # Logged instead of printed so stdout stays clean.
                    log.debug("Failed to fetch %s and chr%s: %s", chrom, chrom, e)
                    return False

            return ref_seq.upper() == variant.ref.upper()

        except Exception:
            # Deliberately broad: validation must never raise. Keep the
            # traceback available at debug level for troubleshooting.
            log.debug("Reference validation failed unexpectedly", exc_info=True)
            return False

    def close(self) -> None:
        """Close the reference FASTA handle."""
        self.fasta.close()