py-gbcms 2.2.0__cp311-cp311-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +23 -0
- gbcms/_rs.cpython-311-darwin.so +0 -0
- gbcms/_rs.pyi +49 -0
- gbcms/cli.py +204 -0
- gbcms/core/__init__.py +9 -0
- gbcms/core/kernel.py +128 -0
- gbcms/io/__init__.py +18 -0
- gbcms/io/input.py +227 -0
- gbcms/io/output.py +354 -0
- gbcms/models/__init__.py +27 -0
- gbcms/models/core.py +172 -0
- gbcms/pipeline.py +257 -0
- gbcms/py.typed +0 -0
- gbcms/utils/__init__.py +14 -0
- gbcms/utils/logging.py +123 -0
- py_gbcms-2.2.0.dist-info/METADATA +217 -0
- py_gbcms-2.2.0.dist-info/RECORD +20 -0
- py_gbcms-2.2.0.dist-info/WHEEL +4 -0
- py_gbcms-2.2.0.dist-info/entry_points.txt +2 -0
- py_gbcms-2.2.0.dist-info/licenses/LICENSE +664 -0
gbcms/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
gbcms (Get Base Counts Multi-Sample) - A tool for counting bases at variant positions.
|
|
3
|
+
|
|
4
|
+
This package provides a command-line interface and Python API for genotyping
|
|
5
|
+
variants in BAM files using a high-performance Rust counting engine.
|
|
6
|
+
|
|
7
|
+
Example usage:
|
|
8
|
+
$ gbcms run -v variants.vcf -b sample.bam -f reference.fa -o output/
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "2.2.0"
|
|
12
|
+
|
|
13
|
+
from .models.core import GbcmsConfig, OutputFormat, Variant, VariantType
|
|
14
|
+
from .pipeline import Pipeline
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"__version__",
|
|
18
|
+
"GbcmsConfig",
|
|
19
|
+
"OutputFormat",
|
|
20
|
+
"Pipeline",
|
|
21
|
+
"Variant",
|
|
22
|
+
"VariantType",
|
|
23
|
+
]
|
|
Binary file
|
gbcms/_rs.pyi
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Type stubs for the Rust extension module (gbcms._rs)
|
|
2
|
+
# This file tells mypy about the types in the native extension
|
|
3
|
+
|
|
4
|
+
class Variant:
    # Typed view of the Variant object exported by the native extension.
    # Only the attribute names/types and the constructor signature are
    # declared here; the behavior lives in the compiled module.
    chrom: str
    pos: int
    ref_allele: str
    alt_allele: str
    variant_type: str

    # All five fields are supplied at construction time, in declaration order.
    def __init__(
        self, chrom: str, pos: int, ref_allele: str, alt_allele: str, variant_type: str
    ) -> None: ...
|
|
19
|
+
|
|
20
|
+
class BaseCounts:
    # Typed view of the per-variant result object returned by count_bam().

    # Identity of the variant the counts belong to.
    chrom: str
    pos: int
    ref: str
    alt: str

    # Read-level counts (presumably total / ref-supporting / alt-supporting
    # depth -- confirm against the Rust implementation).
    dp: int
    rd: int
    ad: int

    # Ref/alt counts split by strand (fwd / rev).
    rd_fwd: int
    rd_rev: int
    ad_fwd: int
    ad_rev: int

    # Fragment-level counterparts of dp / rd / ad.
    dp_fragment: int
    rd_fragment: int
    ad_fragment: int

    # Presumably a strand-bias p-value -- confirm against the Rust implementation.
    sb_pvalue: float
|
|
36
|
+
|
|
37
|
+
# Entry point of the native counting engine: takes one BAM path plus the
# variants to count and returns one BaseCounts per result.
def count_bam(
    bam_path: str,
    variants: list[Variant],
    # Quality thresholds
    min_mapq: int = 20,
    min_baseq: int = 0,
    # Read filters (only duplicate filtering is on by default)
    filter_duplicates: bool = True,
    filter_secondary: bool = False,
    filter_supplementary: bool = False,
    filter_qc_failed: bool = False,
    filter_improper_pair: bool = False,
    filter_indel: bool = False,
    # Parallelism
    threads: int = 1,
) -> list[BaseCounts]: ...
|
gbcms/cli.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI Entry Point: Exposes the gbcms functionality via command line.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
|
|
10
|
+
from .models.core import (
|
|
11
|
+
GbcmsConfig,
|
|
12
|
+
OutputConfig,
|
|
13
|
+
OutputFormat,
|
|
14
|
+
QualityThresholds,
|
|
15
|
+
ReadFilters,
|
|
16
|
+
)
|
|
17
|
+
from .pipeline import Pipeline
|
|
18
|
+
from .utils import setup_logging
|
|
19
|
+
|
|
20
|
+
__all__ = ["app", "run"]
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
app = typer.Typer(help="gbcms: Get Base Counts Multi-Sample")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.callback()
def main():
    """
    gbcms: Get Base Counts Multi-Sample
    """
    # Intentionally empty: the callback only anchors the top-level command
    # group so that `run` is exposed as a named sub-command.
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@app.command()
def run(
    # Input options
    variant_file: Path = typer.Option(
        ..., "--variants", "-v", help="Path to VCF or MAF file containing variants"
    ),
    bam_files: list[Path] | None = typer.Option(
        None, "--bam", "-b", help="Path to BAM file(s). Can be specified multiple times."
    ),
    bam_list: Path | None = typer.Option(
        None, "--bam-list", "-L", help="File containing list of BAM paths (one per line)"
    ),
    reference: Path = typer.Option(..., "--fasta", "-f", help="Path to reference FASTA file"),
    # Output options
    output_dir: Path = typer.Option(
        ..., "--output-dir", "-o", help="Directory to write output files"
    ),
    output_format: OutputFormat = typer.Option(
        OutputFormat.VCF, "--format", help="Output format (vcf or maf)"
    ),
    output_suffix: str = typer.Option(
        "", "--suffix", "-S", help="Suffix to append to output filename (e.g. '.genotyped')"
    ),
    # Quality thresholds
    min_mapq: int = typer.Option(20, "--min-mapq", help="Minimum mapping quality"),
    min_baseq: int = typer.Option(0, "--min-baseq", help="Minimum base quality"),
    # Read filters
    filter_duplicates: bool = typer.Option(True, help="Filter duplicate reads"),
    filter_secondary: bool = typer.Option(False, help="Filter secondary alignments"),
    filter_supplementary: bool = typer.Option(False, help="Filter supplementary alignments"),
    filter_qc_failed: bool = typer.Option(False, help="Filter reads failing QC"),
    filter_improper_pair: bool = typer.Option(False, help="Filter improperly paired reads"),
    filter_indel: bool = typer.Option(False, help="Filter reads containing indels"),
    # Performance
    threads: int = typer.Option(
        1, "--threads", "-t", help="Number of threads for parallel processing"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-V", help="Enable verbose debug logging"),
):
    """
    Run gbcms on one or more BAM files.
    """
    # Logging must be configured before anything below emits a message.
    setup_logging(verbose=verbose)

    # Merge --bam and --bam-list into a single sample-name -> path mapping;
    # with nothing usable there is no work to do, so exit non-zero.
    sample_to_bam = _parse_bam_inputs(bam_files, bam_list)
    if not sample_to_bam:
        logger.error("No valid BAM files provided via --bam or --bam-list")
        raise typer.Exit(code=1)

    logger.info("Found %d BAM file(s) to process", len(sample_to_bam))

    try:
        # Assemble the nested configuration (output, quality, filters) and
        # hand it to the pipeline. Model construction sits inside the try so
        # that configuration errors are reported the same way as run-time ones.
        Pipeline(
            GbcmsConfig(
                variant_file=variant_file,
                bam_files=sample_to_bam,
                reference_fasta=reference,
                output=OutputConfig(
                    directory=output_dir,
                    format=output_format,
                    suffix=output_suffix,
                ),
                quality=QualityThresholds(
                    min_mapping_quality=min_mapq,
                    min_base_quality=min_baseq,
                ),
                filters=ReadFilters(
                    duplicates=filter_duplicates,
                    secondary=filter_secondary,
                    supplementary=filter_supplementary,
                    qc_failed=filter_qc_failed,
                    improper_pair=filter_improper_pair,
                    indel=filter_indel,
                ),
                threads=threads,
            )
        ).run()
    except Exception as e:
        # CLI boundary: log the traceback and convert any failure into exit code 1.
        logger.exception("Pipeline failed: %s", e)
        raise typer.Exit(code=1) from e
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _parse_bam_inputs(bam_files: list[Path] | None, bam_list: Path | None) -> dict[str, Path]:
    """
    Parse BAM inputs from direct arguments and/or BAM list file.

    Paths that do not exist are logged and skipped rather than raising, so the
    caller decides what to do when the returned mapping is empty. When the same
    sample name is seen more than once the later entry wins (entries from the
    list file are processed after direct arguments) and a warning is logged so
    the replacement is not silent.

    Args:
        bam_files: List of BAM paths (optionally with sample_id:path format).
        bam_list: Path to file containing BAM paths (one per line). Blank lines
            and lines starting with '#' are ignored. A line with two or more
            whitespace-separated fields is read as "<sample_name> <path>";
            a single field is a path whose stem becomes the sample name.

    Returns:
        Dictionary mapping sample names to BAM paths.
    """
    bams_dict: dict[str, Path] = {}

    def _register(sample_name: str, bam_path: Path) -> None:
        # Last entry wins, but make the collision visible to the user.
        previous = bams_dict.get(sample_name)
        if previous is not None and previous != bam_path:
            logger.warning(
                "Duplicate sample name '%s': replacing %s with %s",
                sample_name,
                previous,
                bam_path,
            )
        bams_dict[sample_name] = bam_path

    # 1. Process direct BAM arguments
    for bam_arg in bam_files or []:
        sample_name, bam_path = _parse_bam_arg(bam_arg)

        if not bam_path.exists():
            logger.error("BAM file not found: %s", bam_path)
            continue

        _register(sample_name, bam_path)

    # 2. Process BAM list file
    if not bam_list:
        return bams_dict

    if not bam_list.exists():
        logger.error("BAM list file not found: %s", bam_list)
        return bams_dict

    try:
        with open(bam_list) as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue

                parts = line.split()
                if len(parts) >= 2:
                    sample_name = parts[0]
                    bam_path = Path(parts[1])
                else:
                    bam_path = Path(parts[0])
                    sample_name = bam_path.stem

                if not bam_path.exists():
                    logger.warning("BAM file from list not found: %s", bam_path)
                    continue

                _register(sample_name, bam_path)

    except (OSError, UnicodeError) as e:
        # Unreadable or undecodable list file: keep whatever was collected so far.
        logger.error("Error reading BAM list file %s: %s", bam_list, e)

    return bams_dict
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _parse_bam_arg(bam_arg: Path) -> tuple[str, Path]:
|
|
187
|
+
"""
|
|
188
|
+
Parse a BAM argument that may be in sample_id:path format.
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
bam_arg: Path object (may contain sample_id:path as string).
|
|
192
|
+
|
|
193
|
+
Returns:
|
|
194
|
+
Tuple of (sample_name, bam_path).
|
|
195
|
+
"""
|
|
196
|
+
bam_str = str(bam_arg)
|
|
197
|
+
if ":" in bam_str:
|
|
198
|
+
parts = bam_str.split(":", 1)
|
|
199
|
+
return parts[0], Path(parts[1])
|
|
200
|
+
return bam_arg.stem, bam_arg
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
if __name__ == "__main__":
|
|
204
|
+
app()
|
gbcms/core/__init__.py
ADDED
gbcms/core/kernel.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Coordinate Kernel: The source of truth for genomic coordinate systems.
|
|
3
|
+
|
|
4
|
+
Handles conversion between:
|
|
5
|
+
- VCF (1-based)
|
|
6
|
+
- MAF (1-based)
|
|
7
|
+
- Internal (0-based, half-open [start, end))
|
|
8
|
+
|
|
9
|
+
Ensures consistent representation of variants:
|
|
10
|
+
- SNPs: 0-based index of the base.
|
|
11
|
+
- Insertions: 0-based index of the ANCHOR base (preceding the insertion).
|
|
12
|
+
- Deletions: 0-based index of the ANCHOR base (preceding the deletion).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from gbcms.models.core import Variant, VariantType
|
|
16
|
+
|
|
17
|
+
__all__ = ["CoordinateKernel"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CoordinateKernel:
    """
    Collection of static helpers that translate VCF/MAF records into the
    internal 0-based Variant representation. Holds no state.
    """

    @staticmethod
    def vcf_to_internal(
        chrom: str, pos: int, ref: str, alt: str, original_id: str | None = None
    ) -> Variant:
        """
        Build an internal Variant from VCF-style fields.

        The variant type is decided purely from the allele lengths:
        1/1 is a SNP, 1/>1 an insertion, >1/1 a deletion, anything else
        is complex. In every case the stored position is ``pos - 1``: for
        SNPs that is the base itself, for insertions/deletions the anchor
        base preceding the event (VCF POS already points at the anchor),
        and for complex variants the first reference base.

        Args:
            chrom: Chromosome name (a leading 'chr' is stripped)
            pos: 1-based position from VCF
            ref: Reference allele
            alt: Alternate allele
            original_id: Optional VCF ID

        Returns:
            Normalized Variant object
        """
        contig = CoordinateKernel.normalize_chromosome(chrom)

        ref_len = len(ref)
        alt_len = len(alt)
        if ref_len == 1 and alt_len == 1:
            kind = VariantType.SNP
        elif ref_len == 1 and alt_len > 1:
            # e.g. POS=10, REF=A, ALT=AT: T inserted after the anchor A.
            kind = VariantType.INSERTION
        elif ref_len > 1 and alt_len == 1:
            # e.g. POS=10, REF=AT, ALT=A: T deleted after the anchor A.
            kind = VariantType.DELETION
        else:
            kind = VariantType.COMPLEX

        return Variant(
            chrom=contig,
            pos=pos - 1,  # 1-based 10 -> 0-based 9
            ref=ref,
            alt=alt,
            variant_type=kind,
            original_id=original_id,
        )

    @staticmethod
    def maf_to_internal(chrom: str, start_pos: int, end_pos: int, ref: str, alt: str) -> Variant:
        """
        Build an internal Variant from MAF-style fields (1-based inclusive).

        A '-' reference allele marks an insertion and a '-' alternate allele a
        deletion; otherwise single-base alleles are a SNP and everything else
        is complex. The alleles are stored as given (no anchor base is added)
        and the position is always ``start_pos - 1``.

        NOTE: for MAF deletions Start_Position is usually the first deleted
        base rather than the anchor, so the result is not anchor-based here;
        converting it needs a reference lookup, which this method does not do.
        ``end_pos`` is accepted for interface symmetry but is not used.
        """
        contig = CoordinateKernel.normalize_chromosome(chrom)

        if ref == "-":
            kind = VariantType.INSERTION
        elif alt == "-":
            kind = VariantType.DELETION
        elif len(ref) == len(alt) == 1:
            kind = VariantType.SNP
        else:
            kind = VariantType.COMPLEX

        return Variant(
            chrom=contig, pos=start_pos - 1, ref=ref, alt=alt, variant_type=kind
        )

    @staticmethod
    def normalize_chromosome(chrom: str) -> str:
        """
        Return the chromosome name without a leading 'chr' (matched
        case-insensitively); names without that prefix are returned unchanged.
        """
        has_prefix = chrom.lower().startswith("chr")
        return chrom[3:] if has_prefix else chrom
|
gbcms/io/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
I/O module for gbcms.
|
|
3
|
+
|
|
4
|
+
Provides readers and writers for variant files (VCF, MAF format).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .input import MafReader, ReferenceChecker, VariantReader, VcfReader
|
|
8
|
+
from .output import MafWriter, OutputWriter, VcfWriter
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"MafReader",
|
|
12
|
+
"MafWriter",
|
|
13
|
+
"OutputWriter",
|
|
14
|
+
"ReferenceChecker",
|
|
15
|
+
"VariantReader",
|
|
16
|
+
"VcfReader",
|
|
17
|
+
"VcfWriter",
|
|
18
|
+
]
|
gbcms/io/input.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Input Adapters: Handling VCF and MAF inputs.
|
|
3
|
+
|
|
4
|
+
This module provides classes to read variants from VCF and MAF files,
|
|
5
|
+
converting them into the internal normalized representation using CoordinateKernel.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import csv
|
|
9
|
+
import logging
|
|
10
|
+
from collections.abc import Iterator
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import pysam
|
|
14
|
+
from pydantic import ValidationError
|
|
15
|
+
|
|
16
|
+
from ..core.kernel import CoordinateKernel
|
|
17
|
+
from ..models.core import Variant
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
__all__ = ["VariantReader", "VcfReader", "MafReader", "ReferenceChecker"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class VariantReader:
    """Common base for variant readers; subclasses provide the iteration."""

    def __iter__(self) -> Iterator[Variant]:
        # No default implementation: concrete readers must override this.
        raise NotImplementedError
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class VcfReader(VariantReader):
    """
    Reads variants from a VCF file.

    The file is opened with pysam in the constructor and stays open until
    close() is called. Iteration yields one internal Variant per ALT allele,
    so a multi-allelic record produces several variants.
    """

    def __init__(self, path: Path):
        self.path = path
        self._vcf = pysam.VariantFile(str(path))

    def __iter__(self) -> Iterator[Variant]:
        for record in self._vcf:
            # A record without REF cannot be converted; skip it once instead
            # of re-checking for every ALT allele.
            if not record.ref:
                continue

            # record.alts is None when the record has no ALT allele.
            for alt in record.alts or ():
                # pysam's record.pos is the 1-based VCF POS (record.start is
                # the 0-based equivalent), which is exactly what
                # CoordinateKernel.vcf_to_internal expects.
                yield CoordinateKernel.vcf_to_internal(
                    chrom=record.chrom,
                    pos=record.pos,
                    ref=record.ref,
                    alt=alt,
                    original_id=record.id,
                )

    def close(self):
        """Close the underlying pysam VariantFile."""
        self._vcf.close()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class MafReader(VariantReader):
    """
    Reads variants from a MAF file.

    When a reference FASTA is supplied, MAF indels (which use '-' for the
    missing allele) are rewritten into VCF-style anchor-based alleles by
    looking up the base preceding the event. Without a FASTA, indels and other
    non-SNP rows fall back to CoordinateKernel.maf_to_internal. Rows that
    cannot be converted are skipped with a warning instead of aborting.
    """

    def __init__(self, path: Path, fasta_path: Path | None = None):
        self.path = path
        self.fasta = pysam.FastaFile(str(fasta_path)) if fasta_path else None

    def __iter__(self) -> Iterator[Variant]:
        with open(self.path) as f:
            # Skip leading '#' comment lines, then rewind to the first
            # non-comment line so DictReader sees it as the header.
            while True:
                offset = f.tell()
                if not f.readline().startswith("#"):
                    f.seek(offset)
                    break

            for row in csv.DictReader(f, delimiter="\t"):
                try:
                    variant = self._row_to_variant(row)
                except (KeyError, ValueError, ValidationError) as exc:
                    # Missing column, non-integer position, or model validation failure.
                    logger.warning(
                        "Skipping malformed MAF row (%s:%s): %s",
                        row.get("Chromosome"),
                        row.get("Start_Position"),
                        exc,
                    )
                    continue

                if variant is not None:
                    yield variant

    def _row_to_variant(self, row: dict) -> Variant | None:
        """Convert one MAF row to a Variant; return None if its anchor base cannot be fetched."""
        chrom = row["Chromosome"]
        start_pos = int(row["Start_Position"])
        ref = row["Reference_Allele"]
        alt = row["Tumor_Seq_Allele2"]  # Standard MAF alt column

        is_indel = ref == "-" or alt == "-"

        if self.fasta and is_indel:
            is_insertion = ref == "-"
            # Insertion: Start_Position is already the anchor (base before the
            # inserted sequence). Deletion: Start_Position is the first deleted
            # base, so the anchor is one base earlier.
            anchor_pos_1based = start_pos if is_insertion else start_pos - 1

            anchor_base = self._fetch_anchor_base(chrom, anchor_pos_1based - 1)
            if anchor_base is None:
                logger.warning(
                    "Skipping MAF indel at %s:%d: anchor base not found in reference",
                    chrom,
                    start_pos,
                )
                return None

            # VCF style: POS = anchor; the anchor base is prepended to both alleles.
            if is_insertion:
                vcf_ref, vcf_alt = anchor_base, anchor_base + alt
            else:
                vcf_ref, vcf_alt = anchor_base + ref, anchor_base

            variant = CoordinateKernel.vcf_to_internal(
                chrom=chrom, pos=anchor_pos_1based, ref=vcf_ref, alt=vcf_alt
            )
        elif len(ref) == len(alt) == 1 and not is_indel:
            # For SNPs, MAF Start_Position == VCF POS.
            variant = CoordinateKernel.vcf_to_internal(
                chrom=chrom, pos=start_pos, ref=ref, alt=alt
            )
        else:
            # Complex variants, or indels without a FASTA: no anchor base is
            # available, so the MAF-style alleles are passed through as-is.
            variant = CoordinateKernel.maf_to_internal(
                chrom=chrom,
                start_pos=start_pos,
                end_pos=int(row["End_Position"]),
                ref=ref,
                alt=alt,
            )

        # Keep the full original row so it can be carried alongside the variant.
        return variant.model_copy(update={"metadata": row})

    def _fetch_anchor_base(self, chrom: str, pos_0based: int) -> str | None:
        """Fetch the upper-cased reference base at pos_0based, trying the normalized then the original contig name."""
        for contig in (CoordinateKernel.normalize_chromosome(chrom), chrom):
            try:
                base = self.fasta.fetch(contig, pos_0based, pos_0based + 1).upper()
            except (KeyError, ValueError):
                # Unknown contig name or out-of-range coordinate: try the next spelling.
                continue
            # An empty result cannot serve as an anchor; report it as not found.
            return base or None
        return None

    def close(self):
        """Close the reference FASTA, if one was opened."""
        if self.fasta:
            self.fasta.close()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class ReferenceChecker:
    """
    Utility to check variants against a reference FASTA.
    Ensures that the REF allele matches the genome.
    """

    def __init__(self, fasta_path: Path):
        self.fasta = pysam.FastaFile(str(fasta_path))

    def validate(self, variant: Variant) -> bool:
        """
        Check if variant REF matches reference genome.

        The reference is read over [variant.pos, variant.pos + len(variant.ref))
        (variant.pos is 0-based) and compared case-insensitively with
        variant.ref. The contig is looked up under variant.chrom first and,
        if that fails, with a 'chr' prefix added.

        Returns:
            True when the fetched sequence equals the REF allele; False on a
            mismatch, when neither contig name can be fetched, or on any
            unexpected error (this method never raises).
        """
        try:
            chrom = variant.chrom
            start = variant.pos
            end = start + len(variant.ref)

            last_error = None
            for contig in (chrom, f"chr{chrom}"):
                try:
                    ref_seq = self.fasta.fetch(contig, start, end)
                except (ValueError, KeyError) as e:
                    # Unknown contig or invalid range under this name; try the next one.
                    last_error = e
                    continue
                return ref_seq.upper() == variant.ref.upper()

            logger.debug("Failed to fetch %s and chr%s: %s", chrom, chrom, last_error)
            return False

        except Exception:
            # Best-effort check: report failure rather than propagate, but leave
            # a trace so unexpected errors are not completely invisible.
            logger.debug("Unexpected error while validating variant", exc_info=True)
            return False

    def close(self):
        """Close the underlying FASTA file."""
        self.fasta.close()
|