py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +1 -13
- gbcms/cli.py +134 -716
- gbcms/core/kernel.py +126 -0
- gbcms/io/input.py +222 -0
- gbcms/io/output.py +361 -0
- gbcms/models/core.py +133 -0
- gbcms/pipeline.py +212 -0
- gbcms/py.typed +0 -0
- py_gbcms-2.1.1.dist-info/METADATA +216 -0
- py_gbcms-2.1.1.dist-info/RECORD +13 -0
- gbcms/config.py +0 -98
- gbcms/counter.py +0 -1074
- gbcms/models.py +0 -295
- gbcms/numba_counter.py +0 -394
- gbcms/output.py +0 -573
- gbcms/parallel.py +0 -129
- gbcms/processor.py +0 -293
- gbcms/reference.py +0 -86
- gbcms/variant.py +0 -390
- py_gbcms-2.0.0.dist-info/METADATA +0 -506
- py_gbcms-2.0.0.dist-info/RECORD +0 -16
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/WHEEL +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/entry_points.txt +0 -0
- {py_gbcms-2.0.0.dist-info → py_gbcms-2.1.1.dist-info}/licenses/LICENSE +0 -0
gbcms/pipeline.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline Orchestrator: Manages the execution flow of gbcms.
|
|
3
|
+
|
|
4
|
+
This module handles:
|
|
5
|
+
1. Reading variants from input (VCF/MAF).
|
|
6
|
+
2. Iterating over samples (BAM files).
|
|
7
|
+
3. Running the Rust-based counting engine for each sample.
|
|
8
|
+
4. Writing results to per-sample output files.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import pysam
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.progress import (
|
|
16
|
+
BarColumn,
|
|
17
|
+
Progress,
|
|
18
|
+
SpinnerColumn,
|
|
19
|
+
TaskProgressColumn,
|
|
20
|
+
TextColumn,
|
|
21
|
+
TimeRemainingColumn,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
import gbcms_rs
|
|
25
|
+
|
|
26
|
+
from .core.kernel import CoordinateKernel
|
|
27
|
+
from .io.input import MafReader, ReferenceChecker, VariantReader, VcfReader
|
|
28
|
+
from .io.output import MafWriter, VcfWriter
|
|
29
|
+
from .models.core import GbcmsConfig, OutputFormat, Variant
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Pipeline:
    """Orchestrate a gbcms run.

    The pipeline loads variants from the configured VCF/MAF file, validates
    them against the reference FASTA, hands the surviving variants to the
    Rust counting engine (``gbcms_rs``) once per BAM file, and writes one
    output file per sample.
    """

    def __init__(self, config: GbcmsConfig):
        """Store the run configuration and create the console used for all output."""
        self.config = config
        self.console = Console()

    def run(self):
        """Execute the pipeline.

        Returns early (after printing a message) when no variants are loaded
        or none survive reference validation. A failure while processing one
        sample is reported and does not stop the remaining samples; the final
        status line states whether any sample failed.
        """
        self.console.print("[bold blue]Starting gbcms pipeline[/bold blue]")
        self.console.print(f"Output directory: {self.config.output_dir}")

        # 1. Load Variants
        with self.console.status("[bold green]Loading variants...[/bold green]"):
            variants = self._load_variants()

        self.console.print(f"Loaded [bold]{len(variants)}[/bold] variants.")

        if not variants:
            self.console.print("[bold red]No variants found. Exiting.[/bold red]")
            return

        # 2. Validate Variants against Reference
        with self.console.status(
            "[bold green]Validating variants against reference...[/bold green]"
        ):
            valid_variants = self._validate_variants(variants)

        self.console.print(f"Valid variants: [bold]{len(valid_variants)}[/bold] / {len(variants)}")

        if not valid_variants:
            self.console.print(
                "[bold red]No valid variants remaining after validation. Exiting.[/bold red]"
            )
            return

        variants = valid_variants

        # 3. Prepare Rust Variants
        rs_variants = [
            gbcms_rs.Variant(v.chrom, v.pos, v.ref, v.alt, v.variant_type.value) for v in variants
        ]

        # 4. Process Each Sample
        self.config.output_dir.mkdir(parents=True, exist_ok=True)

        samples = list(self.config.bam_files.items())
        # Names of samples whose counting or writing raised; used for the final status.
        failed_samples = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            console=self.console,
        ) as progress:
            task = progress.add_task("[cyan]Processing samples...", total=len(samples))

            for sample_name, bam_path in samples:
                progress.update(task, description=f"[cyan]Processing {sample_name}...")

                # Validate BAM Header (advisory only: a mismatch is a warning, not an error)
                if not self._validate_bam_header(bam_path, variants):
                    self.console.print(
                        f"[yellow]Warning: BAM {sample_name} may not contain variant chromosomes. Proceeding anyway...[/yellow]"
                    )

                try:
                    # Run Rust Engine
                    counts_list = gbcms_rs.count_bam(
                        str(bam_path),
                        rs_variants,
                        min_mapq=self.config.min_mapping_quality,
                        min_baseq=self.config.min_base_quality,
                        filter_duplicates=self.config.filter_duplicates,
                        filter_secondary=self.config.filter_secondary,
                        filter_supplementary=self.config.filter_supplementary,
                        filter_qc_failed=self.config.filter_qc_failed,
                        filter_improper_pair=self.config.filter_improper_pair,
                        filter_indel=self.config.filter_indel,
                        threads=self.config.threads,
                    )

                    # Write Output
                    self._write_output(sample_name, variants, counts_list)

                except Exception as e:
                    # Best effort: record the failure and continue to the next sample.
                    failed_samples.append(sample_name)
                    self.console.print(
                        f"[bold red]Error processing sample {sample_name}: {e}[/bold red]"
                    )

                progress.advance(task)

        if failed_samples:
            # Do not claim success when one or more samples were skipped above.
            failed_names = ", ".join(str(name) for name in failed_samples)
            self.console.print(
                f"[bold red]Pipeline completed with errors: {len(failed_samples)} of "
                f"{len(samples)} sample(s) failed ({failed_names}).[/bold red]"
            )
        else:
            self.console.print("[bold green]Pipeline completed successfully.[/bold green]")

    def _load_variants(self) -> list[Variant]:
        """Load all variants from ``config.variant_file``, choosing the reader by extension.

        ``.vcf`` and gzip-compressed files go to ``VcfReader``; ``.maf`` goes to
        ``MafReader``. A ``.maf.gz`` file is rejected explicitly instead of being
        passed to the VCF reader.

        Raises:
            ValueError: If the file extension is not a supported variant format.
        """
        path = self.config.variant_file
        reader: VariantReader

        suffixes = [s.lower() for s in path.suffixes]
        ext = suffixes[-1] if suffixes else ""
        inner_ext = suffixes[-2] if len(suffixes) >= 2 else ""

        if ext == ".gz" and inner_ext == ".maf":
            # Previously any ".gz" was treated as VCF, so a gzipped MAF was misrouted.
            # NOTE(review): MafReader's gzip support is not visible here, so reject
            # rather than guess.
            raise ValueError(f"Unsupported variant file format: {inner_ext}{ext}")

        if ext in (".vcf", ".gz"):  # .vcf.gz handled by pysam
            reader = VcfReader(path)
        elif ext == ".maf":
            reader = MafReader(path, fasta_path=self.config.reference_fasta)
        else:
            raise ValueError(f"Unsupported variant file format: {path.suffix}")

        try:
            return list(reader)
        finally:
            # Release the reader even if iteration fails part-way through.
            if hasattr(reader, "close"):
                reader.close()

    def _validate_variants(self, variants: list[Variant]) -> list[Variant]:
        """Return the variants that pass ``ReferenceChecker.validate``.

        The first five rejected variants are printed individually; any further
        rejections are summarised in a single line.
        """
        checker = ReferenceChecker(self.config.reference_fasta)
        valid_variants = []
        invalid_count = 0

        try:
            for v in variants:
                if checker.validate(v):
                    valid_variants.append(v)
                    continue

                invalid_count += 1
                if invalid_count <= 5:  # Log first few failures
                    self.console.print(
                        f"[yellow]Invalid variant (REF mismatch): {v.chrom}:{v.pos} {v.ref}>{v.alt}[/yellow]"
                    )
        finally:
            # Close the checker even when validation raises.
            checker.close()

        if invalid_count > 5:
            self.console.print(
                f"[yellow]... and {invalid_count - 5} more invalid variants.[/yellow]"
            )

        return valid_variants

    def _validate_bam_header(self, bam_path: Path, variants: list[Variant]) -> bool:
        """Heuristically check that the BAM header covers the variant chromosomes.

        Only the first variant's chromosome is checked. Both sides are passed
        through ``CoordinateKernel.normalize_chromosome`` so that a ``chr``
        prefix difference (BAM ``chr1`` vs variant ``1``, or vice versa) is not
        reported as a mismatch.

        Returns:
            ``False`` only when the first variant's chromosome is absent from the
            header. ``True`` otherwise, including when the header cannot be read
            (a warning is printed and the BAM is assumed to be fine).
        """
        try:
            with pysam.AlignmentFile(str(bam_path), "rb") as bam:
                bam_chroms = set(bam.references)

                # Normalize BAM chroms
                norm_bam_chroms = {CoordinateKernel.normalize_chromosome(c) for c in bam_chroms}

                # Check first variant as a heuristic
                if variants:
                    v = variants[0]
                    norm_v_chrom = CoordinateKernel.normalize_chromosome(v.chrom)
                    if norm_v_chrom not in norm_bam_chroms:
                        return False
                return True
        except Exception as e:
            self.console.print(f"[yellow]Could not validate BAM header: {e}[/yellow]")
            return True  # Assume ok if we can't check

    def _write_output(
        self, sample_name: str, variants: list[Variant], counts_list: list[gbcms_rs.BaseCounts]
    ):
        """Write one sample's counts to ``<output_dir>/<sample_name><suffix>.<vcf|maf>``.

        ``variants`` and ``counts_list`` are paired positionally and must have
        the same length (``zip(..., strict=True)`` raises ``ValueError`` otherwise).
        """
        ext = "vcf" if self.config.output_format == OutputFormat.VCF else "maf"
        suffix = self.config.output_suffix
        output_path = self.config.output_dir / f"{sample_name}{suffix}.{ext}"

        writer: VcfWriter | MafWriter
        if self.config.output_format == OutputFormat.VCF:
            writer = VcfWriter(output_path, sample_name=sample_name)
        else:
            writer = MafWriter(output_path)

        try:
            for v, counts in zip(variants, counts_list, strict=True):
                writer.write(v, counts, sample_name=sample_name)
        finally:
            # Always close the writer so the handle is released even if a write fails.
            writer.close()
|
gbcms/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-gbcms
|
|
3
|
+
Version: 2.1.1
|
|
4
|
+
Summary: Python implementation of GetBaseCountsMultiSample (gbcms) for calculating base counts in BAM files
|
|
5
|
+
Project-URL: Homepage, https://github.com/msk-access/py-gbcms
|
|
6
|
+
Project-URL: Repository, https://github.com/msk-access/py-gbcms
|
|
7
|
+
Project-URL: Documentation, https://github.com/msk-access/py-gbcms#readme
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/msk-access/py-gbcms/issues
|
|
9
|
+
Author-email: MSK-ACCESS <shahr2@mskcc.org>
|
|
10
|
+
License: AGPL-3.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: bam,base-counts,bioinformatics,gbcms,genomics,maf,vcf
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: pydantic>=2.0.0
|
|
21
|
+
Requires-Dist: pysam>=0.21.0
|
|
22
|
+
Requires-Dist: rich>=13.0.0
|
|
23
|
+
Requires-Dist: typer>=0.9.0
|
|
24
|
+
Provides-Extra: all
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: mkdocs-material>=9.0.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: mypy>=1.5.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest-mock>=3.11.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=7.4.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: types-pyyaml>=6.0.0; extra == 'dev'
|
|
34
|
+
Provides-Extra: fast
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# py-gbcms
|
|
38
|
+
|
|
39
|
+
**Complete orientation-aware counting system for genomic variants**
|
|
40
|
+
|
|
41
|
+
[CI status](https://github.com/msk-access/py-gbcms/actions)
|
|
42
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
43
|
+
|
|
44
|
+
## Features
|
|
45
|
+
|
|
46
|
+
- 🚀 **High Performance**: Rust-powered core engine with multi-threading
|
|
47
|
+
- 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
|
|
48
|
+
- 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
|
|
49
|
+
- 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
|
|
50
|
+
- 📁 **Flexible I/O**: VCF and MAF input/output formats
|
|
51
|
+
- 🎯 **Quality Filters**: 7 configurable read filtering options
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
**Quick install:**
|
|
56
|
+
```bash
|
|
57
|
+
pip install py-gbcms
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**From source (requires Rust):**
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/msk-access/py-gbcms.git
|
|
63
|
+
cd py-gbcms
|
|
64
|
+
pip install .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Docker:**
|
|
68
|
+
```bash
|
|
69
|
+
docker pull ghcr.io/msk-access/py-gbcms:2.1.0
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
`py-gbcms` can be used in two ways:
|
|
79
|
+
|
|
80
|
+
### 🔧 Option 1: Standalone CLI (1-10 samples)
|
|
81
|
+
|
|
82
|
+
**Best for:** Quick analysis, local processing, direct control
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
gbcms run \
|
|
86
|
+
--variants variants.vcf \
|
|
87
|
+
--bam sample1.bam \
|
|
88
|
+
--fasta reference.fa \
|
|
89
|
+
--output-dir results/
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Output:** `results/sample1.vcf`
|
|
93
|
+
|
|
94
|
+
**Learn more:**
|
|
95
|
+
- 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
|
|
96
|
+
- 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
|
|
101
|
+
|
|
102
|
+
**Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
nextflow run nextflow/main.nf \
|
|
106
|
+
--input samplesheet.csv \
|
|
107
|
+
--variants variants.vcf \
|
|
108
|
+
--fasta reference.fa \
|
|
109
|
+
-profile slurm
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Features:**
|
|
113
|
+
- ✅ Automatic parallelization across samples
|
|
114
|
+
- ✅ SLURM/HPC integration
|
|
115
|
+
- ✅ Container support (Docker/Singularity)
|
|
116
|
+
- ✅ Resume failed runs
|
|
117
|
+
|
|
118
|
+
**Learn more:**
|
|
119
|
+
- 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
|
|
120
|
+
- 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Which Should I Use?
|
|
125
|
+
|
|
126
|
+
| Scenario | Recommendation |
|
|
127
|
+
|----------|----------------|
|
|
128
|
+
| 1-10 samples, local machine | **CLI** |
|
|
129
|
+
| 10+ samples, HPC cluster | **Nextflow** |
|
|
130
|
+
| Quick ad-hoc analysis | **CLI** |
|
|
131
|
+
| Production pipeline | **Nextflow** |
|
|
132
|
+
| Need auto-parallelization | **Nextflow** |
|
|
133
|
+
| Full manual control | **CLI** |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Quick Examples
|
|
138
|
+
|
|
139
|
+
### CLI: Single Sample
|
|
140
|
+
```bash
|
|
141
|
+
gbcms run \
|
|
142
|
+
--variants variants.vcf \
|
|
143
|
+
--bam tumor.bam \
|
|
144
|
+
--fasta hg19.fa \
|
|
145
|
+
--output-dir results/ \
|
|
146
|
+
--threads 4
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### CLI: Multiple Samples (Sequential)
|
|
150
|
+
```bash
|
|
151
|
+
gbcms run \
|
|
152
|
+
--variants variants.vcf \
|
|
153
|
+
--bam-list samples.txt \
|
|
154
|
+
--fasta hg19.fa \
|
|
155
|
+
--output-dir results/
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Nextflow: Many Samples (Parallel)
|
|
159
|
+
```bash
|
|
160
|
+
# samplesheet.csv:
|
|
161
|
+
# sample,bam,bai
|
|
162
|
+
# tumor1,/path/to/tumor1.bam,
|
|
163
|
+
# tumor2,/path/to/tumor2.bam,
|
|
164
|
+
|
|
165
|
+
nextflow run nextflow/main.nf \
|
|
166
|
+
--input samplesheet.csv \
|
|
167
|
+
--variants variants.vcf \
|
|
168
|
+
--fasta hg19.fa \
|
|
169
|
+
--outdir results \
|
|
170
|
+
-profile slurm
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Documentation
|
|
176
|
+
|
|
177
|
+
📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
|
|
178
|
+
|
|
179
|
+
**Quick Links:**
|
|
180
|
+
- [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
|
|
181
|
+
- [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
|
|
182
|
+
- [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
|
|
183
|
+
- [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
|
|
184
|
+
- [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
|
|
185
|
+
- [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Contributing
|
|
190
|
+
|
|
191
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
|
|
192
|
+
|
|
193
|
+
To contribute to documentation, see the [`gh-pages` branch](https://github.com/msk-access/py-gbcms/tree/gh-pages).
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Citation
|
|
198
|
+
|
|
199
|
+
If you use `py-gbcms` in your research, please cite:
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
[Citation to be added]
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
AGPL-3.0 - see [LICENSE](LICENSE) for details.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Support
|
|
214
|
+
|
|
215
|
+
- 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
|
|
216
|
+
- 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
gbcms/__init__.py,sha256=zPJIgPGcoSNiD0qme18OnYJYE3A9VVytlhO-V5DaAW0,22
|
|
2
|
+
gbcms/cli.py,sha256=P7ZhQBbsXJ88E2yqhJt_cu3xavTs1m2Mr2HKqZNp3Yc,5709
|
|
3
|
+
gbcms/pipeline.py,sha256=ebPReb_MfdsiSXqxNGd8Q-dJUzKY2SeaxLSctZUHW54,7832
|
|
4
|
+
gbcms/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
gbcms/core/kernel.py,sha256=Sl53XS4uwUDHJyerbYWC6FfKhm14q494IlwON2c62vk,4665
|
|
6
|
+
gbcms/io/input.py,sha256=VBBqhTlVF_S2vxEUXnqJ1sSGG3u_XthQB1VpoFPZxIU,8982
|
|
7
|
+
gbcms/io/output.py,sha256=3nwIXD_d6f4pbYNVr3hxeNKFkeZLGkv_qQC8g7bxjMk,12603
|
|
8
|
+
gbcms/models/core.py,sha256=klUbEFJVMmug94R8nnB9IMpLP4Q0CqGXc1BFS18LWzM,3788
|
|
9
|
+
py_gbcms-2.1.1.dist-info/METADATA,sha256=LzEXVLw8njg-IOt5mxv3pT__aVWATnoDUmYyO7eKf6w,5826
|
|
10
|
+
py_gbcms-2.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
11
|
+
py_gbcms-2.1.1.dist-info/entry_points.txt,sha256=AAg3yd8-c7jlb-FDGiFJXSNFVAhqO44zMLJQVFv8oWQ,40
|
|
12
|
+
py_gbcms-2.1.1.dist-info/licenses/LICENSE,sha256=5vLuih3k9yufKSXoR5qVWOhALHC8WXbSXjrOo9ZK3cs,34797
|
|
13
|
+
py_gbcms-2.1.1.dist-info/RECORD,,
|
gbcms/config.py
DELETED
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
"""Configuration classes and enums for GetBaseCounts."""
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
from enum import IntEnum
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class CountType(IntEnum):
    """Enumeration of the count categories reported per variant.

    Members are consecutive integers starting at 0. They fall into three
    groups of total (DP*), reference (RD*) and alternate (AD*) depth:
    overall, positive strand (``P`` suffix) and fragment level (``F`` suffix).
    """

    DP = 0  # Total depth
    RD = 1  # Reference depth
    AD = 2  # Alternate depth
    DPP = 3  # Positive strand depth
    RDP = 4  # Positive strand reference depth
    ADP = 5  # Positive strand alternate depth
    DPF = 6  # Fragment depth
    RDF = 7  # Fragment reference depth
    ADF = 8  # Fragment alternate depth
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@dataclass
class Config:
    """Configuration for base counting.

    All inputs are validated in ``__post_init__``: the reference FASTA and its
    ``.fai`` index, every BAM file and its ``.bai`` index, and every variant
    file must exist on disk, and the option flags must be consistent.

    Raises:
        FileNotFoundError: If a required input file or index is missing.
        ValueError: If the option flags or numeric limits are invalid.
    """

    fasta_file: str
    bam_files: dict[str, str]  # sample_name -> bam_path
    variant_files: list[str]
    output_file: str

    # Optional parameters
    mapping_quality_threshold: int = 20
    base_quality_threshold: int = 0
    filter_duplicate: bool = True
    filter_improper_pair: bool = False
    filter_qc_failed: bool = False
    filter_indel: bool = False
    filter_non_primary: bool = False
    output_positive_count: bool = True
    output_negative_count: bool = False
    output_fragment_count: bool = False
    fragment_fractional_weight: bool = False
    max_block_size: int = 10000
    max_block_dist: int = 100000
    num_threads: int = 1
    backend: str = "joblib"  # Parallelization backend
    input_is_maf: bool = False
    input_is_vcf: bool = False
    output_maf: bool = False
    generic_counting: bool = False
    max_warning_per_type: int = 3

    def __post_init__(self) -> None:
        """Validate configuration."""
        if not os.path.exists(self.fasta_file):
            raise FileNotFoundError(f"Reference FASTA file not found: {self.fasta_file}")

        fai_file = f"{self.fasta_file}.fai"
        if not os.path.exists(fai_file):
            raise FileNotFoundError(
                f"Reference FASTA index not found: {fai_file}. "
                f"Please index with: samtools faidx {self.fasta_file}"
            )

        for sample, bam_path in self.bam_files.items():
            if not os.path.exists(bam_path):
                raise FileNotFoundError(f"BAM file not found for sample {sample}: {bam_path}")

            # Check for BAM index: either "<path>.bai" or, for a ".bam" file, the
            # sibling "<stem>.bai". Only the final extension is swapped; a plain
            # str.replace would also rewrite ".bam" inside directory names and
            # would return the BAM path itself when ".bam" is absent.
            index_candidates = [f"{bam_path}.bai"]
            bam_root, bam_ext = os.path.splitext(bam_path)
            if bam_ext.lower() == ".bam":
                index_candidates.append(f"{bam_root}.bai")

            if not any(os.path.exists(candidate) for candidate in index_candidates):
                raise FileNotFoundError(
                    f"BAM index not found for {bam_path}. "
                    f"Please index with: samtools index {bam_path}"
                )

        for variant_file in self.variant_files:
            if not os.path.exists(variant_file):
                raise FileNotFoundError(f"Variant file not found: {variant_file}")

        if self.input_is_maf and self.input_is_vcf:
            raise ValueError("--maf and --vcf are mutually exclusive")

        if not self.input_is_maf and not self.input_is_vcf:
            raise ValueError("Either --maf or --vcf must be specified")

        if self.input_is_vcf and self.output_maf:
            raise ValueError("--omaf can only be used with --maf input")

        if self.num_threads < 1:
            raise ValueError("Number of threads must be at least 1")

        if self.max_block_size < 1:
            raise ValueError("max_block_size must be at least 1")

        if self.max_block_dist < 1:
            raise ValueError("max_block_dist must be at least 1")
|