py-gbcms 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gbcms/pipeline.py ADDED
@@ -0,0 +1,212 @@
1
+ """
2
+ Pipeline Orchestrator: Manages the execution flow of gbcms.
3
+
4
+ This module handles:
5
+ 1. Reading variants from input (VCF/MAF).
6
+ 2. Iterating over samples (BAM files).
7
+ 3. Running the Rust-based counting engine for each sample.
8
+ 4. Writing results to per-sample output files.
9
+ """
10
+
11
+ from pathlib import Path
12
+
13
+ import pysam
14
+ from rich.console import Console
15
+ from rich.progress import (
16
+ BarColumn,
17
+ Progress,
18
+ SpinnerColumn,
19
+ TaskProgressColumn,
20
+ TextColumn,
21
+ TimeRemainingColumn,
22
+ )
23
+
24
+ import gbcms_rs
25
+
26
+ from .core.kernel import CoordinateKernel
27
+ from .io.input import MafReader, ReferenceChecker, VariantReader, VcfReader
28
+ from .io.output import MafWriter, VcfWriter
29
+ from .models.core import GbcmsConfig, OutputFormat, Variant
30
+
31
+
32
class Pipeline:
    """Orchestrate a gbcms run.

    The pipeline loads variants from the configured variant file, validates
    them against the reference FASTA, runs the Rust counting engine once per
    BAM file, and writes one output file per sample.
    """

    def __init__(self, config: GbcmsConfig):
        """Store the run configuration and create the console used for all output."""
        self.config = config
        self.console = Console()

    def run(self):
        """Execute the pipeline.

        Returns early (after printing a message) when no variants are loaded
        or when none survive reference validation.  A failure while counting
        or writing one sample is reported and does not stop the remaining
        samples; the closing message states whether any sample failed.
        """
        self.console.print("[bold blue]Starting gbcms pipeline[/bold blue]")
        self.console.print(f"Output directory: {self.config.output_dir}")

        # 1. Load Variants
        with self.console.status("[bold green]Loading variants...[/bold green]"):
            variants = self._load_variants()

        self.console.print(f"Loaded [bold]{len(variants)}[/bold] variants.")

        if not variants:
            self.console.print("[bold red]No variants found. Exiting.[/bold red]")
            return

        # 2. Validate Variants against Reference
        with self.console.status(
            "[bold green]Validating variants against reference...[/bold green]"
        ):
            valid_variants = self._validate_variants(variants)

        self.console.print(f"Valid variants: [bold]{len(valid_variants)}[/bold] / {len(variants)}")

        if not valid_variants:
            self.console.print(
                "[bold red]No valid variants remaining after validation. Exiting.[/bold red]"
            )
            return

        variants = valid_variants

        # 3. Prepare Rust Variants (built once and reused for every sample)
        rs_variants = [
            gbcms_rs.Variant(v.chrom, v.pos, v.ref, v.alt, v.variant_type.value) for v in variants
        ]

        # 4. Process Each Sample
        self.config.output_dir.mkdir(parents=True, exist_ok=True)

        samples = list(self.config.bam_files.items())
        # Samples whose counting or writing raised; used for the final summary.
        failed_samples = []

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            console=self.console,
        ) as progress:
            task = progress.add_task("[cyan]Processing samples...", total=len(samples))

            for sample_name, bam_path in samples:
                progress.update(task, description=f"[cyan]Processing {sample_name}...")

                # Validate BAM Header (advisory only: a mismatch is a warning, not an error)
                if not self._validate_bam_header(bam_path, variants):
                    self.console.print(
                        f"[yellow]Warning: BAM {sample_name} may not contain variant chromosomes. Proceeding anyway...[/yellow]"
                    )

                try:
                    # Run Rust Engine
                    counts_list = gbcms_rs.count_bam(
                        str(bam_path),
                        rs_variants,
                        min_mapq=self.config.min_mapping_quality,
                        min_baseq=self.config.min_base_quality,
                        filter_duplicates=self.config.filter_duplicates,
                        filter_secondary=self.config.filter_secondary,
                        filter_supplementary=self.config.filter_supplementary,
                        filter_qc_failed=self.config.filter_qc_failed,
                        filter_improper_pair=self.config.filter_improper_pair,
                        filter_indel=self.config.filter_indel,
                        threads=self.config.threads,
                    )

                    # Write Output
                    self._write_output(sample_name, variants, counts_list)

                except Exception as e:
                    # Deliberate best-effort: report, remember the failure, and
                    # continue with the next sample.
                    failed_samples.append(sample_name)
                    self.console.print(
                        f"[bold red]Error processing sample {sample_name}: {e}[/bold red]"
                    )

                progress.advance(task)

        if failed_samples:
            # Do not claim success when one or more samples were skipped above.
            failed_names = ", ".join(str(name) for name in failed_samples)
            self.console.print(
                f"[bold yellow]Pipeline completed with errors: {len(failed_samples)} of "
                f"{len(samples)} sample(s) failed ({failed_names}).[/bold yellow]"
            )
        else:
            self.console.print("[bold green]Pipeline completed successfully.[/bold green]")

    def _load_variants(self) -> list[Variant]:
        """Load all variants from the configured variant file.

        The reader is chosen from the final file suffix: ``.vcf`` and ``.gz``
        use ``VcfReader``; ``.maf`` uses ``MafReader``.

        Raises:
            ValueError: if the suffix is none of the above.
        """
        path = self.config.variant_file
        reader: VariantReader
        suffix = path.suffix.lower()

        # NOTE(review): any ".gz" file is routed to the VCF reader, so a
        # compressed MAF would be parsed as VCF — confirm this is intended.
        if suffix in (".vcf", ".gz"):  # .vcf.gz handled by pysam
            reader = VcfReader(path)
        elif suffix == ".maf":
            reader = MafReader(path, fasta_path=self.config.reference_fasta)
        else:
            raise ValueError(f"Unsupported variant file format: {path.suffix}")

        try:
            return list(reader)
        finally:
            # Release the reader even if iteration fails part-way through.
            if hasattr(reader, "close"):
                reader.close()

    def _validate_variants(self, variants: list[Variant]) -> list[Variant]:
        """Return the variants accepted by ``ReferenceChecker.validate``.

        The first five rejected variants are printed individually; any further
        rejections are summarised in a single line.
        """
        checker = ReferenceChecker(self.config.reference_fasta)
        valid_variants = []
        invalid_count = 0

        try:
            for v in variants:
                if checker.validate(v):
                    valid_variants.append(v)
                    continue

                invalid_count += 1
                if invalid_count <= 5:  # Log first few failures
                    self.console.print(
                        f"[yellow]Invalid variant (REF mismatch): {v.chrom}:{v.pos} {v.ref}>{v.alt}[/yellow]"
                    )
        finally:
            # Close the reference handle even when validation raises.
            checker.close()

        if invalid_count > 5:
            self.console.print(
                f"[yellow]... and {invalid_count - 5} more invalid variants.[/yellow]"
            )

        return valid_variants

    def _validate_bam_header(self, bam_path: Path, variants: list[Variant]) -> bool:
        """Heuristically check that the BAM header covers the variant chromosomes.

        Only the first variant is inspected.  Chromosome names on both sides
        are passed through ``CoordinateKernel.normalize_chromosome`` before
        comparison (presumably to reconcile 'chr1' vs '1' naming — see that
        helper).

        Returns:
            False only when the first variant's normalized chromosome is
            absent from the normalized BAM reference names.  True otherwise,
            including when ``variants`` is empty or the header cannot be read.
        """
        try:
            with pysam.AlignmentFile(str(bam_path), "rb") as bam:
                norm_bam_chroms = {
                    CoordinateKernel.normalize_chromosome(c) for c in bam.references
                }

                if not variants:
                    return True

                # Check first variant as a heuristic
                first_chrom = CoordinateKernel.normalize_chromosome(variants[0].chrom)
                return first_chrom in norm_bam_chroms
        except Exception as e:
            self.console.print(f"[yellow]Could not validate BAM header: {e}[/yellow]")
            return True  # Assume ok if we can't check

    def _write_output(
        self, sample_name: str, variants: list[Variant], counts_list: list[gbcms_rs.BaseCounts]
    ):
        """Write one sample's counts to ``<output_dir>/<sample_name><suffix>.<ext>``.

        ``ext`` is ``vcf`` when the configured output format is
        ``OutputFormat.VCF`` and ``maf`` otherwise.

        Raises:
            ValueError: if ``variants`` and ``counts_list`` differ in length
                (enforced by ``zip(..., strict=True)``).
        """
        as_vcf = self.config.output_format == OutputFormat.VCF
        ext = "vcf" if as_vcf else "maf"
        suffix = self.config.output_suffix
        output_path = self.config.output_dir / f"{sample_name}{suffix}.{ext}"

        writer: VcfWriter | MafWriter
        if as_vcf:
            writer = VcfWriter(output_path, sample_name=sample_name)
        else:
            writer = MafWriter(output_path)

        try:
            for v, counts in zip(variants, counts_list, strict=True):
                writer.write(v, counts, sample_name=sample_name)
        finally:
            # Always close the writer, including on a length mismatch or a
            # write error.
            writer.close()
gbcms/py.typed ADDED
File without changes
@@ -0,0 +1,216 @@
1
+ Metadata-Version: 2.4
2
+ Name: py-gbcms
3
+ Version: 2.1.1
4
+ Summary: Python implementation of GetBaseCountsMultiSample (gbcms) for calculating base counts in BAM files
5
+ Project-URL: Homepage, https://github.com/msk-access/py-gbcms
6
+ Project-URL: Repository, https://github.com/msk-access/py-gbcms
7
+ Project-URL: Documentation, https://github.com/msk-access/py-gbcms#readme
8
+ Project-URL: Bug Tracker, https://github.com/msk-access/py-gbcms/issues
9
+ Author-email: MSK-ACCESS <shahr2@mskcc.org>
10
+ License: AGPL-3.0
11
+ License-File: LICENSE
12
+ Keywords: bam,base-counts,bioinformatics,gbcms,genomics,maf,vcf
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: pydantic>=2.0.0
21
+ Requires-Dist: pysam>=0.21.0
22
+ Requires-Dist: rich>=13.0.0
23
+ Requires-Dist: typer>=0.9.0
24
+ Provides-Extra: all
25
+ Provides-Extra: dev
26
+ Requires-Dist: black>=23.0.0; extra == 'dev'
27
+ Requires-Dist: mkdocs-material>=9.0.0; extra == 'dev'
28
+ Requires-Dist: mypy>=1.5.0; extra == 'dev'
29
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
30
+ Requires-Dist: pytest-mock>=3.11.0; extra == 'dev'
31
+ Requires-Dist: pytest>=7.4.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
33
+ Requires-Dist: types-pyyaml>=6.0.0; extra == 'dev'
34
+ Provides-Extra: fast
35
+ Description-Content-Type: text/markdown
36
+
37
+ # py-gbcms
38
+
39
+ **Complete orientation-aware counting system for genomic variants**
40
+
41
+ [![Tests](https://github.com/msk-access/py-gbcms/workflows/Tests/badge.svg)](https://github.com/msk-access/py-gbcms/actions)
42
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
43
+
44
+ ## Features
45
+
46
+ - 🚀 **High Performance**: Rust-powered core engine with multi-threading
47
+ - 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
48
+ - 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
49
+ - 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
50
+ - 📁 **Flexible I/O**: VCF and MAF input/output formats
51
+ - 🎯 **Quality Filters**: 7 configurable read filtering options
52
+
53
+ ## Installation
54
+
55
+ **Quick install:**
56
+ ```bash
57
+ pip install py-gbcms
58
+ ```
59
+
60
+ **From source (requires Rust):**
61
+ ```bash
62
+ git clone https://github.com/msk-access/py-gbcms.git
63
+ cd py-gbcms
64
+ pip install .
65
+ ```
66
+
67
+ **Docker:**
68
+ ```bash
69
+ docker pull ghcr.io/msk-access/py-gbcms:2.1.0
70
+ ```
71
+
72
+ 📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
73
+
74
+ ---
75
+
76
+ ## Usage
77
+
78
+ `py-gbcms` can be used in two ways:
79
+
80
+ ### 🔧 Option 1: Standalone CLI (1-10 samples)
81
+
82
+ **Best for:** Quick analysis, local processing, direct control
83
+
84
+ ```bash
85
+ gbcms run \
86
+ --variants variants.vcf \
87
+ --bam sample1.bam \
88
+ --fasta reference.fa \
89
+ --output-dir results/
90
+ ```
91
+
92
+ **Output:** `results/sample1.vcf`
93
+
94
+ **Learn more:**
95
+ - 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
96
+ - 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
97
+
98
+ ---
99
+
100
+ ### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
101
+
102
+ **Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
103
+
104
+ ```bash
105
+ nextflow run nextflow/main.nf \
106
+ --input samplesheet.csv \
107
+ --variants variants.vcf \
108
+ --fasta reference.fa \
109
+ -profile slurm
110
+ ```
111
+
112
+ **Features:**
113
+ - ✅ Automatic parallelization across samples
114
+ - ✅ SLURM/HPC integration
115
+ - ✅ Container support (Docker/Singularity)
116
+ - ✅ Resume failed runs
117
+
118
+ **Learn more:**
119
+ - 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
120
+ - 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
121
+
122
+ ---
123
+
124
+ ## Which Should I Use?
125
+
126
+ | Scenario | Recommendation |
127
+ |----------|----------------|
128
+ | 1-10 samples, local machine | **CLI** |
129
+ | 10+ samples, HPC cluster | **Nextflow** |
130
+ | Quick ad-hoc analysis | **CLI** |
131
+ | Production pipeline | **Nextflow** |
132
+ | Need auto-parallelization | **Nextflow** |
133
+ | Full manual control | **CLI** |
134
+
135
+ ---
136
+
137
+ ## Quick Examples
138
+
139
+ ### CLI: Single Sample
140
+ ```bash
141
+ gbcms run \
142
+ --variants variants.vcf \
143
+ --bam tumor.bam \
144
+ --fasta hg19.fa \
145
+ --output-dir results/ \
146
+ --threads 4
147
+ ```
148
+
149
+ ### CLI: Multiple Samples (Sequential)
150
+ ```bash
151
+ gbcms run \
152
+ --variants variants.vcf \
153
+ --bam-list samples.txt \
154
+ --fasta hg19.fa \
155
+ --output-dir results/
156
+ ```
157
+
158
+ ### Nextflow: Many Samples (Parallel)
159
+ ```bash
160
+ # samplesheet.csv:
161
+ # sample,bam,bai
162
+ # tumor1,/path/to/tumor1.bam,
163
+ # tumor2,/path/to/tumor2.bam,
164
+
165
+ nextflow run nextflow/main.nf \
166
+ --input samplesheet.csv \
167
+ --variants variants.vcf \
168
+ --fasta hg19.fa \
169
+ --outdir results \
170
+ -profile slurm
171
+ ```
172
+
173
+ ---
174
+
175
+ ## Documentation
176
+
177
+ 📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
178
+
179
+ **Quick Links:**
180
+ - [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
181
+ - [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
182
+ - [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
183
+ - [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
184
+ - [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
185
+ - [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
186
+
187
+ ---
188
+
189
+ ## Contributing
190
+
191
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
192
+
193
+ To contribute to documentation, see the [`gh-pages` branch](https://github.com/msk-access/py-gbcms/tree/gh-pages).
194
+
195
+ ---
196
+
197
+ ## Citation
198
+
199
+ If you use `py-gbcms` in your research, please cite:
200
+
201
+ ```
202
+ [Citation to be added]
203
+ ```
204
+
205
+ ---
206
+
207
+ ## License
208
+
209
+ AGPL-3.0 - see [LICENSE](LICENSE) for details.
210
+
211
+ ---
212
+
213
+ ## Support
214
+
215
+ - 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
216
+ - 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions
@@ -0,0 +1,13 @@
1
+ gbcms/__init__.py,sha256=zPJIgPGcoSNiD0qme18OnYJYE3A9VVytlhO-V5DaAW0,22
2
+ gbcms/cli.py,sha256=P7ZhQBbsXJ88E2yqhJt_cu3xavTs1m2Mr2HKqZNp3Yc,5709
3
+ gbcms/pipeline.py,sha256=ebPReb_MfdsiSXqxNGd8Q-dJUzKY2SeaxLSctZUHW54,7832
4
+ gbcms/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ gbcms/core/kernel.py,sha256=Sl53XS4uwUDHJyerbYWC6FfKhm14q494IlwON2c62vk,4665
6
+ gbcms/io/input.py,sha256=VBBqhTlVF_S2vxEUXnqJ1sSGG3u_XthQB1VpoFPZxIU,8982
7
+ gbcms/io/output.py,sha256=3nwIXD_d6f4pbYNVr3hxeNKFkeZLGkv_qQC8g7bxjMk,12603
8
+ gbcms/models/core.py,sha256=klUbEFJVMmug94R8nnB9IMpLP4Q0CqGXc1BFS18LWzM,3788
9
+ py_gbcms-2.1.1.dist-info/METADATA,sha256=LzEXVLw8njg-IOt5mxv3pT__aVWATnoDUmYyO7eKf6w,5826
10
+ py_gbcms-2.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ py_gbcms-2.1.1.dist-info/entry_points.txt,sha256=AAg3yd8-c7jlb-FDGiFJXSNFVAhqO44zMLJQVFv8oWQ,40
12
+ py_gbcms-2.1.1.dist-info/licenses/LICENSE,sha256=5vLuih3k9yufKSXoR5qVWOhALHC8WXbSXjrOo9ZK3cs,34797
13
+ py_gbcms-2.1.1.dist-info/RECORD,,
gbcms/config.py DELETED
@@ -1,98 +0,0 @@
1
- """Configuration classes and enums for GetBaseCounts."""
2
-
3
- import os
4
- from dataclasses import dataclass
5
- from enum import IntEnum
6
-
7
-
8
class CountType(IntEnum):
    """Integer codes identifying each kind of count.

    Members come in three groups of (total, reference, alternate):
    all reads, positive-strand reads, and fragments.
    """

    # All reads
    DP = 0  # total depth
    RD = 1  # reference depth
    AD = 2  # alternate depth

    # Positive strand
    DPP = 3  # total depth
    RDP = 4  # reference depth
    ADP = 5  # alternate depth

    # Fragments
    DPF = 6  # total depth
    RDF = 7  # reference depth
    ADF = 8  # alternate depth
20
-
21
-
22
@dataclass
class Config:
    """Configuration for base counting.

    The first four fields are required; everything else has a default.
    All input paths and option combinations are checked in
    ``__post_init__``, so constructing a ``Config`` fails immediately when an
    input file is missing or options conflict.
    """

    fasta_file: str
    bam_files: dict[str, str]  # sample_name -> bam_path
    variant_files: list[str]
    output_file: str

    # Optional parameters
    mapping_quality_threshold: int = 20
    base_quality_threshold: int = 0
    filter_duplicate: bool = True
    filter_improper_pair: bool = False
    filter_qc_failed: bool = False
    filter_indel: bool = False
    filter_non_primary: bool = False
    output_positive_count: bool = True
    output_negative_count: bool = False
    output_fragment_count: bool = False
    fragment_fractional_weight: bool = False
    max_block_size: int = 10000
    max_block_dist: int = 100000
    num_threads: int = 1
    backend: str = "joblib"  # Parallelization backend
    input_is_maf: bool = False
    input_is_vcf: bool = False
    output_maf: bool = False
    generic_counting: bool = False
    max_warning_per_type: int = 3

    @staticmethod
    def _has_bam_index(bam_path: str) -> bool:
        """Return True when an index exists as ``<stem>.bai`` or ``<path>.bai``.

        Only the trailing ``.bam`` suffix is swapped for ``.bai``; a ``.bam``
        substring elsewhere in the path (for example in a directory name) is
        left untouched.  Paths that do not end in ``.bam`` are accepted
        without an index lookup, which keeps such inputs valid as before.
        """
        if not bam_path.endswith(".bam"):
            return True
        sibling_index = f"{bam_path[: -len('.bam')]}.bai"
        return os.path.exists(sibling_index) or os.path.exists(f"{bam_path}.bai")

    def __post_init__(self) -> None:
        """Validate configuration.

        Raises:
            FileNotFoundError: if the FASTA, its ``.fai`` index, any BAM file,
                any BAM index, or any variant file is missing.
            ValueError: if the input/output format flags conflict or a numeric
                limit is below 1.
        """
        if not os.path.exists(self.fasta_file):
            raise FileNotFoundError(f"Reference FASTA file not found: {self.fasta_file}")

        fai_file = f"{self.fasta_file}.fai"
        if not os.path.exists(fai_file):
            raise FileNotFoundError(
                f"Reference FASTA index not found: {fai_file}. "
                f"Please index with: samtools faidx {self.fasta_file}"
            )

        for sample, bam_path in self.bam_files.items():
            if not os.path.exists(bam_path):
                raise FileNotFoundError(f"BAM file not found for sample {sample}: {bam_path}")

            # Check for BAM index
            if not self._has_bam_index(bam_path):
                raise FileNotFoundError(
                    f"BAM index not found for {bam_path}. "
                    f"Please index with: samtools index {bam_path}"
                )

        for variant_file in self.variant_files:
            if not os.path.exists(variant_file):
                raise FileNotFoundError(f"Variant file not found: {variant_file}")

        if self.input_is_maf and self.input_is_vcf:
            raise ValueError("--maf and --vcf are mutually exclusive")

        if not self.input_is_maf and not self.input_is_vcf:
            raise ValueError("Either --maf or --vcf must be specified")

        if self.input_is_vcf and self.output_maf:
            raise ValueError("--omaf can only be used with --maf input")

        if self.num_threads < 1:
            raise ValueError("Number of threads must be at least 1")

        if self.max_block_size < 1:
            raise ValueError("max_block_size must be at least 1")

        if self.max_block_dist < 1:
            raise ValueError("max_block_dist must be at least 1")