py-gbcms 2.2.0__cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gbcms/__init__.py +23 -0
- gbcms/_rs.cpython-312-x86_64-linux-gnu.so +0 -0
- gbcms/_rs.pyi +49 -0
- gbcms/cli.py +204 -0
- gbcms/core/__init__.py +9 -0
- gbcms/core/kernel.py +128 -0
- gbcms/io/__init__.py +18 -0
- gbcms/io/input.py +227 -0
- gbcms/io/output.py +354 -0
- gbcms/models/__init__.py +27 -0
- gbcms/models/core.py +172 -0
- gbcms/pipeline.py +257 -0
- gbcms/py.typed +0 -0
- gbcms/utils/__init__.py +14 -0
- gbcms/utils/logging.py +123 -0
- gbcms.libs/libbz2-a1e77c99.so.1 +0 -0
- gbcms.libs/libcrypto-bfee2032.so.1.1 +0 -0
- gbcms.libs/libssl-658e53cd.so.1.1 +0 -0
- py_gbcms-2.2.0.dist-info/METADATA +217 -0
- py_gbcms-2.2.0.dist-info/RECORD +23 -0
- py_gbcms-2.2.0.dist-info/WHEEL +4 -0
- py_gbcms-2.2.0.dist-info/entry_points.txt +2 -0
- py_gbcms-2.2.0.dist-info/licenses/LICENSE +664 -0
gbcms/pipeline.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline Orchestrator: Manages the execution flow of gbcms.
|
|
3
|
+
|
|
4
|
+
This module handles:
|
|
5
|
+
1. Reading variants from input (VCF/MAF).
|
|
6
|
+
2. Iterating over samples (BAM files).
|
|
7
|
+
3. Running the Rust-based counting engine for each sample.
|
|
8
|
+
4. Writing results to per-sample output files.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pysam
|
|
16
|
+
from rich.console import Console
|
|
17
|
+
from rich.progress import (
|
|
18
|
+
BarColumn,
|
|
19
|
+
Progress,
|
|
20
|
+
SpinnerColumn,
|
|
21
|
+
TaskProgressColumn,
|
|
22
|
+
TextColumn,
|
|
23
|
+
TimeRemainingColumn,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from gbcms import _rs as gbcms_rs
|
|
27
|
+
|
|
28
|
+
from .core.kernel import CoordinateKernel
|
|
29
|
+
from .io.input import MafReader, ReferenceChecker, VariantReader, VcfReader
|
|
30
|
+
from .io.output import MafWriter, VcfWriter
|
|
31
|
+
from .models.core import GbcmsConfig, OutputFormat, Variant
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
__all__ = ["Pipeline"]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Pipeline:
    """Main pipeline for processing BAM files and counting bases at variant positions.

    A run loads the variants, validates them with ``ReferenceChecker``,
    converts them to Rust variant objects and then processes every entry of
    ``config.bam_files`` in turn, writing one output file per sample.
    """

    # Number of invalid variants reported individually before the remainder
    # is summarized in a single warning.
    _MAX_INVALID_WARNINGS = 5

    def __init__(self, config: GbcmsConfig):
        """
        Initialize the pipeline.

        Args:
            config: Configuration object with input/output paths and filter settings.
        """
        self.config = config
        self.console = Console()
        # Running totals; run() returns this same dict object.
        self._stats = {"samples_processed": 0, "total_variants": 0, "total_time": 0.0}

    def run(self) -> dict:
        """
        Execute the pipeline.

        Returns:
            Dictionary with processing statistics (``samples_processed``,
            ``total_variants``, ``total_time``). When no variants are loaded,
            or none survive validation, the method returns early and
            ``total_time`` keeps its initial value of 0.0.
        """
        start_time = time.perf_counter()
        logger.info("Starting gbcms pipeline")
        logger.info("Output directory: %s", self.config.output.directory)

        # 1. Load Variants
        logger.debug("Loading variants from %s", self.config.variant_file)
        variants = self._load_variants()
        logger.info("Loaded %d variants", len(variants))

        if not variants:
            logger.error("No variants found. Exiting.")
            return self._stats

        # 2. Validate Variants against Reference
        logger.debug("Validating variants against reference genome")
        valid_variants = self._validate_variants(variants)
        logger.info("Valid variants: %d / %d", len(valid_variants), len(variants))

        if not valid_variants:
            logger.error("No valid variants remaining after validation. Exiting.")
            return self._stats

        variants = valid_variants
        self._stats["total_variants"] = len(variants)

        # 3. Prepare Rust Variants
        # Built once and reused for every sample; the list order matches
        # `variants`, which _write_output relies on when pairing counts.
        rs_variants = [
            gbcms_rs.Variant(v.chrom, v.pos, v.ref, v.alt, v.variant_type.value) for v in variants
        ]

        # 4. Process Each Sample
        self.config.output.directory.mkdir(parents=True, exist_ok=True)
        samples = list(self.config.bam_files.items())

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            console=self.console,
        ) as progress:
            task = progress.add_task("[cyan]Processing samples...", total=len(samples))

            for sample_name, bam_path in samples:
                progress.update(task, description=f"[cyan]Processing {sample_name}...")
                self._process_sample(sample_name, bam_path, variants, rs_variants)
                # Advance even when the sample failed: _process_sample logs
                # and swallows per-sample errors.
                progress.advance(task)

        # Calculate total time
        self._stats["total_time"] = time.perf_counter() - start_time
        logger.info(
            "Pipeline completed: %d samples, %.2fs",
            self._stats["samples_processed"],
            self._stats["total_time"],
        )

        return self._stats

    def _process_sample(
        self,
        sample_name: str,
        bam_path: Path,
        variants: list[Variant],
        rs_variants: list,
    ) -> None:
        """
        Process a single sample.

        Any exception raised while counting or writing is logged and
        swallowed so the remaining samples are still processed;
        ``samples_processed`` is only incremented on success.

        Args:
            sample_name: Name of the sample.
            bam_path: Path to BAM file.
            variants: List of normalized variants.
            rs_variants: List of Rust variant objects.
        """
        sample_start = time.perf_counter()
        logger.debug("Processing sample: %s (%s)", sample_name, bam_path)

        # Validate BAM Header (advisory only: a failed check just warns)
        if not self._validate_bam_header(bam_path, variants):
            logger.warning(
                "BAM %s may not contain variant chromosomes. Proceeding anyway.",
                sample_name,
            )

        try:
            # Run Rust Engine with nested config accessors
            rust_start = time.perf_counter()
            counts_list = gbcms_rs.count_bam(
                str(bam_path),
                rs_variants,
                min_mapq=self.config.quality.min_mapping_quality,
                min_baseq=self.config.quality.min_base_quality,
                filter_duplicates=self.config.filters.duplicates,
                filter_secondary=self.config.filters.secondary,
                filter_supplementary=self.config.filters.supplementary,
                filter_qc_failed=self.config.filters.qc_failed,
                filter_improper_pair=self.config.filters.improper_pair,
                filter_indel=self.config.filters.indel,
                threads=self.config.threads,
            )
            rust_time = time.perf_counter() - rust_start
            logger.debug("Rust count_bam completed in %.3fs", rust_time)

            # Write Output
            self._write_output(sample_name, variants, counts_list)
            self._stats["samples_processed"] += 1

            sample_time = time.perf_counter() - sample_start
            logger.debug("Sample %s completed in %.3fs", sample_name, sample_time)

        except Exception as e:
            # Deliberate best-effort: one bad sample must not abort the run.
            logger.error("Error processing sample %s: %s", sample_name, e)
            # Keep the traceback available in verbose (DEBUG) runs so the
            # failure can be diagnosed without changing normal output.
            logger.debug("Traceback for sample %s", sample_name, exc_info=True)

    def _load_variants(self) -> list[Variant]:
        """
        Load variants based on file extension.

        ``.vcf`` and ``.gz`` files are read with ``VcfReader``, ``.maf`` files
        with ``MafReader``; the suffix comparison is case-insensitive.

        Returns:
            All variants yielded by the reader.

        Raises:
            ValueError: If the file suffix is not one of the supported ones.
        """
        path = self.config.variant_file
        reader: VariantReader

        suffix = path.suffix.lower()
        # NOTE(review): every ".gz" file is routed to VcfReader, so a
        # compressed MAF would be parsed as VCF -- confirm this is intended.
        if suffix in (".vcf", ".gz"):
            reader = VcfReader(path)
        elif suffix == ".maf":
            reader = MafReader(path, fasta_path=self.config.reference_fasta)
        else:
            raise ValueError(f"Unsupported variant file format: {suffix}")

        try:
            return list(reader)
        finally:
            # Release the reader even when iteration fails part-way.
            if hasattr(reader, "close"):
                reader.close()

    def _validate_variants(self, variants: list[Variant]) -> list[Variant]:
        """
        Validate variants against reference genome.

        Args:
            variants: Variants to check with ``ReferenceChecker.validate``.

        Returns:
            The variants that passed validation, in their original order.
        """
        checker = ReferenceChecker(self.config.reference_fasta)
        valid_variants = []
        invalid_count = 0

        try:
            for v in variants:
                if checker.validate(v):
                    valid_variants.append(v)
                    continue

                invalid_count += 1
                # Only the first few failures are logged individually to
                # avoid flooding the log on a badly mismatched input.
                if invalid_count <= self._MAX_INVALID_WARNINGS:
                    logger.warning(
                        "Invalid variant (REF mismatch): %s:%d %s>%s",
                        v.chrom,
                        v.pos,
                        v.ref,
                        v.alt,
                    )

            if invalid_count > self._MAX_INVALID_WARNINGS:
                logger.warning(
                    "... and %d more invalid variants.",
                    invalid_count - self._MAX_INVALID_WARNINGS,
                )
        finally:
            # Close the checker even if validate() raises.
            checker.close()

        return valid_variants

    def _validate_bam_header(self, bam_path: Path, variants: list[Variant]) -> bool:
        """
        Check if the BAM header contains the chromosome of the first variant.

        Chromosome names on both sides are passed through
        ``CoordinateKernel.normalize_chromosome`` before comparison. Only
        ``variants[0]`` is inspected, so this is a quick sanity check rather
        than a full validation.

        Args:
            bam_path: Path to the BAM file whose header is read.
            variants: Variants being counted; may be empty.

        Returns:
            False only when the first variant's chromosome is absent from the
            header. True otherwise, including when the header cannot be read
            (that failure is logged as a warning).
        """
        try:
            with pysam.AlignmentFile(str(bam_path), "rb") as bam:
                bam_chroms = set(bam.references)

                norm_bam_chroms = {CoordinateKernel.normalize_chromosome(c) for c in bam_chroms}

                if variants:
                    v = variants[0]
                    norm_v_chrom = CoordinateKernel.normalize_chromosome(v.chrom)
                    if norm_v_chrom not in norm_bam_chroms:
                        return False
            return True
        except Exception as e:
            # Best-effort check: never block processing on a header problem.
            logger.warning("Could not validate BAM header: %s", e)
            return True

    def _write_output(
        self,
        sample_name: str,
        variants: list[Variant],
        counts_list: list[gbcms_rs.BaseCounts],
    ) -> None:
        """
        Write results to output file.

        The file is named ``{sample_name}{suffix}.{vcf|maf}`` inside the
        configured output directory.

        Args:
            sample_name: Name of the sample; used in the file name and passed
                to the writer.
            variants: Variants, in the same order as ``counts_list``.
            counts_list: Per-variant counts returned by the Rust engine.

        Raises:
            ValueError: If ``variants`` and ``counts_list`` differ in length
                (raised by ``zip(..., strict=True)``).
        """
        is_vcf = self.config.output.format == OutputFormat.VCF
        ext = "vcf" if is_vcf else "maf"
        suffix = self.config.output.suffix
        output_path = self.config.output.directory / f"{sample_name}{suffix}.{ext}"

        writer: VcfWriter | MafWriter
        if is_vcf:
            writer = VcfWriter(output_path, sample_name=sample_name)
        else:
            writer = MafWriter(output_path)

        try:
            for v, counts in zip(variants, counts_list, strict=True):
                writer.write(v, counts, sample_name=sample_name)
        finally:
            # Always release the output handle, even if a write (or the
            # strict length check) fails.
            writer.close()

        logger.debug("Results written to %s", output_path)
|
gbcms/py.typed
ADDED
|
File without changes
|
gbcms/utils/__init__.py
ADDED
gbcms/utils/logging.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Logging utilities for gbcms.
|
|
3
|
+
|
|
4
|
+
Provides centralized logging configuration with dual output:
|
|
5
|
+
- Structured logging via Python logging module
|
|
6
|
+
- Rich console output for interactive use
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from contextlib import contextmanager
|
|
13
|
+
from functools import wraps
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from rich.console import Console
|
|
17
|
+
from rich.logging import RichHandler
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"setup_logging",
|
|
21
|
+
"get_logger",
|
|
22
|
+
"timed",
|
|
23
|
+
"log_call",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
# Module-level console for rich output
|
|
27
|
+
_console = Console()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def setup_logging(verbose: bool = False, log_file: str | None = None) -> None:
    """
    Install the gbcms logging configuration on the root logger.

    A Rich console handler is always attached; a plain-text file handler is
    added as well when ``log_file`` is given. Any configuration that was
    already present on the root logger is replaced.

    Args:
        verbose: Log at DEBUG level (and show source paths in the console
            output) when True; log at INFO level otherwise.
        log_file: Optional path of a file that should also receive the logs.
    """
    console_handler = RichHandler(
        console=_console,
        rich_tracebacks=True,
        markup=True,
        show_path=verbose,
    )
    handlers: list[logging.Handler] = [console_handler]

    if log_file:
        # The file gets its own timestamped format; the bare "%(message)s"
        # format below is meant for the Rich console handler.
        file_handler = logging.FileHandler(log_file)
        plain_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        file_handler.setFormatter(plain_format)
        handlers.append(file_handler)

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(message)s",
        datefmt="[%X]",
        handlers=handlers,
        force=True,  # Override existing config
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_logger(name: str) -> logging.Logger:
    """Return the standard-library logger registered under *name*."""
    named_logger = logging.getLogger(name)
    return named_logger
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@contextmanager
|
|
71
|
+
def timed(operation: str, logger: logging.Logger | None = None):
|
|
72
|
+
"""
|
|
73
|
+
Context manager for timing operations.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
operation: Description of the operation being timed.
|
|
77
|
+
logger: Logger to use. If None, uses root logger.
|
|
78
|
+
|
|
79
|
+
Example:
|
|
80
|
+
with timed("Loading variants", logger):
|
|
81
|
+
variants = load_variants()
|
|
82
|
+
"""
|
|
83
|
+
log = logger or logging.getLogger(__name__)
|
|
84
|
+
start = time.perf_counter()
|
|
85
|
+
log.debug("Starting: %s", operation)
|
|
86
|
+
try:
|
|
87
|
+
yield
|
|
88
|
+
finally:
|
|
89
|
+
elapsed = time.perf_counter() - start
|
|
90
|
+
log.debug("Completed: %s (%.3fs)", operation, elapsed)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def log_call(logger: logging.Logger | None = None) -> Callable:
|
|
94
|
+
"""
|
|
95
|
+
Decorator to log function calls with timing.
|
|
96
|
+
|
|
97
|
+
Args:
|
|
98
|
+
logger: Logger to use. If None, uses function's module logger.
|
|
99
|
+
|
|
100
|
+
Example:
|
|
101
|
+
@log_call()
|
|
102
|
+
def process_sample(sample_name: str) -> dict:
|
|
103
|
+
...
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
def decorator(func: Callable) -> Callable:
|
|
107
|
+
@wraps(func)
|
|
108
|
+
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
109
|
+
log = logger or logging.getLogger(func.__module__)
|
|
110
|
+
log.debug("Calling %s", func.__name__)
|
|
111
|
+
start = time.perf_counter()
|
|
112
|
+
try:
|
|
113
|
+
result = func(*args, **kwargs)
|
|
114
|
+
elapsed = time.perf_counter() - start
|
|
115
|
+
log.debug("%s completed (%.3fs)", func.__name__, elapsed)
|
|
116
|
+
return result
|
|
117
|
+
except Exception as e:
|
|
118
|
+
log.error("%s failed: %s", func.__name__, e)
|
|
119
|
+
raise
|
|
120
|
+
|
|
121
|
+
return wrapper
|
|
122
|
+
|
|
123
|
+
return decorator
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: py-gbcms
|
|
3
|
+
Version: 2.2.0
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Science/Research
|
|
6
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
10
|
+
Requires-Dist: pysam>=0.21.0
|
|
11
|
+
Requires-Dist: typer>=0.9.0
|
|
12
|
+
Requires-Dist: rich>=13.0.0
|
|
13
|
+
Requires-Dist: pydantic>=2.0.0
|
|
14
|
+
Requires-Dist: pytest>=7.4.0 ; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest-cov>=4.1.0 ; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest-mock>=3.11.0 ; extra == 'dev'
|
|
17
|
+
Requires-Dist: black>=23.0.0 ; extra == 'dev'
|
|
18
|
+
Requires-Dist: ruff>=0.1.0 ; extra == 'dev'
|
|
19
|
+
Requires-Dist: mypy>=1.5.0 ; extra == 'dev'
|
|
20
|
+
Requires-Dist: types-pyyaml>=6.0.0 ; extra == 'dev'
|
|
21
|
+
Requires-Dist: mkdocs-material>=9.0.0 ; extra == 'dev'
|
|
22
|
+
Provides-Extra: all
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Provides-Extra: fast
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Summary: Python implementation of GetBaseCountsMultiSample (gbcms) for calculating base counts in BAM files
|
|
27
|
+
Keywords: bioinformatics,genomics,bam,vcf,maf,base-counts,gbcms
|
|
28
|
+
Author-email: MSK-ACCESS <shahr2@mskcc.org>
|
|
29
|
+
License: AGPL-3.0
|
|
30
|
+
Requires-Python: >=3.10
|
|
31
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
32
|
+
Project-URL: Bug Tracker, https://github.com/msk-access/py-gbcms/issues
|
|
33
|
+
Project-URL: Documentation, https://github.com/msk-access/py-gbcms#readme
|
|
34
|
+
Project-URL: Homepage, https://github.com/msk-access/py-gbcms
|
|
35
|
+
Project-URL: Repository, https://github.com/msk-access/py-gbcms
|
|
36
|
+
|
|
37
|
+
# py-gbcms
|
|
38
|
+
|
|
39
|
+
**Complete orientation-aware counting system for genomic variants**
|
|
40
|
+
|
|
41
|
+
[Build status](https://github.com/msk-access/py-gbcms/actions)
|
|
42
|
+
[Python downloads](https://www.python.org/downloads/)
|
|
43
|
+
|
|
44
|
+
## Features
|
|
45
|
+
|
|
46
|
+
- 🚀 **High Performance**: Rust-powered core engine with multi-threading
|
|
47
|
+
- 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
|
|
48
|
+
- 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
|
|
49
|
+
- 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
|
|
50
|
+
- 📁 **Flexible I/O**: VCF and MAF input/output formats
|
|
51
|
+
- 🎯 **Quality Filters**: 7 configurable read filtering options
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
**Quick install:**
|
|
56
|
+
```bash
|
|
57
|
+
pip install py-gbcms
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**From source (requires Rust):**
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/msk-access/py-gbcms.git
|
|
63
|
+
cd py-gbcms
|
|
64
|
+
pip install .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Docker:**
|
|
68
|
+
```bash
|
|
69
|
+
docker pull ghcr.io/msk-access/py-gbcms:2.1.0
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
`py-gbcms` can be used in two ways:
|
|
79
|
+
|
|
80
|
+
### 🔧 Option 1: Standalone CLI (1-10 samples)
|
|
81
|
+
|
|
82
|
+
**Best for:** Quick analysis, local processing, direct control
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
gbcms run \
|
|
86
|
+
--variants variants.vcf \
|
|
87
|
+
--bam sample1.bam \
|
|
88
|
+
--fasta reference.fa \
|
|
89
|
+
--output-dir results/
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Output:** `results/sample1.vcf`
|
|
93
|
+
|
|
94
|
+
**Learn more:**
|
|
95
|
+
- 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
|
|
96
|
+
- 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
|
|
101
|
+
|
|
102
|
+
**Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
nextflow run nextflow/main.nf \
|
|
106
|
+
--input samplesheet.csv \
|
|
107
|
+
--variants variants.vcf \
|
|
108
|
+
--fasta reference.fa \
|
|
109
|
+
-profile slurm
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**Features:**
|
|
113
|
+
- ✅ Automatic parallelization across samples
|
|
114
|
+
- ✅ SLURM/HPC integration
|
|
115
|
+
- ✅ Container support (Docker/Singularity)
|
|
116
|
+
- ✅ Resume failed runs
|
|
117
|
+
|
|
118
|
+
**Learn more:**
|
|
119
|
+
- 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
|
|
120
|
+
- 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Which Should I Use?
|
|
125
|
+
|
|
126
|
+
| Scenario | Recommendation |
|
|
127
|
+
|----------|----------------|
|
|
128
|
+
| 1-10 samples, local machine | **CLI** |
|
|
129
|
+
| 10+ samples, HPC cluster | **Nextflow** |
|
|
130
|
+
| Quick ad-hoc analysis | **CLI** |
|
|
131
|
+
| Production pipeline | **Nextflow** |
|
|
132
|
+
| Need auto-parallelization | **Nextflow** |
|
|
133
|
+
| Full manual control | **CLI** |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Quick Examples
|
|
138
|
+
|
|
139
|
+
### CLI: Single Sample
|
|
140
|
+
```bash
|
|
141
|
+
gbcms run \
|
|
142
|
+
--variants variants.vcf \
|
|
143
|
+
--bam tumor.bam \
|
|
144
|
+
--fasta hg19.fa \
|
|
145
|
+
--output-dir results/ \
|
|
146
|
+
--threads 4
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### CLI: Multiple Samples (Sequential)
|
|
150
|
+
```bash
|
|
151
|
+
gbcms run \
|
|
152
|
+
--variants variants.vcf \
|
|
153
|
+
--bam-list samples.txt \
|
|
154
|
+
--fasta hg19.fa \
|
|
155
|
+
--output-dir results/
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Nextflow: Many Samples (Parallel)
|
|
159
|
+
```bash
|
|
160
|
+
# samplesheet.csv:
|
|
161
|
+
# sample,bam,bai
|
|
162
|
+
# tumor1,/path/to/tumor1.bam,
|
|
163
|
+
# tumor2,/path/to/tumor2.bam,
|
|
164
|
+
|
|
165
|
+
nextflow run nextflow/main.nf \
|
|
166
|
+
--input samplesheet.csv \
|
|
167
|
+
--variants variants.vcf \
|
|
168
|
+
--fasta hg19.fa \
|
|
169
|
+
--outdir results \
|
|
170
|
+
-profile slurm
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Documentation
|
|
176
|
+
|
|
177
|
+
📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
|
|
178
|
+
|
|
179
|
+
**Quick Links:**
|
|
180
|
+
- [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
|
|
181
|
+
- [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
|
|
182
|
+
- [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
|
|
183
|
+
- [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
|
|
184
|
+
- [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
|
|
185
|
+
- [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Contributing
|
|
190
|
+
|
|
191
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
|
|
192
|
+
|
|
193
|
+
To contribute to documentation, see the [`gh-pages` branch](https://github.com/msk-access/py-gbcms/tree/gh-pages).
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Citation
|
|
198
|
+
|
|
199
|
+
If you use `py-gbcms` in your research, please cite:
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
[Citation to be added]
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
AGPL-3.0 - see [LICENSE](LICENSE) for details.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Support
|
|
214
|
+
|
|
215
|
+
- 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
|
|
216
|
+
- 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions
|
|
217
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
gbcms/__init__.py,sha256=eZ9tInanaAhEP5eXrtEBBrG09ADJA93dZHsTqTtT8-c,582
|
|
2
|
+
gbcms/_rs.cpython-312-x86_64-linux-gnu.so,sha256=kMZxk9EXJ11C0Rso1r4DC7qxmoLmKeJqdV3vYlHiOWE,2691713
|
|
3
|
+
gbcms/_rs.pyi,sha256=pQwrF9cgCSYeIQXsdE40z9Rxwv4-I-7JynE96kOoNEE,1029
|
|
4
|
+
gbcms/cli.py,sha256=BHlC0_79wuhKtdpLGWYLPVDApwwHEAWg7euA6nHnb7I,6246
|
|
5
|
+
gbcms/core/__init__.py,sha256=YY_0MvtbIOb_0I8cZg9DTNVN3xVn7qFohm4qzBO4vKU,182
|
|
6
|
+
gbcms/core/kernel.py,sha256=lZYW2gurzG-Egq04CVoFFCn3-GnYN0VC1dyP8QMP5QQ,4697
|
|
7
|
+
gbcms/io/__init__.py,sha256=0P3ag3XKQp7ldfdEM9iyLa814bQSXjWRN-uIXFJsiWM,374
|
|
8
|
+
gbcms/io/input.py,sha256=FjKBl5ENcAJapkiXLHLBu0eXrIeChPTFZ_2AtwOPGeU,9115
|
|
9
|
+
gbcms/io/output.py,sha256=MgWmqkETkqJ6aega3bDBNJvgYlB5W4fkliuWBLHd320,12224
|
|
10
|
+
gbcms/models/__init__.py,sha256=w30KiqKJgW1mKhVf8Erhv5sNhsKdmBKjy5V8sSdCaLo,454
|
|
11
|
+
gbcms/models/core.py,sha256=oP1_S4QldiqSIcVuELIVSCT0wHpLRGxDthfcFJxRHjo,5334
|
|
12
|
+
gbcms/pipeline.py,sha256=kbFceb4Mro0C2ptgpo8R0h6mskHSpG8zRYVs49DaEDs,8869
|
|
13
|
+
gbcms/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
gbcms/utils/__init__.py,sha256=BVT4l3YAuDUYzcYhNBRLCh-iogqJTVfAIhuFSdujRoQ,238
|
|
15
|
+
gbcms/utils/logging.py,sha256=oVrosypHiJumUBWZM-Dj8JBIZfP418xhtlKvLxJNAlc,3263
|
|
16
|
+
gbcms.libs/libbz2-a1e77c99.so.1,sha256=DaMHVzhlKReUBcnfEqhZnzle2gdIDp7wPz4BLI-zf6g,75049
|
|
17
|
+
gbcms.libs/libcrypto-bfee2032.so.1.1,sha256=vENxnY50N4f5NMoJR5ElTBXei4_x6ZjNLP7oMzBioo8,3215921
|
|
18
|
+
gbcms.libs/libssl-658e53cd.so.1.1,sha256=59sUdvtjU-6lBFU6e8nNeH96zgPwWQ9GIIuaOUUOASs,666857
|
|
19
|
+
py_gbcms-2.2.0.dist-info/METADATA,sha256=8W3KKx9EKQZFiuG-Jr55IgXzhxOnrY9VZMuKIRG2fvg,5863
|
|
20
|
+
py_gbcms-2.2.0.dist-info/WHEEL,sha256=-bb09_PJZyKYDRWofYMyCwQSwDvJ5qTYPgEDkxiuUjI,109
|
|
21
|
+
py_gbcms-2.2.0.dist-info/entry_points.txt,sha256=Yqzc4l1V0IO3dDpKe9O3sYlPyR1b3zAQEPtoaSl_Ftg,38
|
|
22
|
+
py_gbcms-2.2.0.dist-info/licenses/LICENSE,sha256=5vLuih3k9yufKSXoR5qVWOhALHC8WXbSXjrOo9ZK3cs,34797
|
|
23
|
+
py_gbcms-2.2.0.dist-info/RECORD,,
|