py-gbcms 2.0.0__tar.gz → 2.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. {py_gbcms-2.0.0 → py_gbcms-2.1.2}/.gitignore +19 -0
  2. py_gbcms-2.1.2/CHANGELOG.md +192 -0
  3. {py_gbcms-2.0.0 → py_gbcms-2.1.2}/CONTRIBUTING.md +1 -8
  4. py_gbcms-2.1.2/PKG-INFO +216 -0
  5. py_gbcms-2.1.2/README.md +180 -0
  6. {py_gbcms-2.0.0 → py_gbcms-2.1.2}/pyproject.toml +29 -25
  7. py_gbcms-2.1.2/src/gbcms/__init__.py +1 -0
  8. py_gbcms-2.1.2/src/gbcms/cli.py +163 -0
  9. py_gbcms-2.1.2/src/gbcms/core/kernel.py +126 -0
  10. py_gbcms-2.1.2/src/gbcms/io/input.py +222 -0
  11. py_gbcms-2.1.2/src/gbcms/io/output.py +361 -0
  12. py_gbcms-2.1.2/src/gbcms/models/core.py +133 -0
  13. py_gbcms-2.1.2/src/gbcms/pipeline.py +212 -0
  14. py_gbcms-2.1.2/src/gbcms/py.typed +0 -0
  15. py_gbcms-2.1.2/src/gbcms_rs/.gitignore +72 -0
  16. py_gbcms-2.1.2/src/gbcms_rs/Cargo.lock +1395 -0
  17. py_gbcms-2.1.2/src/gbcms_rs/Cargo.toml +20 -0
  18. py_gbcms-2.1.2/src/gbcms_rs/pyproject.toml +13 -0
  19. py_gbcms-2.1.2/src/gbcms_rs/src/counting.rs +663 -0
  20. py_gbcms-2.1.2/src/gbcms_rs/src/lib.rs +16 -0
  21. py_gbcms-2.1.2/src/gbcms_rs/src/stats.rs +79 -0
  22. py_gbcms-2.1.2/src/gbcms_rs/src/types.rs +90 -0
  23. py_gbcms-2.1.2/src/gbcms_rs.pyi +50 -0
  24. {py_gbcms-2.0.0 → py_gbcms-2.1.2}/uv.lock +481 -337
  25. py_gbcms-2.0.0/.gitbook.yaml +0 -8
  26. py_gbcms-2.0.0/.github/workflows/release.yml +0 -130
  27. py_gbcms-2.0.0/.github/workflows/test.yml +0 -150
  28. py_gbcms-2.0.0/.pre-commit-config.yaml +0 -31
  29. py_gbcms-2.0.0/Dockerfile +0 -55
  30. py_gbcms-2.0.0/Dockerfile.test +0 -39
  31. py_gbcms-2.0.0/Makefile +0 -104
  32. py_gbcms-2.0.0/PKG-INFO +0 -506
  33. py_gbcms-2.0.0/README.md +0 -462
  34. py_gbcms-2.0.0/docker-compose.yml +0 -26
  35. py_gbcms-2.0.0/docs/ADVANCED_FEATURES.md +0 -747
  36. py_gbcms-2.0.0/docs/ARCHITECTURE.md +0 -631
  37. py_gbcms-2.0.0/docs/CLI_FEATURES.md +0 -393
  38. py_gbcms-2.0.0/docs/COMPLETE_FEATURES_SUMMARY.md +0 -600
  39. py_gbcms-2.0.0/docs/CPP_FEATURE_COMPARISON.md +0 -334
  40. py_gbcms-2.0.0/docs/CYVCF2_SUPPORT.md +0 -406
  41. py_gbcms-2.0.0/docs/DOCKER_GUIDE.md +0 -589
  42. py_gbcms-2.0.0/docs/DOCKER_SUMMARY.md +0 -394
  43. py_gbcms-2.0.0/docs/FAQ.md +0 -476
  44. py_gbcms-2.0.0/docs/INPUT_OUTPUT.md +0 -469
  45. py_gbcms-2.0.0/docs/INSTALLATION.md +0 -124
  46. py_gbcms-2.0.0/docs/PACKAGE_STRUCTURE.md +0 -299
  47. py_gbcms-2.0.0/docs/PARALLELIZATION_GUIDE.md +0 -185
  48. py_gbcms-2.0.0/docs/QUICKSTART.md +0 -329
  49. py_gbcms-2.0.0/docs/README.md +0 -100
  50. py_gbcms-2.0.0/docs/SUMMARY.md +0 -40
  51. py_gbcms-2.0.0/docs/TESTING_GUIDE.md +0 -261
  52. py_gbcms-2.0.0/git-flow-helper.sh +0 -118
  53. py_gbcms-2.0.0/scripts/setup_and_test.sh +0 -145
  54. py_gbcms-2.0.0/scripts/test_docker.sh +0 -156
  55. py_gbcms-2.0.0/scripts/test_maf_workflow.sh +0 -164
  56. py_gbcms-2.0.0/scripts/test_vcf_workflow.sh +0 -116
  57. py_gbcms-2.0.0/scripts/validate_against_cpp.sh +0 -272
  58. py_gbcms-2.0.0/scripts/verify_installation.py +0 -147
  59. py_gbcms-2.0.0/src/gbcms/__init__.py +0 -13
  60. py_gbcms-2.0.0/src/gbcms/cli.py +0 -745
  61. py_gbcms-2.0.0/src/gbcms/config.py +0 -98
  62. py_gbcms-2.0.0/src/gbcms/counter.py +0 -1074
  63. py_gbcms-2.0.0/src/gbcms/models.py +0 -295
  64. py_gbcms-2.0.0/src/gbcms/numba_counter.py +0 -394
  65. py_gbcms-2.0.0/src/gbcms/output.py +0 -573
  66. py_gbcms-2.0.0/src/gbcms/parallel.py +0 -129
  67. py_gbcms-2.0.0/src/gbcms/processor.py +0 -293
  68. py_gbcms-2.0.0/src/gbcms/reference.py +0 -86
  69. py_gbcms-2.0.0/src/gbcms/variant.py +0 -390
  70. py_gbcms-2.0.0/tests/__init__.py +0 -1
  71. py_gbcms-2.0.0/tests/conftest.py +0 -117
  72. py_gbcms-2.0.0/tests/test_cli.py +0 -235
  73. py_gbcms-2.0.0/tests/test_config.py +0 -142
  74. py_gbcms-2.0.0/tests/test_counter.py +0 -188
  75. py_gbcms-2.0.0/tests/test_output.py +0 -191
  76. py_gbcms-2.0.0/tests/test_reference.py +0 -84
  77. py_gbcms-2.0.0/tests/test_variant.py +0 -159
  78. {py_gbcms-2.0.0 → py_gbcms-2.1.2}/LICENSE +0 -0
@@ -33,6 +33,9 @@ htmlcov/
33
33
  .tox/
34
34
  coverage.xml
35
35
  *.cover
36
+ .mypy_cache/
37
+ .ruff_cache/
38
+ .benchmarks/
36
39
 
37
40
  # IDEs
38
41
  .vscode/
@@ -56,6 +59,16 @@ test_data/
56
59
  *.maf
57
60
  output/
58
61
  results/
62
+ test_output*/
63
+ *.txt
64
+
65
+ # Exception: Allow test data files
66
+ !tests/testdata/*.bam
67
+ !tests/testdata/*.bam.bai
68
+ !tests/testdata/*.fa
69
+ !tests/testdata/*.fa.fai
70
+ !tests/testdata/*.vcf
71
+ !tests/testdata/*.maf
59
72
 
60
73
  # Logs
61
74
  *.log
@@ -63,3 +76,9 @@ results/
63
76
  # Temporary files
64
77
  tmp/
65
78
  temp/
79
+
80
+ # Rust
81
+ target/
82
+
83
+ # Documentation
84
+ site/
@@ -0,0 +1,192 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [2.1.2] - 2025-11-25
9
+
10
+ ### 🔧 Fixed
11
+ - **PyPI Distribution**: Fixed source distribution size issue by correctly excluding large files (tests, docs, etc.) via `pyproject.toml` configuration.
12
+
13
+ ## [2.1.1] - 2025-11-25 [YANKED]
14
+
15
+ > [!WARNING]
16
+ > This release was yanked from PyPI due to a source distribution size limit error. Use 2.1.2 instead.
17
+
18
+ ### 🔧 Fixed
19
+ - **PyPI Distribution**: Added `MANIFEST.in` in an attempt to reduce source distribution size (this did not work with Hatchling; superseded by the `pyproject.toml` fix in 2.1.2)
20
+ - **Documentation**: Added comprehensive Installation guide
21
+ - **Documentation**: Unified Contributing guide (merged code + docs contributions)
22
+ - **Documentation**: Added Changelog to documentation navigation
23
+
24
+ ## [2.1.0] - 2025-11-25
25
+
26
+ ### ✨ Added
27
+
28
+ #### Nextflow Workflow
29
+ - **Production-ready Nextflow workflow** for processing multiple samples in parallel
30
+ - **SLURM cluster support** with customizable queue configuration
31
+ - **Per-sample suffix support** via optional `suffix` column in samplesheet
32
+ - **Docker and Singularity profiles** for containerized execution
33
+ - **Automatic BAI index discovery** with validation
34
+ - **Resume capability** for failed workflow runs
35
+ - **Resource management** with automatic retry and scaling
36
+ - **Comprehensive documentation** in `docs/NEXTFLOW.md` and `nextflow/README.md`
37
+
38
+ #### Documentation
39
+ - **Usage pattern comparison** guide (`docs/WORKFLOWS.md`) for choosing between CLI and Nextflow
40
+ - **MkDocs integration** for beautiful GitHub Pages documentation
41
+ - **Local documentation preview** with live reload (`mkdocs serve`)
42
+ - **Staging deployment** from `develop` branch for testing docs
43
+ - **Production deployment** from `main` branch
44
+ - **Reorganized documentation structure** with clear CLI vs Nextflow separation
45
+ - **CLI Quick Start guide** (`docs/quick-start.md`)
46
+
47
+ ### 🔧 Changed
48
+ - **Documentation workflow**: docs now live on `main` branch with automated deployment
49
+ - **GitBook integration**: configured to read from `main` branch
50
+ - **Nextflow module**: improved parameter passing with meta.suffix support
51
+
52
+ ### 📝 Documentation
53
+ - Complete Nextflow workflow guide with SLURM examples
54
+ - Per-sample suffix usage examples
55
+ - Git-flow documentation workflow guide
56
+ - Local preview instructions
57
+ - Updated README with clear usage pattern separation
58
+
59
+ ## [2.0.0] - 2025-11-21
60
+
61
+ ### 🚀 Major Rewrite
62
+
63
+ Version 2.0.0 represents a complete rewrite of py-gbcms with a focus on performance, correctness, and modern architecture.
64
+
65
+ ### ✨ Added
66
+
67
+ #### Core Features
68
+ - **Rust-based Counting Engine**: Hybrid Python/Rust architecture for 20x+ performance improvement
69
+ - **Strand Bias Statistics**: Fisher's exact test p-values and odds ratios for both reads (`SB_PVAL`, `SB_OR`) and fragments (`FSB_PVAL`, `FSB_OR`)
70
+ - **Fragment-Level Counting**: Majority-rule fragment counting with strand-specific counts (`RDF`, `ADF`)
71
+ - **Variant Allele Fractions**: Read-level (`VAF`) and fragment-level (`FAF`) allele fraction calculations
72
+ - **Thread Control**: Explicit control over parallelism via `--threads` argument (default: 1)
73
+
74
+ #### Input/Output
75
+ - **VCF Output Format**: Standard VCF with comprehensive INFO and FORMAT fields
76
+ - **MAF Output Format**: Extended MAF with custom columns for strand counts and statistics
77
+ - **Column Preservation**: Input MAF columns are preserved in output
78
+ - **Multiple BAM Support**: Process multiple samples via `--bam-list` or repeated `--bam` arguments
79
+ - **Sample ID Override**: Explicit sample naming via `--bam sample_id:path` syntax
80
+
81
+ #### Filters
82
+ - `--filter-duplicates`: Filter duplicate reads (default: enabled)
83
+ - `--filter-secondary`: Filter secondary alignments
84
+ - `--filter-supplementary`: Filter supplementary alignments
85
+ - `--filter-qc-failed`: Filter reads that failed QC
86
+ - `--filter-improper-pair`: Filter improperly paired reads
87
+ - `--filter-indel`: Filter reads with indels in CIGAR
88
+
89
+ #### CLI & Usability
90
+ - **Modern CLI**: Built with Typer and Rich for beautiful terminal output
91
+ - **Progress Tracking**: Real-time progress bars and status indicators
92
+ - **Direct Invocation**: Use `gbcms run` instead of `python -m gbcms.cli`
93
+ - **Output Customization**: `--suffix` flag for output filename customization
94
+ - **Flexible Input**: Support for both VCF and MAF input formats
95
+
96
+ #### Infrastructure
97
+ - **Docker Support**: Production-ready multi-stage Dockerfile with optimized layers
98
+ - **Type Safety**: Full type annotations with mypy support
99
+ - **Type Stubs**: Provided `.pyi` stub file for Rust extension
100
+ - **Comprehensive Tests**: Extended test suite with accuracy and filter validation
101
+ - **CI/CD**: GitHub Actions workflows for testing, linting, and releases
102
+
103
+ ### 🔄 Changed
104
+
105
+ #### Architecture
106
+ - Migrated from pure Python to hybrid Python/Rust architecture
107
+ - Core counting logic implemented in Rust using `rust-htslib`
108
+ - Data parallelism over variants with per-thread BAM readers
109
+
110
+ #### Output Formats
111
+ - **VCF FORMAT fields**: Strand-specific counts now use comma-separated values (e.g., `RD=5,3` for forward,reverse)
112
+ - **MAF columns**: Standardized column names (`t_ref_count_forward`, `t_alt_count_reverse`, etc.)
113
+ - **Coordinate System**: Internal 0-based indexing with correct conversion for VCF (1-based) and MAF output
114
+
115
+ #### Performance
116
+ - **Speed**: 20x+ faster than v1.x on typical datasets
117
+ - **Memory**: Efficient per-thread BAM readers with minimal overhead
118
+ - **Scalability**: Configurable thread pool for optimal resource usage
119
+
120
+ #### Dependencies
121
+ - **Python**: Updated to require Python ≥3.10
122
+ - **Rust**: pyo3 0.27.1, rust-htslib 0.51.0, statrs 0.18.0
123
+ - **Python Packages**: pysam ≥0.21.0, typer ≥0.9.0, rich ≥13.0.0, pydantic ≥2.0.0
124
+
125
+ ### 🗑️ Removed
126
+
127
+ - **Legacy Python Counting**: Pure Python implementation removed in favor of Rust
128
+ - **Old CLI**: Deprecated `python -m gbcms.cli` entry point
129
+ - **Unused Dependencies**: Removed `cyvcf2` and `numba` (no longer needed)
130
+ - **Pre-commit Hooks**: Removed in favor of explicit linting in CI
131
+
132
+ ### 🐛 Fixed
133
+
134
+ - Correct handling of complex variants (MNPs, DelIns)
135
+ - Proper strand assignment for fragment counting
136
+ - Reference validation against FASTA for all variant types
137
+ - Thread-safe BAM access with per-thread readers
138
+
139
+ ### 📚 Documentation
140
+
141
+ - Complete rewrite of all documentation
142
+ - New guides: `INSTALLATION.md`, `CLI_FEATURES.md`, `INPUT_OUTPUT.md`
143
+ - Comprehensive API documentation
144
+ - Docker usage examples
145
+ - Contributing guidelines updated
146
+
147
+ ### 🔧 Technical Details
148
+
149
+ #### Rust Components
150
+ - `gbcms_rs`: PyO3-based extension module
151
+ - Fisher's exact test via `statrs` crate
152
+ - Rayon-based parallelism with configurable thread pools
153
+ - Safe memory management with Rust's ownership model
154
+
155
+ #### Testing
156
+ - 16 comprehensive test cases
157
+ - Accuracy validation with synthetic BAM files
158
+ - Filter validation for all read flag combinations
159
+ - Integration tests with real-world data
160
+
161
+ ### ⚠️ Breaking Changes
162
+
163
+ Version 2.0.0 is **not backward compatible** with 1.x. Key breaking changes:
164
+
165
+ 1. **CLI syntax**: Use `gbcms run` instead of `python -m gbcms.cli`
166
+ 2. **Output format**: VCF/MAF column structures have changed
167
+ 3. **Default behavior**: Only duplicate filtering enabled by default (was: all filters)
168
+ 4. **Dependencies**: Requires Rust toolchain for installation from source
169
+ 5. **Python version**: Minimum Python 3.10 (was: 3.8)
170
+
171
+ ### 📦 Installation
172
+
173
+ ```bash
174
+ # From PyPI (includes pre-built wheels)
175
+ pip install py-gbcms
176
+
177
+ # From source (requires Rust)
178
+ pip install git+https://github.com/msk-access/py-gbcms.git
179
+
180
+ # Docker
181
+ docker pull ghcr.io/msk-access/py-gbcms:2.0.0
182
+ ```
183
+
184
+ ### 🙏 Acknowledgments
185
+
186
+ This rewrite was designed and implemented with a focus on correctness, performance, and modern best practices in bioinformatics software development.
187
+
188
+ ---
189
+
190
+ ## [1.x] - Legacy
191
+
192
+ Previous versions (1.x) used a pure Python implementation. See git history for details.
@@ -20,10 +20,7 @@ Thank you for your interest in contributing to GetBaseCounts! This document prov
20
20
  uv pip install -e ".[dev]"
21
21
  ```
22
22
 
23
- 4. **Install pre-commit hooks**
24
- ```bash
25
- pre-commit install
26
- ```
23
+
27
24
 
28
25
  ## Development Workflow
29
26
 
@@ -64,10 +61,6 @@ mypy src/
64
61
  ```bash
65
62
  # Build Docker image
66
63
  docker build -t gbcms:latest .
67
-
68
- # Run tests in Docker
69
- docker build -f Dockerfile.test -t gbcms:test .
70
- docker run --rm gbcms:test
71
64
  ```
72
65
 
73
66
  ## Code Style
@@ -0,0 +1,216 @@
1
+ Metadata-Version: 2.4
2
+ Name: py-gbcms
3
+ Version: 2.1.2
4
+ Summary: Python implementation of GetBaseCountsMultiSample (gbcms) for calculating base counts in BAM files
5
+ Project-URL: Homepage, https://github.com/msk-access/py-gbcms
6
+ Project-URL: Repository, https://github.com/msk-access/py-gbcms
7
+ Project-URL: Documentation, https://github.com/msk-access/py-gbcms#readme
8
+ Project-URL: Bug Tracker, https://github.com/msk-access/py-gbcms/issues
9
+ Author-email: MSK-ACCESS <shahr2@mskcc.org>
10
+ License: AGPL-3.0
11
+ License-File: LICENSE
12
+ Keywords: bam,base-counts,bioinformatics,gbcms,genomics,maf,vcf
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: pydantic>=2.0.0
21
+ Requires-Dist: pysam>=0.21.0
22
+ Requires-Dist: rich>=13.0.0
23
+ Requires-Dist: typer>=0.9.0
24
+ Provides-Extra: all
25
+ Provides-Extra: dev
26
+ Requires-Dist: black>=23.0.0; extra == 'dev'
27
+ Requires-Dist: mkdocs-material>=9.0.0; extra == 'dev'
28
+ Requires-Dist: mypy>=1.5.0; extra == 'dev'
29
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
30
+ Requires-Dist: pytest-mock>=3.11.0; extra == 'dev'
31
+ Requires-Dist: pytest>=7.4.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
33
+ Requires-Dist: types-pyyaml>=6.0.0; extra == 'dev'
34
+ Provides-Extra: fast
35
+ Description-Content-Type: text/markdown
36
+
37
+ # py-gbcms
38
+
39
+ **Complete orientation-aware counting system for genomic variants**
40
+
41
+ [![Tests](https://github.com/msk-access/py-gbcms/workflows/Tests/badge.svg)](https://github.com/msk-access/py-gbcms/actions)
42
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
43
+
44
+ ## Features
45
+
46
+ - 🚀 **High Performance**: Rust-powered core engine with multi-threading
47
+ - 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
48
+ - 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
49
+ - 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
50
+ - 📁 **Flexible I/O**: VCF and MAF input/output formats
51
+ - 🎯 **Quality Filters**: 7 configurable read filtering options
52
+
53
+ ## Installation
54
+
55
+ **Quick install:**
56
+ ```bash
57
+ pip install py-gbcms
58
+ ```
59
+
60
+ **From source (requires Rust):**
61
+ ```bash
62
+ git clone https://github.com/msk-access/py-gbcms.git
63
+ cd py-gbcms
64
+ pip install .
65
+ ```
66
+
67
+ **Docker:**
68
+ ```bash
69
+ docker pull ghcr.io/msk-access/py-gbcms:2.1.0
70
+ ```
71
+
72
+ 📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
73
+
74
+ ---
75
+
76
+ ## Usage
77
+
78
+ `py-gbcms` can be used in two ways:
79
+
80
+ ### 🔧 Option 1: Standalone CLI (1-10 samples)
81
+
82
+ **Best for:** Quick analysis, local processing, direct control
83
+
84
+ ```bash
85
+ gbcms run \
86
+ --variants variants.vcf \
87
+ --bam sample1.bam \
88
+ --fasta reference.fa \
89
+ --output-dir results/
90
+ ```
91
+
92
+ **Output:** `results/sample1.vcf`
93
+
94
+ **Learn more:**
95
+ - 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
96
+ - 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
97
+
98
+ ---
99
+
100
+ ### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
101
+
102
+ **Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
103
+
104
+ ```bash
105
+ nextflow run nextflow/main.nf \
106
+ --input samplesheet.csv \
107
+ --variants variants.vcf \
108
+ --fasta reference.fa \
109
+ -profile slurm
110
+ ```
111
+
112
+ **Features:**
113
+ - ✅ Automatic parallelization across samples
114
+ - ✅ SLURM/HPC integration
115
+ - ✅ Container support (Docker/Singularity)
116
+ - ✅ Resume failed runs
117
+
118
+ **Learn more:**
119
+ - 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
120
+ - 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
121
+
122
+ ---
123
+
124
+ ## Which Should I Use?
125
+
126
+ | Scenario | Recommendation |
127
+ |----------|----------------|
128
+ | 1-10 samples, local machine | **CLI** |
129
+ | 10+ samples, HPC cluster | **Nextflow** |
130
+ | Quick ad-hoc analysis | **CLI** |
131
+ | Production pipeline | **Nextflow** |
132
+ | Need auto-parallelization | **Nextflow** |
133
+ | Full manual control | **CLI** |
134
+
135
+ ---
136
+
137
+ ## Quick Examples
138
+
139
+ ### CLI: Single Sample
140
+ ```bash
141
+ gbcms run \
142
+ --variants variants.vcf \
143
+ --bam tumor.bam \
144
+ --fasta hg19.fa \
145
+ --output-dir results/ \
146
+ --threads 4
147
+ ```
148
+
149
+ ### CLI: Multiple Samples (Sequential)
150
+ ```bash
151
+ gbcms run \
152
+ --variants variants.vcf \
153
+ --bam-list samples.txt \
154
+ --fasta hg19.fa \
155
+ --output-dir results/
156
+ ```
157
+
158
+ ### Nextflow: Many Samples (Parallel)
159
+ ```bash
160
+ # samplesheet.csv:
161
+ # sample,bam,bai
162
+ # tumor1,/path/to/tumor1.bam,
163
+ # tumor2,/path/to/tumor2.bam,
164
+
165
+ nextflow run nextflow/main.nf \
166
+ --input samplesheet.csv \
167
+ --variants variants.vcf \
168
+ --fasta hg19.fa \
169
+ --outdir results \
170
+ -profile slurm
171
+ ```
172
+
173
+ ---
174
+
175
+ ## Documentation
176
+
177
+ 📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
178
+
179
+ **Quick Links:**
180
+ - [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
181
+ - [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
182
+ - [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
183
+ - [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
184
+ - [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
185
+ - [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
186
+
187
+ ---
188
+
189
+ ## Contributing
190
+
191
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
192
+
193
+ To contribute to documentation, edit the Markdown sources under `docs/`; the site is built and deployed automatically (staging from `develop`, production from `main`).
194
+
195
+ ---
196
+
197
+ ## Citation
198
+
199
+ If you use `py-gbcms` in your research, please cite:
200
+
201
+ ```
202
+ [Citation to be added]
203
+ ```
204
+
205
+ ---
206
+
207
+ ## License
208
+
209
+ AGPL-3.0 - see [LICENSE](LICENSE) for details.
210
+
211
+ ---
212
+
213
+ ## Support
214
+
215
+ - 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
216
+ - 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions
@@ -0,0 +1,180 @@
1
+ # py-gbcms
2
+
3
+ **Complete orientation-aware counting system for genomic variants**
4
+
5
+ [![Tests](https://github.com/msk-access/py-gbcms/workflows/Tests/badge.svg)](https://github.com/msk-access/py-gbcms/actions)
6
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
7
+
8
+ ## Features
9
+
10
+ - 🚀 **High Performance**: Rust-powered core engine with multi-threading
11
+ - 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
12
+ - 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
13
+ - 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
14
+ - 📁 **Flexible I/O**: VCF and MAF input/output formats
15
+ - 🎯 **Quality Filters**: 7 configurable read filtering options
16
+
17
+ ## Installation
18
+
19
+ **Quick install:**
20
+ ```bash
21
+ pip install py-gbcms
22
+ ```
23
+
24
+ **From source (requires Rust):**
25
+ ```bash
26
+ git clone https://github.com/msk-access/py-gbcms.git
27
+ cd py-gbcms
28
+ pip install .
29
+ ```
30
+
31
+ **Docker:**
32
+ ```bash
33
+ docker pull ghcr.io/msk-access/py-gbcms:2.1.0
34
+ ```
35
+
36
+ 📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
37
+
38
+ ---
39
+
40
+ ## Usage
41
+
42
+ `py-gbcms` can be used in two ways:
43
+
44
+ ### 🔧 Option 1: Standalone CLI (1-10 samples)
45
+
46
+ **Best for:** Quick analysis, local processing, direct control
47
+
48
+ ```bash
49
+ gbcms run \
50
+ --variants variants.vcf \
51
+ --bam sample1.bam \
52
+ --fasta reference.fa \
53
+ --output-dir results/
54
+ ```
55
+
56
+ **Output:** `results/sample1.vcf`
57
+
58
+ **Learn more:**
59
+ - 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
60
+ - 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
61
+
62
+ ---
63
+
64
+ ### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
65
+
66
+ **Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
67
+
68
+ ```bash
69
+ nextflow run nextflow/main.nf \
70
+ --input samplesheet.csv \
71
+ --variants variants.vcf \
72
+ --fasta reference.fa \
73
+ -profile slurm
74
+ ```
75
+
76
+ **Features:**
77
+ - ✅ Automatic parallelization across samples
78
+ - ✅ SLURM/HPC integration
79
+ - ✅ Container support (Docker/Singularity)
80
+ - ✅ Resume failed runs
81
+
82
+ **Learn more:**
83
+ - 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
84
+ - 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
85
+
86
+ ---
87
+
88
+ ## Which Should I Use?
89
+
90
+ | Scenario | Recommendation |
91
+ |----------|----------------|
92
+ | 1-10 samples, local machine | **CLI** |
93
+ | 10+ samples, HPC cluster | **Nextflow** |
94
+ | Quick ad-hoc analysis | **CLI** |
95
+ | Production pipeline | **Nextflow** |
96
+ | Need auto-parallelization | **Nextflow** |
97
+ | Full manual control | **CLI** |
98
+
99
+ ---
100
+
101
+ ## Quick Examples
102
+
103
+ ### CLI: Single Sample
104
+ ```bash
105
+ gbcms run \
106
+ --variants variants.vcf \
107
+ --bam tumor.bam \
108
+ --fasta hg19.fa \
109
+ --output-dir results/ \
110
+ --threads 4
111
+ ```
112
+
113
+ ### CLI: Multiple Samples (Sequential)
114
+ ```bash
115
+ gbcms run \
116
+ --variants variants.vcf \
117
+ --bam-list samples.txt \
118
+ --fasta hg19.fa \
119
+ --output-dir results/
120
+ ```
121
+
122
+ ### Nextflow: Many Samples (Parallel)
123
+ ```bash
124
+ # samplesheet.csv:
125
+ # sample,bam,bai
126
+ # tumor1,/path/to/tumor1.bam,
127
+ # tumor2,/path/to/tumor2.bam,
128
+
129
+ nextflow run nextflow/main.nf \
130
+ --input samplesheet.csv \
131
+ --variants variants.vcf \
132
+ --fasta hg19.fa \
133
+ --outdir results \
134
+ -profile slurm
135
+ ```
136
+
137
+ ---
138
+
139
+ ## Documentation
140
+
141
+ 📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
142
+
143
+ **Quick Links:**
144
+ - [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
145
+ - [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
146
+ - [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
147
+ - [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
148
+ - [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
149
+ - [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
150
+
151
+ ---
152
+
153
+ ## Contributing
154
+
155
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
156
+
157
+ To contribute to documentation, edit the Markdown sources under `docs/`; the site is built and deployed automatically (staging from `develop`, production from `main`).
158
+
159
+ ---
160
+
161
+ ## Citation
162
+
163
+ If you use `py-gbcms` in your research, please cite:
164
+
165
+ ```
166
+ [Citation to be added]
167
+ ```
168
+
169
+ ---
170
+
171
+ ## License
172
+
173
+ AGPL-3.0 - see [LICENSE](LICENSE) for details.
174
+
175
+ ---
176
+
177
+ ## Support
178
+
179
+ - 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
180
+ - 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions