py-gbcms 2.0.0__tar.gz → 2.2.0__tar.gz

This diff compares the contents of two publicly released versions of a package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (81)
  1. {py_gbcms-2.0.0 → py_gbcms-2.2.0}/LICENSE +49 -52
  2. py_gbcms-2.2.0/PKG-INFO +216 -0
  3. py_gbcms-2.2.0/README.md +180 -0
  4. {py_gbcms-2.0.0 → py_gbcms-2.2.0}/pyproject.toml +20 -31
  5. py_gbcms-2.2.0/rust/.gitignore +72 -0
  6. py_gbcms-2.2.0/rust/Cargo.lock +1395 -0
  7. py_gbcms-2.2.0/rust/Cargo.toml +20 -0
  8. py_gbcms-2.2.0/rust/src/counting.rs +663 -0
  9. py_gbcms-2.2.0/rust/src/lib.rs +16 -0
  10. py_gbcms-2.2.0/rust/src/stats.rs +79 -0
  11. py_gbcms-2.2.0/rust/src/types.rs +90 -0
  12. py_gbcms-2.2.0/src/gbcms/__init__.py +23 -0
  13. py_gbcms-2.2.0/src/gbcms/_rs.pyi +49 -0
  14. py_gbcms-2.2.0/src/gbcms/cli.py +204 -0
  15. py_gbcms-2.2.0/src/gbcms/core/__init__.py +9 -0
  16. py_gbcms-2.2.0/src/gbcms/core/kernel.py +128 -0
  17. py_gbcms-2.2.0/src/gbcms/io/__init__.py +18 -0
  18. py_gbcms-2.2.0/src/gbcms/io/input.py +227 -0
  19. py_gbcms-2.2.0/src/gbcms/io/output.py +354 -0
  20. py_gbcms-2.2.0/src/gbcms/models/__init__.py +27 -0
  21. py_gbcms-2.2.0/src/gbcms/models/core.py +172 -0
  22. py_gbcms-2.2.0/src/gbcms/pipeline.py +257 -0
  23. py_gbcms-2.2.0/src/gbcms/py.typed +0 -0
  24. py_gbcms-2.2.0/src/gbcms/utils/__init__.py +14 -0
  25. py_gbcms-2.2.0/src/gbcms/utils/logging.py +123 -0
  26. py_gbcms-2.0.0/.gitbook.yaml +0 -8
  27. py_gbcms-2.0.0/.github/workflows/release.yml +0 -130
  28. py_gbcms-2.0.0/.github/workflows/test.yml +0 -150
  29. py_gbcms-2.0.0/.gitignore +0 -65
  30. py_gbcms-2.0.0/.pre-commit-config.yaml +0 -31
  31. py_gbcms-2.0.0/CONTRIBUTING.md +0 -154
  32. py_gbcms-2.0.0/Dockerfile +0 -55
  33. py_gbcms-2.0.0/Dockerfile.test +0 -39
  34. py_gbcms-2.0.0/Makefile +0 -104
  35. py_gbcms-2.0.0/PKG-INFO +0 -506
  36. py_gbcms-2.0.0/README.md +0 -462
  37. py_gbcms-2.0.0/docker-compose.yml +0 -26
  38. py_gbcms-2.0.0/docs/ADVANCED_FEATURES.md +0 -747
  39. py_gbcms-2.0.0/docs/ARCHITECTURE.md +0 -631
  40. py_gbcms-2.0.0/docs/CLI_FEATURES.md +0 -393
  41. py_gbcms-2.0.0/docs/COMPLETE_FEATURES_SUMMARY.md +0 -600
  42. py_gbcms-2.0.0/docs/CPP_FEATURE_COMPARISON.md +0 -334
  43. py_gbcms-2.0.0/docs/CYVCF2_SUPPORT.md +0 -406
  44. py_gbcms-2.0.0/docs/DOCKER_GUIDE.md +0 -589
  45. py_gbcms-2.0.0/docs/DOCKER_SUMMARY.md +0 -394
  46. py_gbcms-2.0.0/docs/FAQ.md +0 -476
  47. py_gbcms-2.0.0/docs/INPUT_OUTPUT.md +0 -469
  48. py_gbcms-2.0.0/docs/INSTALLATION.md +0 -124
  49. py_gbcms-2.0.0/docs/PACKAGE_STRUCTURE.md +0 -299
  50. py_gbcms-2.0.0/docs/PARALLELIZATION_GUIDE.md +0 -185
  51. py_gbcms-2.0.0/docs/QUICKSTART.md +0 -329
  52. py_gbcms-2.0.0/docs/README.md +0 -100
  53. py_gbcms-2.0.0/docs/SUMMARY.md +0 -40
  54. py_gbcms-2.0.0/docs/TESTING_GUIDE.md +0 -261
  55. py_gbcms-2.0.0/git-flow-helper.sh +0 -118
  56. py_gbcms-2.0.0/scripts/setup_and_test.sh +0 -145
  57. py_gbcms-2.0.0/scripts/test_docker.sh +0 -156
  58. py_gbcms-2.0.0/scripts/test_maf_workflow.sh +0 -164
  59. py_gbcms-2.0.0/scripts/test_vcf_workflow.sh +0 -116
  60. py_gbcms-2.0.0/scripts/validate_against_cpp.sh +0 -272
  61. py_gbcms-2.0.0/scripts/verify_installation.py +0 -147
  62. py_gbcms-2.0.0/src/gbcms/__init__.py +0 -13
  63. py_gbcms-2.0.0/src/gbcms/cli.py +0 -745
  64. py_gbcms-2.0.0/src/gbcms/config.py +0 -98
  65. py_gbcms-2.0.0/src/gbcms/counter.py +0 -1074
  66. py_gbcms-2.0.0/src/gbcms/models.py +0 -295
  67. py_gbcms-2.0.0/src/gbcms/numba_counter.py +0 -394
  68. py_gbcms-2.0.0/src/gbcms/output.py +0 -573
  69. py_gbcms-2.0.0/src/gbcms/parallel.py +0 -129
  70. py_gbcms-2.0.0/src/gbcms/processor.py +0 -293
  71. py_gbcms-2.0.0/src/gbcms/reference.py +0 -86
  72. py_gbcms-2.0.0/src/gbcms/variant.py +0 -390
  73. py_gbcms-2.0.0/tests/__init__.py +0 -1
  74. py_gbcms-2.0.0/tests/conftest.py +0 -117
  75. py_gbcms-2.0.0/tests/test_cli.py +0 -235
  76. py_gbcms-2.0.0/tests/test_config.py +0 -142
  77. py_gbcms-2.0.0/tests/test_counter.py +0 -188
  78. py_gbcms-2.0.0/tests/test_output.py +0 -191
  79. py_gbcms-2.0.0/tests/test_reference.py +0 -84
  80. py_gbcms-2.0.0/tests/test_variant.py +0 -159
  81. py_gbcms-2.0.0/uv.lock +0 -1237
@@ -1,16 +1,11 @@
1
1
  GNU AFFERO GENERAL PUBLIC LICENSE
2
2
  Version 3, 19 November 2007
3
3
 
4
- Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
5
  Everyone is permitted to copy and distribute verbatim copies
6
6
  of this license document, but changing it is not allowed.
7
7
 
8
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
9
-
10
- 1. Definitions.
11
-
12
- "License" shall mean the terms and conditions for use, reproduction,
13
- and distribution as defined by Sections 1 through 9 of this document.
8
+ Preamble
14
9
 
15
10
  The GNU Affero General Public License is a free, copyleft license for
16
11
  software and other kinds of works, specifically designed to ensure
@@ -18,15 +13,16 @@ cooperation with the community in the case of network server software.
18
13
 
19
14
  The licenses for most software and other practical works are designed
20
15
  to take away your freedom to share and change the works. By contrast,
21
- the GNU Affero General Public License is intended to guarantee your
22
- freedom to share and change all versions of a program--to make sure it
23
- remains free software for all its users. When we speak of free software,
24
- we are referring to freedom, not price. Our General Public Licenses
25
- are designed to make sure that you have the freedom to distribute copies
26
- of free software (and charge for them if you wish), that you receive
27
- source code or can get it if you want it, that you can change the
28
- software or use pieces of it in new free programs, and that you know
29
- you can do these things.
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
30
26
 
31
27
  Developers that use our General Public Licenses protect your rights
32
28
  with two steps: (1) assert copyright on the software, and (2) offer
@@ -39,23 +35,39 @@ receive widespread use, become available for other developers to
39
35
  incorporate. Many developers of free software are heartened and
40
36
  encouraged by the resulting cooperation. However, in the case of
41
37
  software used on network servers, this result may fail to come about.
42
- The GNU Affero General Public License includes a provision that helps
43
- developers of such software achieve the same cooperation we expect
44
- from other free software.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
45
58
 
46
- For example, if you distribute copies of such a program, whether
47
- gratis or for a fee, and make the source code available to users so
48
- they can modify the program while keeping the network server running,
49
- you must offer the source code under the GNU Affero General Public
50
- License.
59
+ TERMS AND CONDITIONS
51
60
 
52
- The GNU Affero General Public License is based on the GNU General
53
- Public License, but includes an additional permission and a requirement
54
- regarding network server software that is different from the GNU GPL.
61
+ 0. Definitions.
55
62
 
56
- "The Program" here refers to any copyrightable work licensed under
57
- the GNU Affero General Public License. Each licensee is addressed as
58
- "you". "Licensees" and "recipients" may be individuals or organizations.
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
59
71
 
60
72
  To "modify" a work means to copy from or adapt all or part of the work
61
73
  in a fashion requiring copyright permission, other than the making of an
@@ -262,9 +274,9 @@ in one of these ways:
262
274
  available for as long as needed to satisfy these requirements.
263
275
 
264
276
  e) Convey the object code using peer-to-peer transmission, provided
265
- you inform other peers where the object and its Corresponding Source
266
- are being offered to the general public at no charge under
267
- subsection 6d.
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
268
280
 
269
281
  A separable portion of the object code, whose source code is excluded
270
282
  from the Corresponding Source as a System Library, need not be
@@ -353,7 +365,7 @@ that material) supplement the terms of this License with terms:
353
365
  authors of the material; or
354
366
 
355
367
  e) Declining to grant rights under trademark law for use of some
356
- trade names, marks, or slogans; or
368
+ trade names, trademarks, or service marks; or
357
369
 
358
370
  f) Requiring indemnification of licensors and authors of that
359
371
  material by anyone who conveys the material (or modified versions of
@@ -533,7 +545,7 @@ interacting with it remotely through a computer network (if your version
533
545
  supports such interaction) an opportunity to receive the Corresponding
534
546
  Source of your version by providing access to the Corresponding Source
535
547
  from a network server at no charge, through some standard or customary
536
- means of facilitating copying of software. This corresponding source
548
+ means of facilitating copying of software. This Corresponding Source
537
549
  shall include the Corresponding Source for any work covered by version 3
538
550
  of the GNU General Public License that is incorporated pursuant to the
539
551
  following paragraph.
@@ -631,7 +643,7 @@ the "copyright" line and a pointer to where the full notice is found.
631
643
  GNU Affero General Public License for more details.
632
644
 
633
645
  You should have received a copy of the GNU Affero General Public License
634
- along with this program. If not, see <http://www.gnu.org/licenses/>.
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
635
647
 
636
648
  Also add information on how to contact you by electronic and paper mail.
637
649
 
@@ -646,19 +658,4 @@ specific requirements.
646
658
  You should also get your employer (if you work as a programmer) or school,
647
659
  if any, to sign a "copyright disclaimer" for the program, if necessary.
648
660
  For more information on this, and how to apply and follow the GNU AGPL, see
649
- <http://www.gnu.org/licenses/>.
650
-
651
- Copyright 2024 MSK-ACCESS Team
652
-
653
- This program is free software: you can redistribute it and/or modify
654
- it under the terms of the GNU Affero General Public License as published by
655
- the Free Software Foundation, either version 3 of the License, or
656
- (at your option) any later version.
657
-
658
- This program is distributed in the hope that it will be useful,
659
- but WITHOUT ANY WARRANTY; without even the implied warranty of
660
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
661
- GNU Affero General Public License for more details.
662
-
663
- You should have received a copy of the GNU Affero General Public License
664
- along with this program. If not, see <http://www.gnu.org/licenses/>.
661
+ <https://www.gnu.org/licenses/>.
@@ -0,0 +1,216 @@
1
+ Metadata-Version: 2.4
2
+ Name: py-gbcms
3
+ Version: 2.2.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Science/Research
6
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
7
+ Classifier: Programming Language :: Python :: 3.11
8
+ Classifier: Programming Language :: Python :: 3.12
9
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
10
+ Requires-Dist: pysam>=0.21.0
11
+ Requires-Dist: typer>=0.9.0
12
+ Requires-Dist: rich>=13.0.0
13
+ Requires-Dist: pydantic>=2.0.0
14
+ Requires-Dist: pytest>=7.4.0 ; extra == 'dev'
15
+ Requires-Dist: pytest-cov>=4.1.0 ; extra == 'dev'
16
+ Requires-Dist: pytest-mock>=3.11.0 ; extra == 'dev'
17
+ Requires-Dist: black>=23.0.0 ; extra == 'dev'
18
+ Requires-Dist: ruff>=0.1.0 ; extra == 'dev'
19
+ Requires-Dist: mypy>=1.5.0 ; extra == 'dev'
20
+ Requires-Dist: types-pyyaml>=6.0.0 ; extra == 'dev'
21
+ Requires-Dist: mkdocs-material>=9.0.0 ; extra == 'dev'
22
+ Provides-Extra: all
23
+ Provides-Extra: dev
24
+ Provides-Extra: fast
25
+ License-File: LICENSE
26
+ Summary: Python implementation of GetBaseCountsMultiSample (gbcms) for calculating base counts in BAM files
27
+ Keywords: bioinformatics,genomics,bam,vcf,maf,base-counts,gbcms
28
+ Author-email: MSK-ACCESS <shahr2@mskcc.org>
29
+ Requires-Python: >=3.10
30
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
31
+ Project-URL: Bug Tracker, https://github.com/msk-access/py-gbcms/issues
32
+ Project-URL: Documentation, https://github.com/msk-access/py-gbcms#readme
33
+ Project-URL: Homepage, https://github.com/msk-access/py-gbcms
34
+ Project-URL: Repository, https://github.com/msk-access/py-gbcms
35
+
36
+ # py-gbcms
37
+
38
+ **Complete orientation-aware counting system for genomic variants**
39
+
40
+ [![Tests](https://github.com/msk-access/py-gbcms/workflows/Tests/badge.svg)](https://github.com/msk-access/py-gbcms/actions)
41
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
42
+
43
+ ## Features
44
+
45
+ - 🚀 **High Performance**: Rust-powered core engine with multi-threading
46
+ - 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
47
+ - 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
48
+ - 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
49
+ - 📁 **Flexible I/O**: VCF and MAF input/output formats
50
+ - 🎯 **Quality Filters**: 7 configurable read filtering options
51
+
52
+ ## Installation
53
+
54
+ **Quick install:**
55
+ ```bash
56
+ pip install py-gbcms
57
+ ```
58
+
59
+ **From source (requires Rust):**
60
+ ```bash
61
+ git clone https://github.com/msk-access/py-gbcms.git
62
+ cd py-gbcms
63
+ pip install .
64
+ ```
65
+
66
+ **Docker:**
67
+ ```bash
68
+ docker pull ghcr.io/msk-access/py-gbcms:2.1.0
69
+ ```
70
+
71
+ 📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
72
+
73
+ ---
74
+
75
+ ## Usage
76
+
77
+ `py-gbcms` can be used in two ways:
78
+
79
+ ### 🔧 Option 1: Standalone CLI (1-10 samples)
80
+
81
+ **Best for:** Quick analysis, local processing, direct control
82
+
83
+ ```bash
84
+ gbcms run \
85
+ --variants variants.vcf \
86
+ --bam sample1.bam \
87
+ --fasta reference.fa \
88
+ --output-dir results/
89
+ ```
90
+
91
+ **Output:** `results/sample1.vcf`
92
+
93
+ **Learn more:**
94
+ - 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
95
+ - 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
96
+
97
+ ---
98
+
99
+ ### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
100
+
101
+ **Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
102
+
103
+ ```bash
104
+ nextflow run nextflow/main.nf \
105
+ --input samplesheet.csv \
106
+ --variants variants.vcf \
107
+ --fasta reference.fa \
108
+ -profile slurm
109
+ ```
110
+
111
+ **Features:**
112
+ - ✅ Automatic parallelization across samples
113
+ - ✅ SLURM/HPC integration
114
+ - ✅ Container support (Docker/Singularity)
115
+ - ✅ Resume failed runs
116
+
117
+ **Learn more:**
118
+ - 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
119
+ - 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
120
+
121
+ ---
122
+
123
+ ## Which Should I Use?
124
+
125
+ | Scenario | Recommendation |
126
+ |----------|----------------|
127
+ | 1-10 samples, local machine | **CLI** |
128
+ | 10+ samples, HPC cluster | **Nextflow** |
129
+ | Quick ad-hoc analysis | **CLI** |
130
+ | Production pipeline | **Nextflow** |
131
+ | Need auto-parallelization | **Nextflow** |
132
+ | Full manual control | **CLI** |
133
+
134
+ ---
135
+
136
+ ## Quick Examples
137
+
138
+ ### CLI: Single Sample
139
+ ```bash
140
+ gbcms run \
141
+ --variants variants.vcf \
142
+ --bam tumor.bam \
143
+ --fasta hg19.fa \
144
+ --output-dir results/ \
145
+ --threads 4
146
+ ```
147
+
148
+ ### CLI: Multiple Samples (Sequential)
149
+ ```bash
150
+ gbcms run \
151
+ --variants variants.vcf \
152
+ --bam-list samples.txt \
153
+ --fasta hg19.fa \
154
+ --output-dir results/
155
+ ```
156
+
157
+ ### Nextflow: Many Samples (Parallel)
158
+ ```bash
159
+ # samplesheet.csv:
160
+ # sample,bam,bai
161
+ # tumor1,/path/to/tumor1.bam,
162
+ # tumor2,/path/to/tumor2.bam,
163
+
164
+ nextflow run nextflow/main.nf \
165
+ --input samplesheet.csv \
166
+ --variants variants.vcf \
167
+ --fasta hg19.fa \
168
+ --outdir results \
169
+ -profile slurm
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Documentation
175
+
176
+ 📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
177
+
178
+ **Quick Links:**
179
+ - [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
180
+ - [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
181
+ - [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
182
+ - [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
183
+ - [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
184
+ - [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
185
+
186
+ ---
187
+
188
+ ## Contributing
189
+
190
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
191
+
192
+ To contribute to documentation, see the [`gh-pages` branch](https://github.com/msk-access/py-gbcms/tree/gh-pages).
193
+
194
+ ---
195
+
196
+ ## Citation
197
+
198
+ If you use `py-gbcms` in your research, please cite:
199
+
200
+ ```
201
+ [Citation to be added]
202
+ ```
203
+
204
+ ---
205
+
206
+ ## License
207
+
208
+ AGPL-3.0 - see [LICENSE](LICENSE) for details.
209
+
210
+ ---
211
+
212
+ ## Support
213
+
214
+ - 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
215
+ - 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions
216
+
@@ -0,0 +1,180 @@
1
+ # py-gbcms
2
+
3
+ **Complete orientation-aware counting system for genomic variants**
4
+
5
+ [![Tests](https://github.com/msk-access/py-gbcms/workflows/Tests/badge.svg)](https://github.com/msk-access/py-gbcms/actions)
6
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
7
+
8
+ ## Features
9
+
10
+ - 🚀 **High Performance**: Rust-powered core engine with multi-threading
11
+ - 🧬 **Complete Variant Support**: SNP, MNP, insertion, deletion, and complex variants (DelIns, SNP+Indel)
12
+ - 📊 **Orientation-Aware**: Forward and reverse strand analysis with fragment counting
13
+ - 🔬 **Statistical Analysis**: Fisher's exact test for strand bias
14
+ - 📁 **Flexible I/O**: VCF and MAF input/output formats
15
+ - 🎯 **Quality Filters**: 7 configurable read filtering options
16
+
17
+ ## Installation
18
+
19
+ **Quick install:**
20
+ ```bash
21
+ pip install py-gbcms
22
+ ```
23
+
24
+ **From source (requires Rust):**
25
+ ```bash
26
+ git clone https://github.com/msk-access/py-gbcms.git
27
+ cd py-gbcms
28
+ pip install .
29
+ ```
30
+
31
+ **Docker:**
32
+ ```bash
33
+ docker pull ghcr.io/msk-access/py-gbcms:2.1.0
34
+ ```
35
+
36
+ 📖 **Full documentation:** https://msk-access.github.io/py-gbcms/
37
+
38
+ ---
39
+
40
+ ## Usage
41
+
42
+ `py-gbcms` can be used in two ways:
43
+
44
+ ### 🔧 Option 1: Standalone CLI (1-10 samples)
45
+
46
+ **Best for:** Quick analysis, local processing, direct control
47
+
48
+ ```bash
49
+ gbcms run \
50
+ --variants variants.vcf \
51
+ --bam sample1.bam \
52
+ --fasta reference.fa \
53
+ --output-dir results/
54
+ ```
55
+
56
+ **Output:** `results/sample1.vcf`
57
+
58
+ **Learn more:**
59
+ - 📘 [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
60
+ - 📖 [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
61
+
62
+ ---
63
+
64
+ ### 🔄 Option 2: Nextflow Workflow (10+ samples, HPC)
65
+
66
+ **Best for:** Many samples, HPC clusters (SLURM), reproducible pipelines
67
+
68
+ ```bash
69
+ nextflow run nextflow/main.nf \
70
+ --input samplesheet.csv \
71
+ --variants variants.vcf \
72
+ --fasta reference.fa \
73
+ -profile slurm
74
+ ```
75
+
76
+ **Features:**
77
+ - ✅ Automatic parallelization across samples
78
+ - ✅ SLURM/HPC integration
79
+ - ✅ Container support (Docker/Singularity)
80
+ - ✅ Resume failed runs
81
+
82
+ **Learn more:**
83
+ - 🔄 [Nextflow Workflow Guide](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
84
+ - 📋 [Usage Patterns Comparison](https://cmo-ci.gitbook.io/py-gbcms/workflows)
85
+
86
+ ---
87
+
88
+ ## Which Should I Use?
89
+
90
+ | Scenario | Recommendation |
91
+ |----------|----------------|
92
+ | 1-10 samples, local machine | **CLI** |
93
+ | 10+ samples, HPC cluster | **Nextflow** |
94
+ | Quick ad-hoc analysis | **CLI** |
95
+ | Production pipeline | **Nextflow** |
96
+ | Need auto-parallelization | **Nextflow** |
97
+ | Full manual control | **CLI** |
98
+
99
+ ---
100
+
101
+ ## Quick Examples
102
+
103
+ ### CLI: Single Sample
104
+ ```bash
105
+ gbcms run \
106
+ --variants variants.vcf \
107
+ --bam tumor.bam \
108
+ --fasta hg19.fa \
109
+ --output-dir results/ \
110
+ --threads 4
111
+ ```
112
+
113
+ ### CLI: Multiple Samples (Sequential)
114
+ ```bash
115
+ gbcms run \
116
+ --variants variants.vcf \
117
+ --bam-list samples.txt \
118
+ --fasta hg19.fa \
119
+ --output-dir results/
120
+ ```
121
+
122
+ ### Nextflow: Many Samples (Parallel)
123
+ ```bash
124
+ # samplesheet.csv:
125
+ # sample,bam,bai
126
+ # tumor1,/path/to/tumor1.bam,
127
+ # tumor2,/path/to/tumor2.bam,
128
+
129
+ nextflow run nextflow/main.nf \
130
+ --input samplesheet.csv \
131
+ --variants variants.vcf \
132
+ --fasta hg19.fa \
133
+ --outdir results \
134
+ -profile slurm
135
+ ```
136
+
137
+ ---
138
+
139
+ ## Documentation
140
+
141
+ 📚 **Full Documentation:** https://cmo-ci.gitbook.io/py-gbcms/
142
+
143
+ **Quick Links:**
144
+ - [Installation](https://cmo-ci.gitbook.io/py-gbcms/installation)
145
+ - [CLI Quick Start](https://cmo-ci.gitbook.io/py-gbcms/quick-start)
146
+ - [Nextflow Workflow](https://cmo-ci.gitbook.io/py-gbcms/nextflow)
147
+ - [CLI Reference](https://cmo-ci.gitbook.io/py-gbcms/cli_features)
148
+ - [Input & Output Formats](https://cmo-ci.gitbook.io/py-gbcms/input_output)
149
+ - [Architecture](https://cmo-ci.gitbook.io/py-gbcms/architecture)
150
+
151
+ ---
152
+
153
+ ## Contributing
154
+
155
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development guidelines.
156
+
157
+ To contribute to documentation, see the [`gh-pages` branch](https://github.com/msk-access/py-gbcms/tree/gh-pages).
158
+
159
+ ---
160
+
161
+ ## Citation
162
+
163
+ If you use `py-gbcms` in your research, please cite:
164
+
165
+ ```
166
+ [Citation to be added]
167
+ ```
168
+
169
+ ---
170
+
171
+ ## License
172
+
173
+ AGPL-3.0 - see [LICENSE](LICENSE) for details.
174
+
175
+ ---
176
+
177
+ ## Support
178
+
179
+ - 🐛 **Issues:** https://github.com/msk-access/py-gbcms/issues
180
+ - 💬 **Discussions:** https://github.com/msk-access/py-gbcms/discussions
@@ -1,13 +1,13 @@
1
1
  [project]
2
2
  name = "py-gbcms"
3
- version = "2.0.0"
3
+ version = "2.2.0"
4
4
  description = "Python implementation of GetBaseCountsMultiSample (gbcms) for calculating base counts in BAM files"
5
5
  authors = [
6
6
  {name = "MSK-ACCESS", email = "shahr2@mskcc.org"}
7
7
  ]
8
8
  readme = "README.md"
9
- requires-python = ">=3.11"
10
- license = {text = "AGPL-3.0"}
9
+ requires-python = ">=3.10"
10
+ license = {file = "LICENSE"}
11
11
  keywords = ["bioinformatics", "genomics", "bam", "vcf", "maf", "base-counts", "gbcms"]
12
12
  classifiers = [
13
13
  "Development Status :: 4 - Beta",
@@ -19,27 +19,16 @@ classifiers = [
19
19
  ]
20
20
 
21
21
  dependencies = [
22
- "pysam>=0.22.0",
23
- "numpy>=1.24.0",
22
+ "pysam>=0.21.0",
24
23
  "typer>=0.9.0",
25
24
  "rich>=13.0.0",
26
- "pandas>=2.0.0",
27
25
  "pydantic>=2.0.0",
28
- "pydantic-settings>=2.0.0",
29
- "numba>=0.58.0",
30
- "joblib>=1.3.0",
31
- "scipy>=1.11.0",
32
26
  ]
33
27
 
34
28
  [project.optional-dependencies]
35
- fast = [
36
- "cyvcf2>=0.30.0",
37
- ]
38
-
29
+ fast = []
39
30
 
40
- all = [
41
- "cyvcf2>=0.30.0",
42
- ]
31
+ all = []
43
32
 
44
33
  dev = [
45
34
  "pytest>=7.4.0",
@@ -48,25 +37,28 @@ dev = [
48
37
  "black>=23.0.0",
49
38
  "ruff>=0.1.0",
50
39
  "mypy>=1.5.0",
51
- "pre-commit>=3.3.0",
52
40
  "types-pyyaml>=6.0.0",
41
+ "mkdocs-material>=9.0.0",
53
42
  ]
54
43
 
55
44
  [project.scripts]
56
45
  gbcms = "gbcms.cli:app"
57
46
 
58
47
  [project.urls]
59
- Homepage = "https://github.com/msk-access/getbasecounts"
60
- Repository = "https://github.com/msk-access/getbasecounts"
61
- Documentation = "https://github.com/msk-access/getbasecounts#readme"
62
- "Bug Tracker" = "https://github.com/msk-access/getbasecounts/issues"
48
+ Homepage = "https://github.com/msk-access/py-gbcms"
49
+ Repository = "https://github.com/msk-access/py-gbcms"
50
+ Documentation = "https://github.com/msk-access/py-gbcms#readme"
51
+ "Bug Tracker" = "https://github.com/msk-access/py-gbcms/issues"
63
52
 
64
53
  [build-system]
65
- requires = ["hatchling"]
66
- build-backend = "hatchling.build"
54
+ requires = ["maturin>=1.0,<2.0"]
55
+ build-backend = "maturin"
56
+
57
+ [tool.maturin]
58
+ python-source = "src"
59
+ manifest-path = "rust/Cargo.toml"
60
+ module-name = "gbcms._rs"
67
61
 
68
- [tool.hatch.build.targets.wheel]
69
- packages = ["src/gbcms"]
70
62
 
71
63
  [tool.pytest.ini_options]
72
64
  testpaths = ["tests"]
@@ -89,7 +81,7 @@ include = '\.pyi?$'
89
81
 
90
82
  [tool.ruff]
91
83
  line-length = 100
92
- target-version = "py311"
84
+ target-version = "py310"
93
85
 
94
86
  [tool.ruff.lint]
95
87
  select = [
@@ -127,9 +119,7 @@ disable_error_code = ["call-arg"]
127
119
  [[tool.mypy.overrides]]
128
120
  module = [
129
121
  "pysam.*",
130
- "numba.*",
131
- "joblib.*",
132
- "cyvcf2.*",
122
+ "gbcms._rs",
133
123
  ]
134
124
  ignore_missing_imports = true
135
125
 
@@ -152,7 +142,6 @@ exclude_lines = [
152
142
  dev = [
153
143
  "black>=25.9.0",
154
144
  "mypy>=1.18.2",
155
- "pre-commit>=4.3.0",
156
145
  "pytest>=8.4.2",
157
146
  "pytest-cov>=4.1.0",
158
147
  "pytest-benchmark>=5.1.0",