deeptaxa-rrna 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. deeptaxa_rrna-1.0.0/.gitattributes +1 -0
  2. deeptaxa_rrna-1.0.0/.gitignore +182 -0
  3. deeptaxa_rrna-1.0.0/LICENSE +21 -0
  4. deeptaxa_rrna-1.0.0/PKG-INFO +365 -0
  5. deeptaxa_rrna-1.0.0/README.md +324 -0
  6. deeptaxa_rrna-1.0.0/conda-recipe/README.md +65 -0
  7. deeptaxa_rrna-1.0.0/conda-recipe/meta.yaml +66 -0
  8. deeptaxa_rrna-1.0.0/deeptaxa/__init__.py +35 -0
  9. deeptaxa_rrna-1.0.0/deeptaxa/cli.py +491 -0
  10. deeptaxa_rrna-1.0.0/deeptaxa/config.py +64 -0
  11. deeptaxa_rrna-1.0.0/deeptaxa/dataset.py +320 -0
  12. deeptaxa_rrna-1.0.0/deeptaxa/describe.py +212 -0
  13. deeptaxa_rrna-1.0.0/deeptaxa/models/__init__.py +21 -0
  14. deeptaxa_rrna-1.0.0/deeptaxa/models/bert.py +109 -0
  15. deeptaxa_rrna-1.0.0/deeptaxa/models/cnn.py +170 -0
  16. deeptaxa_rrna-1.0.0/deeptaxa/models/hybrid.py +166 -0
  17. deeptaxa_rrna-1.0.0/deeptaxa/models/losses.py +69 -0
  18. deeptaxa_rrna-1.0.0/deeptaxa/predict.py +669 -0
  19. deeptaxa_rrna-1.0.0/deeptaxa/train.py +1133 -0
  20. deeptaxa_rrna-1.0.0/deeptaxa/tune.py +247 -0
  21. deeptaxa_rrna-1.0.0/deeptaxa/utils.py +85 -0
  22. deeptaxa_rrna-1.0.0/deeptaxa_rrna.egg-info/PKG-INFO +365 -0
  23. deeptaxa_rrna-1.0.0/deeptaxa_rrna.egg-info/SOURCES.txt +48 -0
  24. deeptaxa_rrna-1.0.0/deeptaxa_rrna.egg-info/dependency_links.txt +1 -0
  25. deeptaxa_rrna-1.0.0/deeptaxa_rrna.egg-info/entry_points.txt +2 -0
  26. deeptaxa_rrna-1.0.0/deeptaxa_rrna.egg-info/requires.txt +9 -0
  27. deeptaxa_rrna-1.0.0/deeptaxa_rrna.egg-info/top_level.txt +1 -0
  28. deeptaxa_rrna-1.0.0/pyproject.toml +41 -0
  29. deeptaxa_rrna-1.0.0/scripts/calibration_diagnosis.sh +60 -0
  30. deeptaxa_rrna-1.0.0/scripts/calibration_sweep.sh +62 -0
  31. deeptaxa_rrna-1.0.0/scripts/deeptaxa_workflow.sh +194 -0
  32. deeptaxa_rrna-1.0.0/scripts/run_ablation.sh +120 -0
  33. deeptaxa_rrna-1.0.0/scripts/run_amplicon_eval.sh +96 -0
  34. deeptaxa_rrna-1.0.0/scripts/run_experiment.sh +641 -0
  35. deeptaxa_rrna-1.0.0/scripts/run_similarity_eval.sh +113 -0
  36. deeptaxa_rrna-1.0.0/scripts/sequence_similarity.py +218 -0
  37. deeptaxa_rrna-1.0.0/scripts/similarity_curve.py +313 -0
  38. deeptaxa_rrna-1.0.0/scripts/simulate_amplicons.py +267 -0
  39. deeptaxa_rrna-1.0.0/setup.cfg +4 -0
  40. deeptaxa_rrna-1.0.0/tutorials/.gitignore +6 -0
  41. deeptaxa_rrna-1.0.0/tutorials/Makefile +19 -0
  42. deeptaxa_rrna-1.0.0/tutorials/_quarto.yml +47 -0
  43. deeptaxa_rrna-1.0.0/tutorials/analysis.qmd +930 -0
  44. deeptaxa_rrna-1.0.0/tutorials/architecture.qmd +434 -0
  45. deeptaxa_rrna-1.0.0/tutorials/custom.css +17 -0
  46. deeptaxa_rrna-1.0.0/tutorials/index.qmd +37 -0
  47. deeptaxa_rrna-1.0.0/tutorials/prediction.qmd +376 -0
  48. deeptaxa_rrna-1.0.0/tutorials/references.bib +66 -0
  49. deeptaxa_rrna-1.0.0/tutorials/render_tutorials.sh +98 -0
  50. deeptaxa_rrna-1.0.0/tutorials/training.qmd +411 -0
@@ -0,0 +1 @@
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,182 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+ # DeepTaxa
177
+ *.pt
178
+ *.pth
179
+ *.pkl
180
+ .cache/huggingface/
181
+ quarto-*.deb
182
+ quarto-*.deb
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Systems Genomics Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,365 @@
1
+ Metadata-Version: 2.4
2
+ Name: deeptaxa-rrna
3
+ Version: 1.0.0
4
+ Summary: A deep learning framework for hierarchical taxonomy classification of 16S rRNA gene sequences.
5
+ Author-email: Khlood Ramadan <khlood.ramadan@aucegypt.edu>, Lobna Ghonaim <lobnaghonaim@aucegypt.edu>, Rana Salah <rana_salah@aucegypt.edu>, Ahmed Moustafa <amoustafa@aucegypt.edu>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Systems Genomics Lab
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Requires-Python: >=3.10
29
+ Description-Content-Type: text/markdown
30
+ License-File: LICENSE
31
+ Requires-Dist: torch
32
+ Requires-Dist: numpy
33
+ Requires-Dist: transformers
34
+ Requires-Dist: pandas
35
+ Requires-Dist: tqdm
36
+ Requires-Dist: scikit-learn
37
+ Requires-Dist: biopython
38
+ Requires-Dist: h5py
39
+ Requires-Dist: optuna
40
+ Dynamic: license-file
41
+
42
+ # DeepTaxa
43
+
44
+ [![License](https://img.shields.io/github/license/systems-genomics-lab/deeptaxa)](LICENSE)
45
+ [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-blue)](https://huggingface.co/systems-genomics-lab/deeptaxa)
46
+ [![Tutorials](https://img.shields.io/badge/Tutorials-GitHub%20Pages-green)](https://systems-genomics-lab.github.io/deeptaxa/)
47
+ [![Last Commit](https://img.shields.io/github/last-commit/systems-genomics-lab/deeptaxa)](https://github.com/systems-genomics-lab/deeptaxa/commits/main)
48
+ [![Issues](https://img.shields.io/github/issues/systems-genomics-lab/deeptaxa)](https://github.com/systems-genomics-lab/deeptaxa/issues)
49
+ [![GitHub Stars](https://img.shields.io/github/stars/systems-genomics-lab/deeptaxa?style=social)](https://github.com/systems-genomics-lab/deeptaxa/stargazers)
50
+
51
+ **DeepTaxa** is a deep learning framework for hierarchical taxonomic classification of 16S rRNA gene sequences. It classifies sequences into all seven taxonomic ranks (Domain through Species) in a single forward pass, achieving 92.96% species-level accuracy (3-seed mean) on the Greengenes2 2024.09 test set.
52
+
53
+ ---
54
+
55
+ ## Table of Contents
56
+
57
+ 1. [Performance](#performance)
58
+ 2. [Installation](#installation)
59
+ 3. [Quick Start](#quick-start)
60
+ 4. [Data and Pre-Trained Models](#data-and-pre-trained-models)
61
+ 5. [Training](#training)
62
+ 6. [Experimentation](#experimentation)
63
+ 7. [Scripts](#scripts)
64
+ 8. [Tutorials](#tutorials)
65
+ 9. [License](#license)
66
+ 10. [Citation](#citation)
67
+ 11. [Contact](#contact)
68
+ 12. [Acknowledgements](#acknowledgements)
69
+
70
+ ---
71
+
72
+ ## Performance
73
+
74
+ The published HybridCNNBERT checkpoint achieves the following on 69,335 held-out test sequences from Greengenes2 2024.09 (3-seed mean across seeds 42, 123, 456):
75
+
76
+ | Rank | Accuracy | F1 | ECE |
77
+ |------|----------|-----|-----|
78
+ | Domain | 99.98% | 99.98% | 0.0001 |
79
+ | Phylum | 99.69% | 99.68% | 0.0023 |
80
+ | Class | 99.63% | 99.59% | 0.0024 |
81
+ | Order | 99.07% | 98.97% | 0.0056 |
82
+ | Family | 98.61% | 98.41% | 0.0075 |
83
+ | Genus | 96.90% | 96.48% | 0.0144 |
84
+ | Species | 92.96% | 92.12% | 0.0242 |
85
+
86
+ The cross-seed standard deviation of F1 is at most 0.0008 at every rank (at the species rank: 0.0008 F1 and 0.07 percentage points of accuracy), indicating high reproducibility.
87
+
88
+ ### Architecture
89
+
90
+ | Component | Configuration |
91
+ |-----------|--------------|
92
+ | CNN | embed_dim=896, 256 filters, kernels [3, 5, 7], 1 conv layer |
93
+ | BERT | 4 layers, 7 heads, hidden=896, FFN=3584, GELU, random init |
94
+ | Fusion | Learnable alpha/beta weights + BERT residual connection |
95
+ | Training | Cross-entropy loss, LR=5e-4, batch=64, dropout=0.20, 10 epochs |
96
+
97
+ Three architectures are available:
98
+
99
+ - **HybridCNNBERTClassifier** (default): Fuses CNN local motif features with BERT global context. Used for the published checkpoints.
100
+ - **CNNClassifier**: Multi-kernel convolutional network only. Faster training, slightly lower species accuracy.
101
+ - **BERTClassifier**: Transformer encoder only. On its own, a from-scratch transformer underperforms substantially at the species rank; provided mainly for ablation.
102
+
103
+ ### Pre-Trained Checkpoints
104
+
105
+ Two checkpoints are hosted on [Hugging Face](https://huggingface.co/systems-genomics-lab/deeptaxa):
106
+
107
+ | Checkpoint | Training data | Species accuracy | Parameters |
108
+ |-----------|--------------|-----------------|------------|
109
+ | `deeptaxa-full-length-v1.pt` | Full-length 16S (277,336 sequences, ~1,500 bp) | 92.96% (3-seed mean) | 76.4 M |
110
+ | `deeptaxa-v3v4-v1.pt` | In-silico V3-V4 amplicons (~420 bp, 273,003 amplicons) | 87.55% (seed 42) | 75.8 M |
111
+
112
+ Both checkpoints share the same compact architecture (the small parameter difference reflects smaller per-rank classifier heads on the V3-V4 model, which has a smaller species vocabulary: 8,347 vs 16,909). A `config.json` with full model metadata is also available.
113
+
114
+ ---
115
+
116
+ ## Installation
117
+
118
+ DeepTaxa requires Python 3.10 or later. We recommend using a Conda environment:
119
+
120
+ ```bash
121
+ git clone https://github.com/systems-genomics-lab/deeptaxa.git
122
+ cd deeptaxa
123
+ conda create --name deeptaxa_env python=3.10 -y
124
+ conda activate deeptaxa_env
125
+ pip install .
126
+ deeptaxa --version
127
+ ```
128
+
129
+ Dependencies (torch, transformers, pandas, numpy, scikit-learn, h5py, etc.) are specified in [`pyproject.toml`](pyproject.toml) and installed automatically.
130
+
131
+ > **Note**: For GPU support, install a CUDA-compatible PyTorch build before running `pip install .`. See the [PyTorch installation guide](https://pytorch.org/get-started/locally/).
132
+
133
+ ---
134
+
135
+ ## Quick Start
136
+
137
+ **Predict** with the pre-trained model (no training data needed):
138
+
139
+ ```bash
140
+ # Download the checkpoint
141
+ mkdir -p ../deeptaxa-data/models
142
+ wget -P ../deeptaxa-data/models \
143
+ https://huggingface.co/systems-genomics-lab/deeptaxa/resolve/main/deeptaxa-full-length-v1.pt
144
+
145
+ # Classify sequences
146
+ deeptaxa predict \
147
+ --fasta-file your_sequences.fna \
148
+ --checkpoint ../deeptaxa-data/models/deeptaxa-full-length-v1.pt \
149
+ --output-dir ../deeptaxa-outputs/predictions
150
+ ```
151
+
152
+ **Evaluate** against known labels (adds per-rank accuracy, F1, ECE to the output):
153
+
154
+ ```bash
155
+ deeptaxa predict \
156
+ --fasta-file ../deeptaxa-data/greengenes/gg_2024_09_testing.fna.gz \
157
+ --taxonomy-file ../deeptaxa-data/greengenes/gg_2024_09_testing.tsv.gz \
158
+ --checkpoint ../deeptaxa-data/models/deeptaxa-full-length-v1.pt \
159
+ --output-dir ../deeptaxa-outputs/evaluation
160
+ ```
161
+
162
+ **Inspect** a checkpoint:
163
+
164
+ ```bash
165
+ deeptaxa describe \
166
+ --checkpoint ../deeptaxa-data/models/deeptaxa-full-length-v1.pt
167
+ ```
168
+
169
+ > **Tip**: Run `deeptaxa train --help` or `deeptaxa predict --help` for a full list of options.
170
+
171
+ ---
172
+
173
+ ## Data and Pre-Trained Models
174
+
175
+ Datasets and checkpoints are hosted on [Hugging Face](https://huggingface.co/systems-genomics-lab/deeptaxa). Store them in a sibling directory outside the codebase:
176
+
177
+ ```
178
+ working_directory/
179
+ ├── deeptaxa/ # This repository
180
+ ├── deeptaxa-data/ # Datasets and checkpoints
181
+ │ ├── greengenes/
182
+ │ │ ├── gg_2024_09_training.fna.gz (277,336 sequences, ~96 MB)
183
+ │ │ ├── gg_2024_09_training.tsv.gz (taxonomy labels, ~2.6 MB)
184
+ │ │ ├── gg_2024_09_testing.fna.gz (69,335 sequences, ~24 MB)
185
+ │ │ └── gg_2024_09_testing.tsv.gz (taxonomy labels, ~0.8 MB)
186
+ │ └── models/
187
+ │ ├── deeptaxa-full-length-v1.pt
188
+ │ └── deeptaxa-v3v4-v1.pt
189
+ └── deeptaxa-outputs/ # Training and prediction outputs
190
+ ```
191
+
192
+ DeepTaxa uses the [Greengenes2](https://greengenes2.ucsd.edu/) database (2024.09 release), reformatted and hosted on [Hugging Face](https://huggingface.co/datasets/systems-genomics-lab/greengenes).
193
+
194
+ ### Download
195
+
196
+ ```bash
197
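+ # Dataset (run from working_directory/, the parent of the cloned repository, so the layout above is reproduced)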
198
+ mkdir -p deeptaxa-data/greengenes && cd deeptaxa-data/greengenes
199
+ for f in gg_2024_09_training.fna.gz gg_2024_09_training.tsv.gz \
200
+ gg_2024_09_testing.fna.gz gg_2024_09_testing.tsv.gz; do
201
+ wget https://huggingface.co/datasets/systems-genomics-lab/greengenes/resolve/main/$f
202
+ done
203
+
204
+ # Checkpoints
205
+ mkdir -p ../models && cd ../models
206
+ wget https://huggingface.co/systems-genomics-lab/deeptaxa/resolve/main/deeptaxa-full-length-v1.pt
207
+ wget https://huggingface.co/systems-genomics-lab/deeptaxa/resolve/main/deeptaxa-v3v4-v1.pt
208
+ wget https://huggingface.co/systems-genomics-lab/deeptaxa/resolve/main/config.json
209
+ ```
210
+
211
+ > **Note**: Checkpoint files use PyTorch's `pickle`-based serialization, and loading a pickle file can execute arbitrary code. Download them only from the official Hugging Face repository.
212
+
213
+ ---
214
+
215
+ ## Training
216
+
217
+ All architecture hyperparameters default to the published (compact) configuration, so a minimal training command reproduces the published checkpoint:
218
+
219
+ ```bash
220
+ deeptaxa train \
221
+ --fasta-file ../deeptaxa-data/greengenes/gg_2024_09_training.fna.gz \
222
+ --taxonomy-file ../deeptaxa-data/greengenes/gg_2024_09_training.tsv.gz \
223
+ --model-type hybridcnnbert \
224
+ --output-dir ../deeptaxa-outputs/
225
+ ```
226
+
227
+ Training for 10 epochs takes approximately 1 h 20 min on an NVIDIA RTX 4090 (or 2 h 35 min on an NVIDIA A40).
228
+
229
+ ### Output
230
+
231
+ Each training run produces:
232
+
233
+ - `checkpoints/deeptaxa_<uuid>_epoch<N>.pt`: Model weights, optimizer state, scheduler state, and label encoders for each epoch.
234
+ - `metrics/deeptaxa_<uuid>_epoch<N>.json`: Per-epoch validation loss, accuracy, F1, precision, and recall at each rank.
235
+ - `deeptaxa_uuid.txt`: The unique run identifier.
236
+
237
+ ### Early Stopping
238
+
239
+ To stop training when validation loss plateaus:
240
+
241
+ ```bash
242
+ deeptaxa train \
243
+ --fasta-file ../deeptaxa-data/greengenes/gg_2024_09_training.fna.gz \
244
+ --taxonomy-file ../deeptaxa-data/greengenes/gg_2024_09_training.tsv.gz \
245
+ --model-type hybridcnnbert \
246
+ --epochs 20 \
247
+ --early-stopping-patience 3 \
248
+ --output-dir ../deeptaxa-outputs/
249
+ ```
250
+
251
+ Setting `--early-stopping-patience 0` (the default) disables early stopping.
252
+
253
+ ---
254
+
255
+ ## Experimentation
256
+
257
+ The default configuration uses DNABERT-2 tokenization, cross-entropy loss, and uniform rank weighting. Each choice can be varied independently for ablation studies.
258
+
259
+ ### Encoding comparison
260
+
261
+ ```bash
262
+ # Default: DNABERT-2 BPE tokenization
263
+ deeptaxa train --model-type cnn --encoding dnabert ...
264
+
265
+ # Ablation: one-hot nucleotide encoding (4-channel, no pretrained tokenizer)
266
+ deeptaxa train --model-type cnn --encoding onehot ...
267
+ ```
268
+
269
+ ### Loss function comparison
270
+
271
+ ```bash
272
+ # Default: cross-entropy
273
+ deeptaxa train --model-type hybridcnnbert --loss-type cross_entropy ...
274
+
275
+ # Ablation: focal loss (gamma=2.0)
276
+ deeptaxa train --model-type hybridcnnbert --loss-type focal --focal-gamma 2.0 ...
277
+ ```
278
+
279
+ ### Architecture comparison
280
+
281
+ Train the CNN-only, BERT-only, or hybrid model on the same data and with the same hyperparameters using `--model-type cnn`, `--model-type bert`, or `--model-type hybridcnnbert`, respectively.
282
+
283
+ ### Calibration
284
+
285
+ When `--taxonomy-file` is provided at prediction time, DeepTaxa computes Expected Calibration Error (ECE) alongside accuracy, F1, precision, recall, and AUC. ECE measures the gap between predicted confidence and observed accuracy across 10 equal-width bins. All metrics are saved to `metrics.json`.
286
+
287
+ ---
288
+
289
+ ## Scripts
290
+
291
+ The `scripts/` directory contains reusable tools for common workflows:
292
+
293
+ | Script | Purpose |
294
+ |--------|---------|
295
+ | `deeptaxa_workflow.sh` | End-to-end workflow: train, resume, describe, predict |
296
+ | `run_experiment.sh` | Central experiment runner with logging and timing |
297
+ | `run_ablation.sh` | Ablation study: architecture, encoding, and loss variants |
298
+ | `run_amplicon_eval.sh` | Simulated amplicon evaluation (V3-V4, V4) |
299
+ | `run_similarity_eval.sh` | Similarity-stratified evaluation using vsearch |
300
+ | `calibration_diagnosis.sh` | A/B comparison of temperature configurations |
301
+ | `calibration_sweep.sh` | Multi-configuration temperature sweep |
302
+ | `simulate_amplicons.py` | Extract amplicon regions via in-silico PCR |
303
+ | `sequence_similarity.py` | Compute train-test nearest-neighbor identity |
304
+
305
+ ---
306
+
307
+ ## Tutorials
308
+
309
+ Interactive tutorials with executable code are published at [systems-genomics-lab.github.io/deeptaxa](https://systems-genomics-lab.github.io/deeptaxa/):
310
+
311
+ - [Prediction](https://systems-genomics-lab.github.io/deeptaxa/prediction.html): Classify sequences with the pre-trained model
312
+ - [Training](https://systems-genomics-lab.github.io/deeptaxa/training.html): Train from scratch on Greengenes2
313
+ - [Analysis](https://systems-genomics-lab.github.io/deeptaxa/analysis.html): Evaluate performance, calibration, and error patterns
314
+ - [Architecture](https://systems-genomics-lab.github.io/deeptaxa/architecture.html): Model internals and extensibility
315
+
316
+ ---
317
+
318
+ ## License
319
+
320
+ - **Code and models**: [MIT License](LICENSE)
321
+ - **Greengenes2 dataset**: [Modified BSD License](https://huggingface.co/datasets/systems-genomics-lab/greengenes)
322
+
323
+ ---
324
+
325
+ ## Citation
326
+
327
+ If DeepTaxa contributes to your research, please cite our paper in *Bioinformatics Advances*: [https://doi.org/10.1093/bioadv/vbag166](https://doi.org/10.1093/bioadv/vbag166)
328
+
329
+ ```bibtex
330
+ @article{salah2026deeptaxa,
331
+ title={{DeepTaxa}: A Hybrid {CNN}-{BERT} Framework for {16S} {rRNA} Taxonomic Classification},
332
+ author={Salah, Rana and AbdElaal, Khlood R. and Ghonaim, Lobna and Awe, Olaitan I. and Moustafa, Ahmed},
333
+ journal={Bioinformatics Advances},
334
+ year={2026},
335
+ doi={10.1093/bioadv/vbag166},
336
+ publisher={Oxford University Press}
337
+ }
338
+ ```
339
+
340
+ For the Greengenes2 dataset:
341
+
342
+ ```bibtex
343
+ @article{mcdonald2024greengenes,
344
+ title={Greengenes2 unifies microbial data in a single reference tree},
345
+ author={McDonald, Daniel and Jiang, Yueyu and Balaban, Metin and others},
346
+ journal={Nature Biotechnology},
347
+ volume={42},
348
+ pages={715--718},
349
+ year={2024},
350
+ doi={10.1038/s41587-023-01845-1}
351
+ }
352
+ ```
353
+
354
+ ---
355
+
356
+ ## Contact
357
+
358
+ To report bugs, suggest features, or contribute code, open an issue on [GitHub](https://github.com/systems-genomics-lab/deeptaxa/issues).
359
+
360
+ ---
361
+
362
+ ## Acknowledgements
363
+
364
+ - **[Ahmed A. El Hosseiny](https://github.com/ahmedelhosseiny)** and the High-Performance Computing Team of the [School of Sciences and Engineering](https://sse.aucegypt.edu/) at the [American University in Cairo](https://www.aucegypt.edu/) for GPU access that enabled this work.
365
+ - **[Hugging Face](https://huggingface.co/)** for hosting datasets and models.