genome-entropy 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genome_entropy-0.1.1/LICENSE +21 -0
- genome_entropy-0.1.1/PKG-INFO +469 -0
- genome_entropy-0.1.1/README.md +427 -0
- genome_entropy-0.1.1/pyproject.toml +89 -0
- genome_entropy-0.1.1/setup.cfg +4 -0
- genome_entropy-0.1.1/src/genome_entropy/__init__.py +3 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/download.py +64 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/encode3di.py +131 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/entropy.py +99 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/estimate_tokens.py +112 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/orf.py +63 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/run.py +164 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/commands/translate.py +66 -0
- genome_entropy-0.1.1/src/genome_entropy/cli/main.py +97 -0
- genome_entropy-0.1.1/src/genome_entropy/config.py +52 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/__init__.py +19 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/encoder.py +367 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/encoding.py +200 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/gpu_utils.py +151 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/multi_gpu.py +335 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/prostt5.py +15 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/token_estimator.py +214 -0
- genome_entropy-0.1.1/src/genome_entropy/encode3di/types.py +33 -0
- genome_entropy-0.1.1/src/genome_entropy/entropy/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/entropy/shannon.py +122 -0
- genome_entropy-0.1.1/src/genome_entropy/errors.py +55 -0
- genome_entropy-0.1.1/src/genome_entropy/io/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/io/fasta.py +136 -0
- genome_entropy-0.1.1/src/genome_entropy/io/genbank.py +195 -0
- genome_entropy-0.1.1/src/genome_entropy/io/jsonio.py +81 -0
- genome_entropy-0.1.1/src/genome_entropy/logging_config.py +148 -0
- genome_entropy-0.1.1/src/genome_entropy/orf/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/orf/finder.py +228 -0
- genome_entropy-0.1.1/src/genome_entropy/orf/types.py +50 -0
- genome_entropy-0.1.1/src/genome_entropy/pipeline/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/pipeline/runner.py +292 -0
- genome_entropy-0.1.1/src/genome_entropy/translate/__init__.py +1 -0
- genome_entropy-0.1.1/src/genome_entropy/translate/translator.py +110 -0
- genome_entropy-0.1.1/src/genome_entropy.egg-info/PKG-INFO +469 -0
- genome_entropy-0.1.1/src/genome_entropy.egg-info/SOURCES.txt +57 -0
- genome_entropy-0.1.1/src/genome_entropy.egg-info/dependency_links.txt +1 -0
- genome_entropy-0.1.1/src/genome_entropy.egg-info/entry_points.txt +2 -0
- genome_entropy-0.1.1/src/genome_entropy.egg-info/requires.txt +23 -0
- genome_entropy-0.1.1/src/genome_entropy.egg-info/top_level.txt +1 -0
- genome_entropy-0.1.1/tests/test_basic.py +14 -0
- genome_entropy-0.1.1/tests/test_cli_smoke.py +95 -0
- genome_entropy-0.1.1/tests/test_encoder_methods.py +122 -0
- genome_entropy-0.1.1/tests/test_entropy.py +124 -0
- genome_entropy-0.1.1/tests/test_genbank.py +252 -0
- genome_entropy-0.1.1/tests/test_gpu_discovery.py +217 -0
- genome_entropy-0.1.1/tests/test_logging_config.py +185 -0
- genome_entropy-0.1.1/tests/test_multi_gpu_encoding.py +298 -0
- genome_entropy-0.1.1/tests/test_orf_finder.py +205 -0
- genome_entropy-0.1.1/tests/test_prostt5_integration.py +74 -0
- genome_entropy-0.1.1/tests/test_token_budget_batches.py +282 -0
- genome_entropy-0.1.1/tests/test_token_estimator.py +101 -0
- genome_entropy-0.1.1/tests/test_translation.py +163 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Rob Edwards
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genome_entropy
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Quantify information content across multiple biological representations derived from genomic sequences
|
|
5
|
+
Author-email: Rob Edwards <raedwards@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/linsalrob/genome_entropy
|
|
8
|
+
Project-URL: Repository, https://github.com/linsalrob/genome_entropy
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: torch>=2.0.0
|
|
21
|
+
Requires-Dist: transformers>=4.30.0
|
|
22
|
+
Requires-Dist: pygenetic_code>=0.20.0
|
|
23
|
+
Requires-Dist: typer>=0.9.0
|
|
24
|
+
Requires-Dist: tqdm>=4.65.0
|
|
25
|
+
Requires-Dist: protobuf>=6.33.1
|
|
26
|
+
Requires-Dist: sentencepiece>=0.2.1
|
|
27
|
+
Requires-Dist: biopython>=1.80
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
33
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
36
|
+
Provides-Extra: docs
|
|
37
|
+
Requires-Dist: sphinx>=7.0.0; extra == "docs"
|
|
38
|
+
Requires-Dist: sphinx-rtd-theme>=2.0.0; extra == "docs"
|
|
39
|
+
Requires-Dist: myst-parser>=2.0.0; extra == "docs"
|
|
40
|
+
Requires-Dist: linkify-it-py>=2.0.0; extra == "docs"
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
|
|
43
|
+
# genome_entropy
|
|
44
|
+
|
|
45
|
+
[Edwards Lab](https://edwards.flinders.edu.au/)
|
|
46
|
+
|
|
47
|
+
[Build status](https://github.com/linsalrob/genome_entropy/actions)
|
|
48
|
+
[Documentation](https://linsalrob.github.io/genome_entropy/)
|
|
49
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
50
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
51
|
+
|
|
52
|
+
Quantify information content across multiple biological representations derived from genomic sequences.
|
|
53
|
+
|
|
54
|
+
**genome_entropy** is a complete bioinformatics pipeline that converts DNA sequences → ORFs → proteins → 3Di structural tokens, computing Shannon entropy at each representation level.
|
|
55
|
+
|
|
56
|
+
## Why genome_entropy?
|
|
57
|
+
|
|
58
|
+
We refer to this framework as **genome-entropy** to emphasise its unifying focus on quantifying information content across multiple biological representations derived from the same genomic sequence. Rather than restricting analysis to a single abstraction, such as nucleotide composition or predicted coding regions, genome-entropy integrates DNA sequences, open reading frames, translated proteins, and structure-derived encodings (3Di) within a common information-theoretic framework. The name reflects both the biological scope of the approach—operating at the level of whole genomes and metagenomes—and the central analytical principle, entropy, which provides a consistent and comparable measure of complexity, organisation, and constraint across representations. This design allows direct comparison of informational signatures across molecular layers while remaining extensible to additional encodings as methods and data evolve.
|
|
59
|
+
|
|
60
|
+
## Documentation
|
|
61
|
+
|
|
62
|
+
📚 **[Read the full documentation on GitHub Pages](https://linsalrob.github.io/genome_entropy/)**
|
|
63
|
+
|
|
64
|
+
📚 **[Read the full documentation on Read The Docs](https://genome-entropy.readthedocs.io/en/latest/)**
|
|
65
|
+
|
|
66
|
+
The documentation includes:
|
|
67
|
+
- Installation guide
|
|
68
|
+
- Quick start tutorial
|
|
69
|
+
- Complete CLI reference
|
|
70
|
+
- Python API documentation
|
|
71
|
+
- User guide with detailed explanations
|
|
72
|
+
- Developer guide for contributors
|
|
73
|
+
|
|
74
|
+
## Features
|
|
75
|
+
|
|
76
|
+
- 🧬 **ORF Finding**: Extract Open Reading Frames from DNA sequences using customizable genetic codes
|
|
77
|
+
- 🔄 **Translation**: Convert ORFs to protein sequences with support for all NCBI genetic code tables
|
|
78
|
+
- 🏗️ **3Di Encoding**: Predict structural alphabet tokens directly from sequences using ProstT5
|
|
79
|
+
- 📊 **Entropy Analysis**: Calculate Shannon entropy at DNA, ORF, protein, and 3Di levels
|
|
80
|
+
- ⚡ **GPU Acceleration**: Auto-detect and use CUDA, MPS (Apple Silicon), or CPU
|
|
81
|
+
- 🚀 **Multi-GPU Support**: Parallelize 3Di encoding across multiple GPUs for faster processing
|
|
82
|
+
- 🔧 **Modular CLI**: Run complete pipeline or individual steps
|
|
83
|
+
- 📝 **Comprehensive Logging**: Configurable log levels and output to file or STDOUT
|
|
84
|
+
|
|
85
|
+
## Quick Start
|
|
86
|
+
|
|
87
|
+
### Installation
|
|
88
|
+
|
|
89
|
+
#### Recommended
|
|
90
|
+
|
|
91
|
+
Install with pip:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install genome-entropy
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
#### For developers
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
# Clone repository
|
|
101
|
+
git clone https://github.com/linsalrob/genome_entropy.git
|
|
102
|
+
cd genome_entropy
|
|
103
|
+
|
|
104
|
+
# Create virtual environment
|
|
105
|
+
python3 -m venv venv
|
|
106
|
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
|
107
|
+
|
|
108
|
+
# Install development dependencies (optional)
|
|
109
|
+
pip install -e ".[dev]"
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Basic Usage
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
# Run complete pipeline
|
|
116
|
+
genome_entropy run --input examples/example_small.fasta --output results.json
|
|
117
|
+
|
|
118
|
+
# Or run individual steps
|
|
119
|
+
genome_entropy orf --input input.fasta --output orfs.json
|
|
120
|
+
genome_entropy translate --input orfs.json --output proteins.json
|
|
121
|
+
genome_entropy encode3di --input proteins.json --output 3di.json
|
|
122
|
+
genome_entropy entropy --input 3di.json --output entropy.json
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Multi-GPU Usage
|
|
126
|
+
|
|
127
|
+
Speed up 3Di encoding by distributing batches across multiple GPUs:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
# Auto-discover and use all available GPUs
|
|
131
|
+
genome_entropy run --input input.fasta --output results.json --multi-gpu
|
|
132
|
+
|
|
133
|
+
# Use specific GPUs
|
|
134
|
+
genome_entropy run --input input.fasta --output results.json --multi-gpu --gpu-ids 0,1,2
|
|
135
|
+
|
|
136
|
+
# Works with SLURM job schedulers (GPUs auto-discovered from SLURM_JOB_GPUS)
|
|
137
|
+
srun --gres=gpu:4 genome_entropy run --input input.fasta --output results.json --multi-gpu
|
|
138
|
+
|
|
139
|
+
# Multi-GPU encoding also works for the encode3di command
|
|
140
|
+
genome_entropy encode3di --input proteins.json --output 3di.json --multi-gpu
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**GPU Discovery Priority:**
|
|
144
|
+
1. `SLURM_JOB_GPUS` environment variable (SLURM job allocations)
|
|
145
|
+
2. `SLURM_GPUS` environment variable
|
|
146
|
+
3. `CUDA_VISIBLE_DEVICES` environment variable
|
|
147
|
+
4. `torch.cuda.device_count()` (all available GPUs)
|
|
148
|
+
|
|
149
|
+
See `examples/multi_gpu_example.py` for more usage examples.
|
|
150
|
+
|
|
151
|
+
## Requirements
|
|
152
|
+
|
|
153
|
+
### Python Dependencies
|
|
154
|
+
|
|
155
|
+
- Python 3.10 or higher
|
|
156
|
+
- PyTorch >= 2.0.0 (GPU support optional)
|
|
157
|
+
- Transformers >= 4.30.0 (HuggingFace)
|
|
158
|
+
- pygenetic-code >= 0.20.0
|
|
159
|
+
- typer >= 0.9.0
|
|
160
|
+
|
|
161
|
+
### External Binary: get_orfs
|
|
162
|
+
|
|
163
|
+
The ORF finder requires the `get_orfs` binary from https://github.com/linsalrob/get_orfs
|
|
164
|
+
|
|
165
|
+
**Installation:**
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# Clone and build get_orfs
|
|
169
|
+
git clone https://github.com/linsalrob/get_orfs.git /tmp/get_orfs
|
|
170
|
+
cd /tmp/get_orfs
|
|
171
|
+
mkdir build && cd build
|
|
172
|
+
cmake ..
|
|
173
|
+
make
|
|
174
|
+
cmake --install . --prefix ..
|
|
175
|
+
|
|
176
|
+
# Add the get_orfs install directory to PATH
|
|
177
|
+
export PATH="/tmp/get_orfs/bin:$PATH"
|
|
178
|
+
# Or set GET_ORFS_PATH environment variable
|
|
179
|
+
export GET_ORFS_PATH=/tmp/get_orfs/bin/get_orfs
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## CLI Commands
|
|
183
|
+
|
|
184
|
+
### `genome_entropy run` - Complete Pipeline
|
|
185
|
+
|
|
186
|
+
Run all steps from DNA to 3Di with entropy calculation:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
genome_entropy run \
|
|
190
|
+
--input input.fasta \
|
|
191
|
+
--output results.json \
|
|
192
|
+
--table 11 \
|
|
193
|
+
--min-aa 30 \
|
|
194
|
+
--model Rostlab/ProstT5_fp16 \
|
|
195
|
+
--device auto
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
**Options:**
|
|
199
|
+
- `--input, -i`: Input FASTA file (required)
|
|
200
|
+
- `--output, -o`: Output JSON file (required)
|
|
201
|
+
- `--table, -t`: NCBI genetic code table ID (default: 11)
|
|
202
|
+
- `--min-aa`: Minimum protein length in amino acids (default: 30)
|
|
203
|
+
- `--model, -m`: ProstT5 model name (default: Rostlab/ProstT5_fp16)
|
|
204
|
+
- `--device, -d`: Device for inference (auto/cuda/mps/cpu)
|
|
205
|
+
- `--skip-entropy`: Skip entropy calculation
|
|
206
|
+
|
|
207
|
+
### `genome_entropy orf` - Find ORFs
|
|
208
|
+
|
|
209
|
+
Extract Open Reading Frames from DNA sequences:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
genome_entropy orf --input input.fasta --output orfs.json --table 11 --min-nt 90
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### `genome_entropy translate` - Translate ORFs
|
|
216
|
+
|
|
217
|
+
Translate ORFs to protein sequences:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
genome_entropy translate --input orfs.json --output proteins.json --table 11
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### `genome_entropy encode3di` - Encode to 3Di
|
|
224
|
+
|
|
225
|
+
Convert proteins to 3Di structural tokens using ProstT5:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
genome_entropy encode3di \
|
|
229
|
+
--input proteins.json \
|
|
230
|
+
--output 3di.json \
|
|
231
|
+
--model Rostlab/ProstT5_fp16 \
|
|
232
|
+
--device auto \
|
|
233
|
+
--batch-size 4
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### `genome_entropy entropy` - Calculate Entropy
|
|
237
|
+
|
|
238
|
+
Compute Shannon entropy at all representation levels:
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
genome_entropy entropy --input 3di.json --output entropy.json --normalize
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### `genome_entropy download` - Pre-download Models
|
|
245
|
+
|
|
246
|
+
Pre-download ProstT5 models to cache:
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
genome_entropy download --model Rostlab/ProstT5_fp16
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## Logging
|
|
253
|
+
|
|
254
|
+
All `genome_entropy` commands support comprehensive logging with configurable output and verbosity.
|
|
255
|
+
|
|
256
|
+
### Global Logging Options
|
|
257
|
+
|
|
258
|
+
These global logging options apply to every command and are given before the command name:
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
genome_entropy [OPTIONS] COMMAND [ARGS]
|
|
262
|
+
|
|
263
|
+
Global Options:
|
|
264
|
+
--log-level, -l TEXT Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) [default: INFO]
|
|
265
|
+
--log-file PATH Path to log file (default: log to STDOUT)
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Usage Examples
|
|
269
|
+
|
|
270
|
+
**Default logging (INFO level to STDOUT):**
|
|
271
|
+
```bash
|
|
272
|
+
genome_entropy run --input data.fasta --output results.json
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
**Debug logging to see detailed progress:**
|
|
276
|
+
```bash
|
|
277
|
+
genome_entropy --log-level DEBUG run --input data.fasta --output results.json
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
**Log to a file:**
|
|
281
|
+
```bash
|
|
282
|
+
genome_entropy --log-file pipeline.log run --input data.fasta --output results.json
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
**Debug logging to file:**
|
|
286
|
+
```bash
|
|
287
|
+
genome_entropy --log-level DEBUG --log-file debug.log run --input data.fasta --output results.json
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
**Quiet mode (only warnings and errors):**
|
|
291
|
+
```bash
|
|
292
|
+
genome_entropy --log-level WARNING run --input data.fasta --output results.json
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Log Levels
|
|
296
|
+
|
|
297
|
+
- **DEBUG**: Detailed information for diagnosing problems (sequence lengths, batch info, etc.)
|
|
298
|
+
- **INFO**: General informational messages (default - shows major steps and progress)
|
|
299
|
+
- **WARNING**: Warning messages for unusual conditions
|
|
300
|
+
- **ERROR**: Error messages for failures
|
|
301
|
+
- **CRITICAL**: Critical errors that may cause the program to abort
|
|
302
|
+
|
|
303
|
+
### What Gets Logged
|
|
304
|
+
|
|
305
|
+
The logging system tracks:
|
|
306
|
+
|
|
307
|
+
- **File I/O**: Reading/writing FASTA and JSON files with sequence counts
|
|
308
|
+
- **ORF Finding**: Number of ORFs found, binary checks, parsing progress
|
|
309
|
+
- **Translation**: Translation progress, codon handling, error details
|
|
310
|
+
- **3Di Encoding**: Model loading, batch processing, memory usage, timing estimates
|
|
311
|
+
- **Entropy Calculation**: Entropy values at each representation level
|
|
312
|
+
- **Pipeline Progress**: Step-by-step progress through the complete pipeline
|
|
313
|
+
|
|
314
|
+
Example log output (INFO level):
|
|
315
|
+
```
|
|
316
|
+
2026-01-19 10:30:15 - genome_entropy.io.fasta - INFO - Reading FASTA file: input.fasta
|
|
317
|
+
2026-01-19 10:30:15 - genome_entropy.io.fasta - INFO - Successfully read 5 sequence(s) from input.fasta
|
|
318
|
+
2026-01-19 10:30:15 - genome_entropy.orf.finder - INFO - Starting ORF finding for 5 sequence(s) (table=11, min_length=90)
|
|
319
|
+
2026-01-19 10:30:16 - genome_entropy.orf.finder - INFO - Found 47 ORF(s) in 5 sequence(s)
|
|
320
|
+
2026-01-19 10:30:16 - genome_entropy.translate.translator - INFO - Translating 47 ORF(s) with table 11
|
|
321
|
+
2026-01-19 10:30:16 - genome_entropy.encode3di.encoder - INFO - Loading ProstT5 model: Rostlab/ProstT5_fp16
|
|
322
|
+
2026-01-19 10:30:20 - genome_entropy.encode3di.encoder - INFO - Loaded model Rostlab/ProstT5_fp16 on device cuda
|
|
323
|
+
2026-01-19 10:30:20 - genome_entropy.encode3di.encoding - INFO - 3Di encoding batch 1 of 12 batches...
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## Data Flow
|
|
327
|
+
|
|
328
|
+
```
|
|
329
|
+
DNA FASTA → ORF Finder → ORFs (nucleotides)
|
|
330
|
+
↓
|
|
331
|
+
Translator → Proteins (amino acids)
|
|
332
|
+
↓
|
|
333
|
+
ProstT5 → 3Di tokens (structural alphabet)
|
|
334
|
+
↓
|
|
335
|
+
Shannon Entropy → Entropy Report
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
## Genetic Code Tables
|
|
339
|
+
|
|
340
|
+
The pipeline supports all NCBI genetic code tables. Common ones:
|
|
341
|
+
|
|
342
|
+
- **Table 1**: Standard genetic code
|
|
343
|
+
- **Table 11**: Bacterial, archaeal, and plant plastid code (default)
|
|
344
|
+
- **Table 4**: Mold, protozoan, and coelenterate mitochondrial code
|
|
345
|
+
|
|
346
|
+
See the full list at: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
|
|
347
|
+
|
|
348
|
+
## Output Format
|
|
349
|
+
|
|
350
|
+
Results are saved as JSON with the following structure:
|
|
351
|
+
|
|
352
|
+
```json
|
|
353
|
+
[
|
|
354
|
+
{
|
|
355
|
+
"input_id": "seq1",
|
|
356
|
+
"input_dna_length": 1000,
|
|
357
|
+
"orfs": [...],
|
|
358
|
+
"proteins": [...],
|
|
359
|
+
"three_dis": [...],
|
|
360
|
+
"entropy": {
|
|
361
|
+
"dna_entropy_global": 2.5,
|
|
362
|
+
"orf_nt_entropy": {"orf1": 1.8},
|
|
363
|
+
"protein_aa_entropy": {"orf1": 3.2},
|
|
364
|
+
"three_di_entropy": {"orf1": 2.9},
|
|
365
|
+
"alphabet_sizes": {"dna": 4, "protein": 20, "three_di": 20}
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
]
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
## Development
|
|
372
|
+
|
|
373
|
+
### Running Tests
|
|
374
|
+
|
|
375
|
+
```bash
|
|
376
|
+
# Run unit tests
|
|
377
|
+
pytest
|
|
378
|
+
|
|
379
|
+
# Run with coverage
|
|
380
|
+
pytest --cov=genome_entropy
|
|
381
|
+
|
|
382
|
+
# Skip integration tests (default)
|
|
383
|
+
pytest -k "not integration"
|
|
384
|
+
|
|
385
|
+
# Run integration tests (downloads models, slow)
|
|
386
|
+
RUN_INTEGRATION=1 pytest -v -m integration
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
### Code Quality
|
|
390
|
+
|
|
391
|
+
```bash
|
|
392
|
+
# Format code
|
|
393
|
+
black src/ tests/
|
|
394
|
+
|
|
395
|
+
# Lint
|
|
396
|
+
ruff check src/ tests/
|
|
397
|
+
|
|
398
|
+
# Type check
|
|
399
|
+
mypy src/genome_entropy/
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
### Project Structure
|
|
403
|
+
|
|
404
|
+
```
|
|
405
|
+
genome_entropy/
|
|
406
|
+
├── src/genome_entropy/
|
|
407
|
+
│ ├── io/ # FASTA and JSON I/O
|
|
408
|
+
│ ├── orf/ # ORF finding and types
|
|
409
|
+
│ ├── translate/ # Protein translation
|
|
410
|
+
│ ├── encode3di/ # 3Di encoding (ProstT5)
|
|
411
|
+
│ ├── entropy/ # Shannon entropy calculation
|
|
412
|
+
│ ├── pipeline/ # End-to-end orchestration
|
|
413
|
+
│ └── cli/ # Command-line interface
|
|
414
|
+
├── tests/ # Unit and integration tests
|
|
415
|
+
└── examples/ # Example data and scripts
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
## Citation
|
|
419
|
+
|
|
420
|
+
If you use this software, please cite:
|
|
421
|
+
|
|
422
|
+
- **ProstT5**: Heinzinger et al. (2023), "ProstT5: Bilingual Language Model for Protein Sequence and Structure"
|
|
423
|
+
- **get_orfs**: https://github.com/linsalrob/get_orfs
|
|
424
|
+
- **pygenetic-code**: https://github.com/linsalrob/genetic_codes
|
|
425
|
+
|
|
426
|
+
## License
|
|
427
|
+
|
|
428
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
429
|
+
|
|
430
|
+
## Author
|
|
431
|
+
|
|
432
|
+
Rob Edwards (@linsalrob)
|
|
433
|
+
Email: raedwards@gmail.com
|
|
434
|
+
|
|
435
|
+
## Contributing
|
|
436
|
+
|
|
437
|
+
Contributions welcome! Please:
|
|
438
|
+
|
|
439
|
+
1. Fork the repository
|
|
440
|
+
2. Create a feature branch
|
|
441
|
+
3. Add tests for new functionality
|
|
442
|
+
4. Ensure all tests pass
|
|
443
|
+
5. Submit a pull request
|
|
444
|
+
|
|
445
|
+
## Troubleshooting
|
|
446
|
+
|
|
447
|
+
### Common Issues
|
|
448
|
+
|
|
449
|
+
**ModuleNotFoundError: No module named 'genome_entropy'**
|
|
450
|
+
- Run `pip install -e .` from the repository root
|
|
451
|
+
|
|
452
|
+
**get_orfs binary not found**
|
|
453
|
+
- Install get_orfs and add it to your PATH, or set the GET_ORFS_PATH environment variable
|
|
454
|
+
|
|
455
|
+
**CUDA out of memory**
|
|
456
|
+
- Use CPU with `--device cpu` or reduce batch size with `--batch-size 1`
|
|
457
|
+
|
|
458
|
+
**Model download fails**
|
|
459
|
+
- Check internet connection
|
|
460
|
+
- Verify HuggingFace cache permissions (~/.cache/huggingface/)
|
|
461
|
+
|
|
462
|
+
**Integration tests run unexpectedly**
|
|
463
|
+
- Use `pytest -k "not integration"` to skip them
|
|
464
|
+
|
|
465
|
+
## Acknowledgments
|
|
466
|
+
|
|
467
|
+
- ProstT5 model by Rostlab
|
|
468
|
+
- get_orfs by Rob Edwards
|
|
469
|
+
- genetic_codes by Rob Edwards
|