genome-entropy 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. genome_entropy-0.1.1/LICENSE +21 -0
  2. genome_entropy-0.1.1/PKG-INFO +469 -0
  3. genome_entropy-0.1.1/README.md +427 -0
  4. genome_entropy-0.1.1/pyproject.toml +89 -0
  5. genome_entropy-0.1.1/setup.cfg +4 -0
  6. genome_entropy-0.1.1/src/genome_entropy/__init__.py +3 -0
  7. genome_entropy-0.1.1/src/genome_entropy/cli/__init__.py +1 -0
  8. genome_entropy-0.1.1/src/genome_entropy/cli/commands/__init__.py +1 -0
  9. genome_entropy-0.1.1/src/genome_entropy/cli/commands/download.py +64 -0
  10. genome_entropy-0.1.1/src/genome_entropy/cli/commands/encode3di.py +131 -0
  11. genome_entropy-0.1.1/src/genome_entropy/cli/commands/entropy.py +99 -0
  12. genome_entropy-0.1.1/src/genome_entropy/cli/commands/estimate_tokens.py +112 -0
  13. genome_entropy-0.1.1/src/genome_entropy/cli/commands/orf.py +63 -0
  14. genome_entropy-0.1.1/src/genome_entropy/cli/commands/run.py +164 -0
  15. genome_entropy-0.1.1/src/genome_entropy/cli/commands/translate.py +66 -0
  16. genome_entropy-0.1.1/src/genome_entropy/cli/main.py +97 -0
  17. genome_entropy-0.1.1/src/genome_entropy/config.py +52 -0
  18. genome_entropy-0.1.1/src/genome_entropy/encode3di/__init__.py +19 -0
  19. genome_entropy-0.1.1/src/genome_entropy/encode3di/encoder.py +367 -0
  20. genome_entropy-0.1.1/src/genome_entropy/encode3di/encoding.py +200 -0
  21. genome_entropy-0.1.1/src/genome_entropy/encode3di/gpu_utils.py +151 -0
  22. genome_entropy-0.1.1/src/genome_entropy/encode3di/multi_gpu.py +335 -0
  23. genome_entropy-0.1.1/src/genome_entropy/encode3di/prostt5.py +15 -0
  24. genome_entropy-0.1.1/src/genome_entropy/encode3di/token_estimator.py +214 -0
  25. genome_entropy-0.1.1/src/genome_entropy/encode3di/types.py +33 -0
  26. genome_entropy-0.1.1/src/genome_entropy/entropy/__init__.py +1 -0
  27. genome_entropy-0.1.1/src/genome_entropy/entropy/shannon.py +122 -0
  28. genome_entropy-0.1.1/src/genome_entropy/errors.py +55 -0
  29. genome_entropy-0.1.1/src/genome_entropy/io/__init__.py +1 -0
  30. genome_entropy-0.1.1/src/genome_entropy/io/fasta.py +136 -0
  31. genome_entropy-0.1.1/src/genome_entropy/io/genbank.py +195 -0
  32. genome_entropy-0.1.1/src/genome_entropy/io/jsonio.py +81 -0
  33. genome_entropy-0.1.1/src/genome_entropy/logging_config.py +148 -0
  34. genome_entropy-0.1.1/src/genome_entropy/orf/__init__.py +1 -0
  35. genome_entropy-0.1.1/src/genome_entropy/orf/finder.py +228 -0
  36. genome_entropy-0.1.1/src/genome_entropy/orf/types.py +50 -0
  37. genome_entropy-0.1.1/src/genome_entropy/pipeline/__init__.py +1 -0
  38. genome_entropy-0.1.1/src/genome_entropy/pipeline/runner.py +292 -0
  39. genome_entropy-0.1.1/src/genome_entropy/translate/__init__.py +1 -0
  40. genome_entropy-0.1.1/src/genome_entropy/translate/translator.py +110 -0
  41. genome_entropy-0.1.1/src/genome_entropy.egg-info/PKG-INFO +469 -0
  42. genome_entropy-0.1.1/src/genome_entropy.egg-info/SOURCES.txt +57 -0
  43. genome_entropy-0.1.1/src/genome_entropy.egg-info/dependency_links.txt +1 -0
  44. genome_entropy-0.1.1/src/genome_entropy.egg-info/entry_points.txt +2 -0
  45. genome_entropy-0.1.1/src/genome_entropy.egg-info/requires.txt +23 -0
  46. genome_entropy-0.1.1/src/genome_entropy.egg-info/top_level.txt +1 -0
  47. genome_entropy-0.1.1/tests/test_basic.py +14 -0
  48. genome_entropy-0.1.1/tests/test_cli_smoke.py +95 -0
  49. genome_entropy-0.1.1/tests/test_encoder_methods.py +122 -0
  50. genome_entropy-0.1.1/tests/test_entropy.py +124 -0
  51. genome_entropy-0.1.1/tests/test_genbank.py +252 -0
  52. genome_entropy-0.1.1/tests/test_gpu_discovery.py +217 -0
  53. genome_entropy-0.1.1/tests/test_logging_config.py +185 -0
  54. genome_entropy-0.1.1/tests/test_multi_gpu_encoding.py +298 -0
  55. genome_entropy-0.1.1/tests/test_orf_finder.py +205 -0
  56. genome_entropy-0.1.1/tests/test_prostt5_integration.py +74 -0
  57. genome_entropy-0.1.1/tests/test_token_budget_batches.py +282 -0
  58. genome_entropy-0.1.1/tests/test_token_estimator.py +101 -0
  59. genome_entropy-0.1.1/tests/test_translation.py +163 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Rob Edwards
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,469 @@
1
+ Metadata-Version: 2.4
2
+ Name: genome_entropy
3
+ Version: 0.1.1
4
+ Summary: Quantify information content across multiple biological representations derived from genomic sequences
5
+ Author-email: Rob Edwards <raedwards@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/linsalrob/genome_entropy
8
+ Project-URL: Repository, https://github.com/linsalrob/genome_entropy
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: torch>=2.0.0
21
+ Requires-Dist: transformers>=4.30.0
22
+ Requires-Dist: pygenetic_code>=0.20.0
23
+ Requires-Dist: typer>=0.9.0
24
+ Requires-Dist: tqdm>=4.65.0
25
+ Requires-Dist: protobuf>=6.33.1
26
+ Requires-Dist: sentencepiece>=0.2.1
27
+ Requires-Dist: biopython>=1.80
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7.0; extra == "dev"
30
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
31
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
32
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
33
+ Requires-Dist: black>=23.0.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
35
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
36
+ Provides-Extra: docs
37
+ Requires-Dist: sphinx>=7.0.0; extra == "docs"
38
+ Requires-Dist: sphinx-rtd-theme>=2.0.0; extra == "docs"
39
+ Requires-Dist: myst-parser>=2.0.0; extra == "docs"
40
+ Requires-Dist: linkify-it-py>=2.0.0; extra == "docs"
41
+ Dynamic: license-file
42
+
43
+ # genome_entropy
44
+
45
+ [![Edwards Lab](https://img.shields.io/badge/Bioinformatics-EdwardsLab-03A9F4)](https://edwards.flinders.edu.au/)
46
+
47
+ [![Python CI](https://github.com/linsalrob/genome_entropy/workflows/Python%20CI/badge.svg)](https://github.com/linsalrob/genome_entropy/actions)
48
+ [![Documentation](https://github.com/linsalrob/genome_entropy/workflows/Documentation/badge.svg)](https://linsalrob.github.io/genome_entropy/)
49
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
50
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
51
+
52
+ Quantify information content across multiple biological representations derived from genomic sequences.
53
+
54
+ **genome_entropy** is a complete bioinformatics pipeline that converts DNA sequences → ORFs → proteins → 3Di structural tokens, computing Shannon entropy at each representation level.
55
+
56
+ ## Why genome_entropy?
57
+
58
+ We refer to this framework as **genome-entropy** to emphasise its unifying focus on quantifying information content across multiple biological representations derived from the same genomic sequence. Rather than restricting analysis to a single abstraction, such as nucleotide composition or predicted coding regions, genome-entropy integrates DNA sequences, open reading frames, translated proteins, and structure-derived encodings (3Di) within a common information-theoretic framework. The name reflects both the biological scope of the approach—operating at the level of whole genomes and metagenomes—and the central analytical principle, entropy, which provides a consistent and comparable measure of complexity, organisation, and constraint across representations. This design allows direct comparison of informational signatures across molecular layers while remaining extensible to additional encodings as methods and data evolve.
59
+
60
+ ## Documentation
61
+
62
+ 📚 **[Read the full documentation on GitHub Pages](https://linsalrob.github.io/genome_entropy/)**
63
+
64
+ 📚 **[Read the full documentation on Read The Docs](https://genome-entropy.readthedocs.io/en/latest/)**
65
+
66
+ The documentation includes:
67
+ - Installation guide
68
+ - Quick start tutorial
69
+ - Complete CLI reference
70
+ - Python API documentation
71
+ - User guide with detailed explanations
72
+ - Developer guide for contributors
73
+
74
+ ## Features
75
+
76
+ - 🧬 **ORF Finding**: Extract Open Reading Frames from DNA sequences using customizable genetic codes
77
+ - 🔄 **Translation**: Convert ORFs to protein sequences with support for all NCBI genetic code tables
78
+ - 🏗️ **3Di Encoding**: Predict structural alphabet tokens directly from sequences using ProstT5
79
+ - 📊 **Entropy Analysis**: Calculate Shannon entropy at DNA, ORF, protein, and 3Di levels
80
+ - ⚡ **GPU Acceleration**: Auto-detect and use CUDA, MPS (Apple Silicon), or CPU
81
+ - 🚀 **Multi-GPU Support**: Parallelize 3Di encoding across multiple GPUs for faster processing
82
+ - 🔧 **Modular CLI**: Run complete pipeline or individual steps
83
+ - 📝 **Comprehensive Logging**: Configurable log levels and output to file or STDOUT
84
+
85
+ ## Quick Start
86
+
87
+ ### Installation
88
+
89
+ #### Recommended
90
+
91
+ Install with pip:
92
+
93
+ ```bash
94
+ pip install genome-entropy
95
+ ```
96
+
97
+ #### For developers
98
+
99
+ ```bash
100
+ # Clone repository
101
+ git clone https://github.com/linsalrob/genome_entropy.git
102
+ cd genome_entropy
103
+
104
+ # Create virtual environment
105
+ python3 -m venv venv
106
+ source venv/bin/activate # On Windows: venv\Scripts\activate
107
+
108
+ # Install the package in editable mode, including the optional development dependencies
109
+ pip install -e ".[dev]"
110
+ ```
111
+
112
+ ### Basic Usage
113
+
114
+ ```bash
115
+ # Run complete pipeline
116
+ genome_entropy run --input examples/example_small.fasta --output results.json
117
+
118
+ # Or run individual steps
119
+ genome_entropy orf --input input.fasta --output orfs.json
120
+ genome_entropy translate --input orfs.json --output proteins.json
121
+ genome_entropy encode3di --input proteins.json --output 3di.json
122
+ genome_entropy entropy --input 3di.json --output entropy.json
123
+ ```
124
+
125
+ ### Multi-GPU Usage
126
+
127
+ Speed up 3Di encoding by distributing batches across multiple GPUs:
128
+
129
+ ```bash
130
+ # Auto-discover and use all available GPUs
131
+ genome_entropy run --input input.fasta --output results.json --multi-gpu
132
+
133
+ # Use specific GPUs
134
+ genome_entropy run --input input.fasta --output results.json --multi-gpu --gpu-ids 0,1,2
135
+
136
+ # Works with SLURM job schedulers (GPUs auto-discovered from SLURM_JOB_GPUS)
137
+ srun --gres=gpu:4 genome_entropy run --input input.fasta --output results.json --multi-gpu
138
+
139
+ # Multi-GPU encoding also works for the encode3di command
140
+ genome_entropy encode3di --input proteins.json --output 3di.json --multi-gpu
141
+ ```
142
+
143
+ **GPU Discovery Priority:**
144
+ 1. `SLURM_JOB_GPUS` environment variable (SLURM job allocations)
145
+ 2. `SLURM_GPUS` environment variable
146
+ 3. `CUDA_VISIBLE_DEVICES` environment variable
147
+ 4. `torch.cuda.device_count()` (all available GPUs)
148
+
149
+ See `examples/multi_gpu_example.py` for more usage examples.
150
+
151
+ ## Requirements
152
+
153
+ ### Python Dependencies
154
+
155
+ - Python 3.10 or higher
156
+ - PyTorch >= 2.0.0 (GPU support optional)
157
+ - Transformers >= 4.30.0 (HuggingFace)
158
+ - pygenetic-code >= 0.20.0
159
+ - typer >= 0.9.0
160
+
161
+ ### External Binary: get_orfs
162
+
163
+ The ORF finder requires the `get_orfs` binary from https://github.com/linsalrob/get_orfs
164
+
165
+ **Installation:**
166
+
167
+ ```bash
168
+ # Clone and build get_orfs
169
+ git clone https://github.com/linsalrob/get_orfs.git /tmp/get_orfs
170
+ cd /tmp/get_orfs
171
+ mkdir build && cd build
172
+ cmake ..
173
+ make
174
+ cmake --install . --prefix ..
175
+
176
+ # Add to PATH or set environment variable
177
+ export PATH="/tmp/get_orfs/bin:$PATH"
178
+ # Or set GET_ORFS_PATH environment variable
179
+ export GET_ORFS_PATH=/tmp/get_orfs/bin/get_orfs
180
+ ```
181
+
182
+ ## CLI Commands
183
+
184
+ ### `genome_entropy run` - Complete Pipeline
185
+
186
+ Run all steps from DNA to 3Di with entropy calculation:
187
+
188
+ ```bash
189
+ genome_entropy run \
190
+ --input input.fasta \
191
+ --output results.json \
192
+ --table 11 \
193
+ --min-aa 30 \
194
+ --model Rostlab/ProstT5_fp16 \
195
+ --device auto
196
+ ```
197
+
198
+ **Options:**
199
+ - `--input, -i`: Input FASTA file (required)
200
+ - `--output, -o`: Output JSON file (required)
201
+ - `--table, -t`: NCBI genetic code table ID (default: 11)
202
+ - `--min-aa`: Minimum protein length in amino acids (default: 30)
203
+ - `--model, -m`: ProstT5 model name (default: Rostlab/ProstT5_fp16)
204
+ - `--device, -d`: Device for inference (auto/cuda/mps/cpu)
205
+ - `--skip-entropy`: Skip entropy calculation
206
+
207
+ ### `genome_entropy orf` - Find ORFs
208
+
209
+ Extract Open Reading Frames from DNA sequences:
210
+
211
+ ```bash
212
+ genome_entropy orf --input input.fasta --output orfs.json --table 11 --min-nt 90
213
+ ```
214
+
215
+ ### `genome_entropy translate` - Translate ORFs
216
+
217
+ Translate ORFs to protein sequences:
218
+
219
+ ```bash
220
+ genome_entropy translate --input orfs.json --output proteins.json --table 11
221
+ ```
222
+
223
+ ### `genome_entropy encode3di` - Encode to 3Di
224
+
225
+ Convert proteins to 3Di structural tokens using ProstT5:
226
+
227
+ ```bash
228
+ genome_entropy encode3di \
229
+ --input proteins.json \
230
+ --output 3di.json \
231
+ --model Rostlab/ProstT5_fp16 \
232
+ --device auto \
233
+ --batch-size 4
234
+ ```
235
+
236
+ ### `genome_entropy entropy` - Calculate Entropy
237
+
238
+ Compute Shannon entropy at all representation levels:
239
+
240
+ ```bash
241
+ genome_entropy entropy --input 3di.json --output entropy.json --normalize
242
+ ```
243
+
244
+ ### `genome_entropy download` - Pre-download Models
245
+
246
+ Pre-download ProstT5 models to cache:
247
+
248
+ ```bash
249
+ genome_entropy download --model Rostlab/ProstT5_fp16
250
+ ```
251
+
252
+ ## Logging
253
+
254
+ All `genome_entropy` commands support comprehensive logging with configurable output and verbosity.
255
+
256
+ ### Global Logging Options
257
+
258
+ Every command accepts these logging options:
259
+
260
+ ```bash
261
+ genome_entropy [OPTIONS] COMMAND [ARGS]
262
+
263
+ Global Options:
264
+ --log-level, -l TEXT Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) [default: INFO]
265
+ --log-file PATH Path to log file (default: log to STDOUT)
266
+ ```
267
+
268
+ ### Usage Examples
269
+
270
+ **Default logging (INFO level to STDOUT):**
271
+ ```bash
272
+ genome_entropy run --input data.fasta --output results.json
273
+ ```
274
+
275
+ **Debug logging to see detailed progress:**
276
+ ```bash
277
+ genome_entropy --log-level DEBUG run --input data.fasta --output results.json
278
+ ```
279
+
280
+ **Log to a file:**
281
+ ```bash
282
+ genome_entropy --log-file pipeline.log run --input data.fasta --output results.json
283
+ ```
284
+
285
+ **Debug logging to file:**
286
+ ```bash
287
+ genome_entropy --log-level DEBUG --log-file debug.log run --input data.fasta --output results.json
288
+ ```
289
+
290
+ **Quiet mode (only warnings and errors):**
291
+ ```bash
292
+ genome_entropy --log-level WARNING run --input data.fasta --output results.json
293
+ ```
294
+
295
+ ### Log Levels
296
+
297
+ - **DEBUG**: Detailed information for diagnosing problems (sequence lengths, batch info, etc.)
298
+ - **INFO**: General informational messages (default - shows major steps and progress)
299
+ - **WARNING**: Warning messages for unusual conditions
300
+ - **ERROR**: Error messages for failures
301
+ - **CRITICAL**: Critical errors that may cause the program to abort
302
+
303
+ ### What Gets Logged
304
+
305
+ The logging system tracks:
306
+
307
+ - **File I/O**: Reading/writing FASTA and JSON files with sequence counts
308
+ - **ORF Finding**: Number of ORFs found, binary checks, parsing progress
309
+ - **Translation**: Translation progress, codon handling, error details
310
+ - **3Di Encoding**: Model loading, batch processing, memory usage, timing estimates
311
+ - **Entropy Calculation**: Entropy values at each representation level
312
+ - **Pipeline Progress**: Step-by-step progress through the complete pipeline
313
+
314
+ Example log output (INFO level):
315
+ ```
316
+ 2026-01-19 10:30:15 - genome_entropy.io.fasta - INFO - Reading FASTA file: input.fasta
317
+ 2026-01-19 10:30:15 - genome_entropy.io.fasta - INFO - Successfully read 5 sequence(s) from input.fasta
318
+ 2026-01-19 10:30:15 - genome_entropy.orf.finder - INFO - Starting ORF finding for 5 sequence(s) (table=11, min_length=90)
319
+ 2026-01-19 10:30:16 - genome_entropy.orf.finder - INFO - Found 47 ORF(s) in 5 sequence(s)
320
+ 2026-01-19 10:30:16 - genome_entropy.translate.translator - INFO - Translating 47 ORF(s) with table 11
321
+ 2026-01-19 10:30:16 - genome_entropy.encode3di.encoder - INFO - Loading ProstT5 model: Rostlab/ProstT5_fp16
322
+ 2026-01-19 10:30:20 - genome_entropy.encode3di.encoder - INFO - Loaded model Rostlab/ProstT5_fp16 on device cuda
323
+ 2026-01-19 10:30:20 - genome_entropy.encode3di.encoding - INFO - 3Di encoding batch 1 of 12 batches...
324
+ ```
325
+
326
+ ## Data Flow
327
+
328
+ ```
329
+ DNA FASTA → ORF Finder → ORFs (nucleotides)
330
+ ↓
331
+ Translator → Proteins (amino acids)
332
+ ↓
333
+ ProstT5 → 3Di tokens (structural alphabet)
334
+ ↓
335
+ Shannon Entropy → Entropy Report
336
+ ```
337
+
338
+ ## Genetic Code Tables
339
+
340
+ The pipeline supports all NCBI genetic code tables. Common ones:
341
+
342
+ - **Table 1**: Standard genetic code
343
+ - **Table 11**: Bacterial, archaeal, and plant plastid code (default)
344
+ - **Table 4**: Mold, protozoan, and coelenterate mitochondrial code
345
+
346
+ See full list at: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
347
+
348
+ ## Output Format
349
+
350
+ Results are saved as JSON with the following structure:
351
+
352
+ ```json
353
+ [
354
+ {
355
+ "input_id": "seq1",
356
+ "input_dna_length": 1000,
357
+ "orfs": [...],
358
+ "proteins": [...],
359
+ "three_dis": [...],
360
+ "entropy": {
361
+ "dna_entropy_global": 2.5,
362
+ "orf_nt_entropy": {"orf1": 1.8},
363
+ "protein_aa_entropy": {"orf1": 3.2},
364
+ "three_di_entropy": {"orf1": 2.9},
365
+ "alphabet_sizes": {"dna": 4, "protein": 20, "three_di": 20}
366
+ }
367
+ }
368
+ ]
369
+ ```
370
+
371
+ ## Development
372
+
373
+ ### Running Tests
374
+
375
+ ```bash
376
+ # Run unit tests
377
+ pytest
378
+
379
+ # Run with coverage
380
+ pytest --cov=genome_entropy
381
+
382
+ # Skip integration tests (default)
383
+ pytest -k "not integration"
384
+
385
+ # Run integration tests (downloads models, slow)
386
+ RUN_INTEGRATION=1 pytest -v -m integration
387
+ ```
388
+
389
+ ### Code Quality
390
+
391
+ ```bash
392
+ # Format code
393
+ black src/ tests/
394
+
395
+ # Lint
396
+ ruff check src/ tests/
397
+
398
+ # Type check
399
+ mypy src/genome_entropy/
400
+ ```
401
+
402
+ ### Project Structure
403
+
404
+ ```
405
+ genome_entropy/
406
+ ├── src/genome_entropy/
407
+ │ ├── io/ # FASTA and JSON I/O
408
+ │ ├── orf/ # ORF finding and types
409
+ │ ├── translate/ # Protein translation
410
+ │ ├── encode3di/ # 3Di encoding (ProstT5)
411
+ │ ├── entropy/ # Shannon entropy calculation
412
+ │ ├── pipeline/ # End-to-end orchestration
413
+ │ └── cli/ # Command-line interface
414
+ ├── tests/ # Unit and integration tests
415
+ └── examples/ # Example data and scripts
416
+ ```
417
+
418
+ ## Citation
419
+
420
+ If you use this software, please cite:
421
+
422
+ - **ProstT5**: Heinzinger et al. (2023), "ProstT5: Bilingual Language Model for Protein Sequence and Structure"
423
+ - **get_orfs**: https://github.com/linsalrob/get_orfs
424
+ - **pygenetic-code**: https://github.com/linsalrob/genetic_codes
425
+
426
+ ## License
427
+
428
+ MIT License - see [LICENSE](LICENSE) file for details.
429
+
430
+ ## Author
431
+
432
+ Rob Edwards (@linsalrob)
433
+ Email: raedwards@gmail.com
434
+
435
+ ## Contributing
436
+
437
+ Contributions welcome! Please:
438
+
439
+ 1. Fork the repository
440
+ 2. Create a feature branch
441
+ 3. Add tests for new functionality
442
+ 4. Ensure all tests pass
443
+ 5. Submit a pull request
444
+
445
+ ## Troubleshooting
446
+
447
+ ### Common Issues
448
+
449
+ **ModuleNotFoundError: No module named 'genome_entropy'**
450
+ - Install the package with `pip install genome-entropy`, or run `pip install -e .` from the repository root when working from a source checkout
451
+
452
+ **get_orfs binary not found**
453
+ - Install get_orfs and add to PATH or set GET_ORFS_PATH environment variable
454
+
455
+ **CUDA out of memory**
456
+ - Use CPU with `--device cpu` or reduce batch size with `--batch-size 1`
457
+
458
+ **Model download fails**
459
+ - Check internet connection
460
+ - Verify HuggingFace cache permissions (~/.cache/huggingface/)
461
+
462
+ **Integration tests run unexpectedly**
463
+ - Use `pytest -k "not integration"` to skip them
464
+
465
+ ## Acknowledgments
466
+
467
+ - ProstT5 model by Rostlab
468
+ - get_orfs by Rob Edwards
469
+ - genetic_codes by Rob Edwards