giae 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. giae-0.2.0/.gitignore +181 -0
  2. giae-0.2.0/LICENSE +21 -0
  3. giae-0.2.0/PKG-INFO +411 -0
  4. giae-0.2.0/README.md +372 -0
  5. giae-0.2.0/data/prosite/prosite.dat +321000 -0
  6. giae-0.2.0/pyproject.toml +124 -0
  7. giae-0.2.0/src/giae/__init__.py +23 -0
  8. giae-0.2.0/src/giae/analysis/__init__.py +25 -0
  9. giae-0.2.0/src/giae/analysis/ai.py +94 -0
  10. giae-0.2.0/src/giae/analysis/blast_local.py +126 -0
  11. giae-0.2.0/src/giae/analysis/cache.py +171 -0
  12. giae-0.2.0/src/giae/analysis/hmmer.py +101 -0
  13. giae-0.2.0/src/giae/analysis/homology.py +307 -0
  14. giae-0.2.0/src/giae/analysis/interpro.py +238 -0
  15. giae-0.2.0/src/giae/analysis/motif.py +335 -0
  16. giae-0.2.0/src/giae/analysis/orf_finder.py +238 -0
  17. giae-0.2.0/src/giae/analysis/prosite.py +356 -0
  18. giae-0.2.0/src/giae/analysis/throttle.py +117 -0
  19. giae-0.2.0/src/giae/analysis/uniprot.py +330 -0
  20. giae-0.2.0/src/giae/cli/__init__.py +12 -0
  21. giae-0.2.0/src/giae/cli/db.py +166 -0
  22. giae-0.2.0/src/giae/cli/main.py +459 -0
  23. giae-0.2.0/src/giae/engine/__init__.py +16 -0
  24. giae-0.2.0/src/giae/engine/aggregator.py +208 -0
  25. giae-0.2.0/src/giae/engine/confidence.py +239 -0
  26. giae-0.2.0/src/giae/engine/conflict.py +114 -0
  27. giae-0.2.0/src/giae/engine/hypothesis.py +443 -0
  28. giae-0.2.0/src/giae/engine/interpreter.py +481 -0
  29. giae-0.2.0/src/giae/engine/novelty.py +328 -0
  30. giae-0.2.0/src/giae/engine/plugin.py +88 -0
  31. giae-0.2.0/src/giae/models/__init__.py +16 -0
  32. giae-0.2.0/src/giae/models/evidence.py +134 -0
  33. giae-0.2.0/src/giae/models/gene.py +188 -0
  34. giae-0.2.0/src/giae/models/genome.py +222 -0
  35. giae-0.2.0/src/giae/models/interpretation.py +182 -0
  36. giae-0.2.0/src/giae/models/protein.py +127 -0
  37. giae-0.2.0/src/giae/output/__init__.py +10 -0
  38. giae-0.2.0/src/giae/output/json_export.py +184 -0
  39. giae-0.2.0/src/giae/output/reasoning.py +120 -0
  40. giae-0.2.0/src/giae/output/report.py +290 -0
  41. giae-0.2.0/src/giae/parsers/__init__.py +13 -0
  42. giae-0.2.0/src/giae/parsers/base.py +137 -0
  43. giae-0.2.0/src/giae/parsers/fasta.py +130 -0
  44. giae-0.2.0/src/giae/parsers/genbank.py +279 -0
  45. giae-0.2.0/src/giae/py.typed +0 -0
  46. giae-0.2.0/tests/__init__.py +0 -0
  47. giae-0.2.0/tests/fixtures/sample.fasta +6 -0
  48. giae-0.2.0/tests/fixtures/sample.gb +36 -0
  49. giae-0.2.0/tests/test_analysis.py +210 -0
  50. giae-0.2.0/tests/test_cli.py +30 -0
  51. giae-0.2.0/tests/test_conflict.py +54 -0
  52. giae-0.2.0/tests/test_engine.py +286 -0
  53. giae-0.2.0/tests/test_integration_full.py +58 -0
  54. giae-0.2.0/tests/test_interpro.py +177 -0
  55. giae-0.2.0/tests/test_models.py +258 -0
  56. giae-0.2.0/tests/test_novelty.py +229 -0
  57. giae-0.2.0/tests/test_output.py +157 -0
  58. giae-0.2.0/tests/test_parsers.py +94 -0
giae-0.2.0/.gitignore ADDED
@@ -0,0 +1,181 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+ # Cursor
177
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
178
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
+ # refer to https://docs.cursor.com/context/ignore-files
180
+ .cursorignore
181
+ .cursorindexingignore
giae-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Atunrase Ayo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
giae-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,411 @@
1
+ Metadata-Version: 2.4
2
+ Name: giae
3
+ Version: 0.2.0
4
+ Summary: Genome Interpretation & Annotation Engine - An Explainable, Evidence-Centric Framework for Genomic Interpretation
5
+ Project-URL: Homepage, https://github.com/Ayo-Cyber/GIAE
6
+ Project-URL: Documentation, https://github.com/Ayo-Cyber/GIAE#readme
7
+ Project-URL: Repository, https://github.com/Ayo-Cyber/GIAE.git
8
+ Project-URL: Issues, https://github.com/Ayo-Cyber/GIAE/issues
9
+ Author: GIAE Contributors
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: annotation,bioinformatics,biopython,explainable-ai,genomics,interpretation
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.9
24
+ Requires-Dist: biopython>=1.83
25
+ Requires-Dist: click>=8.1.0
26
+ Requires-Dist: numpy>=1.20.0
27
+ Requires-Dist: rich>=13.0.0
28
+ Provides-Extra: ai
29
+ Requires-Dist: fair-esm>=2.0.0; extra == 'ai'
30
+ Requires-Dist: torch>=2.0.0; extra == 'ai'
31
+ Provides-Extra: dev
32
+ Requires-Dist: mypy>=1.8.0; extra == 'dev'
33
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
34
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
35
+ Requires-Dist: ruff>=0.2.0; extra == 'dev'
36
+ Provides-Extra: hmmer
37
+ Requires-Dist: pyhmmer>=0.10.0; extra == 'hmmer'
38
+ Description-Content-Type: text/markdown
39
+
40
+ # GIAE — Genome Interpretation & Annotation Engine
41
+
42
+ > **Explainability-first genome annotation. Every prediction shows its reasoning.**
43
+
44
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](https://python.org)
45
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
46
+ [![Version](https://img.shields.io/badge/version-0.2.0-green)](pyproject.toml)
47
+
48
+ Most genome annotation tools are overconfident. PROKKA, Bakta, and RAST assign a label, hide the evidence, and give you no way to know how certain the prediction is. GIAE takes the opposite approach: every gene interpretation includes the full evidence stack, confidence score, uncertainty sources, and a ranked list of competing hypotheses.
49
+
50
+ ---
51
+
52
+ ## What Makes GIAE Different
53
+
54
+ | Feature | PROKKA / Bakta / RAST | GIAE |
55
+ |---|---|---|
56
+ | Output | Label only | Label + evidence chain + confidence score |
57
+ | Uncertainty | Hidden | Explicit, calibrated per gene |
58
+ | Conflicting evidence | Silently resolved | Flagged and reported |
59
+ | Unknown genes | "hypothetical protein" | Ranked as research priorities |
60
+ | Reasoning | Opaque | Full reasoning chain in every report |
61
+
62
+ ---
63
+
64
+ ## 4-Layer Evidence Pipeline
65
+
66
+ ```
67
+ Genome (.gb / .fasta)
68
+
69
+
70
+ ┌──────────────────────────────────────────────┐
71
+ │ 1. PROSITE Motif Scan weight: 0.80 │ 1,298 curated patterns (bundled)
72
+ │ 2. EBI HMMER / Pfam Domains weight: 0.90 │ Pfam via EBI web API (online)
73
+ │ 3. UniProt API Lookup weight: 1.00 │ Swiss-Prot reviewed entries (online)
74
+ │ 4. Conflict Detection │ flags when sources disagree
75
+ └──────────────────────────────────────────────┘
76
+
77
+
78
+ Interpretation + Confidence Score + Novel Gene Report
79
+ ```
80
+
81
+ Confidence is computed from evidence convergence — when PROSITE, Pfam, and UniProt all agree, confidence is HIGH. When they disagree, the conflict is surfaced explicitly rather than silently resolved.
82
+
83
+ ---
84
+
85
+ ## Installation
86
+
87
+ Install from source (recommended while in active development):
88
+
89
+ ```bash
90
+ git clone https://github.com/Ayo-Cyber/GIAE.git
91
+ cd GIAE
92
+ pip install -e ".[dev]"
93
+ ```
94
+
95
+ Or via pip:
96
+
97
+ ```bash
98
+ pip install giae
99
+ ```
100
+
101
+ **Requirements:** Python 3.10+, BioPython, Click, NumPy, Rich. No local databases required for the base pipeline — PROSITE (1,298 patterns) is bundled.
102
+
103
+ ---
104
+
105
+ ## Quick Start
106
+
107
+ ```bash
108
+ # Offline mode: PROSITE patterns only — no network, instant startup
109
+ # Lambda phage (92 genes) completes in ~4 seconds
110
+ giae interpret lambda_phage.gb --no-uniprot --no-interpro
111
+
112
+ # Full pipeline: adds EBI HMMER + UniProt API calls
113
+ # Lambda phage takes ~6 minutes (network latency dominates)
114
+ giae interpret lambda_phage.gb
115
+
116
+ # Save Markdown report to file
117
+ giae interpret lambda_phage.gb --output lambda_report.md
118
+
119
+ # JSON output for downstream processing
120
+ giae interpret lambda_phage.gb --format json --output results.json
121
+
122
+ # Large genomes: parallel workers reduce wall time significantly
123
+ # T4 phage (288 genes) — use --no-uniprot --no-interpro for offline speed
124
+ giae interpret T4.gb --workers 4 --no-uniprot --no-interpro
125
+ ```
126
+
127
+ > **Runtime guide:** Offline (PROSITE only) runs in seconds for phage-sized genomes. Online mode (full pipeline) adds ~4–5 seconds per gene for API calls — plan for 5–10 minutes per phage, hours for large bacterial genomes. Use `--workers` to parallelise.
128
+
129
+ ---
130
+
131
+ ## Example Output
132
+
133
+ A well-characterised gene with converging evidence:
134
+
135
+ ```
136
+ Gene: J — Tail fiber protein
137
+ Hypothesis: Tail fiber / host receptor-binding protein
138
+ Confidence: HIGH (0.87)
139
+ Category: structural_protein
140
+
141
+ Evidence:
142
+ [0.82] PROSITE PS51123 — Phage tail fiber repeat
143
+ [0.94] Pfam PF09255 — Phage_tail_fib (e-value: 2.1e-14)
144
+ [0.90] UniProt P03722 — Tail fiber protein J, Lambda phage (Swiss-Prot reviewed)
145
+
146
+ Uncertainty sources: none
147
+ Competing hypotheses: none above threshold
148
+ ```
149
+
150
+ A dark-matter gene with zero detectable signal:
151
+
152
+ ```
153
+ Gene: B — hypothetical protein (147 aa)
154
+ Interpretation: NONE
155
+ Novel Gene Category: DARK MATTER
156
+ Priority: HIGH PRIORITY
157
+ Reason: No sequence homology, domains, or motifs detected
158
+
159
+ Suggested experiments:
160
+ • Recombinant expression and biochemical activity screening
161
+ • Deletion mutant phenotyping to assess essentiality
162
+ • Comparative genomics across related strains
163
+ • Structural characterization by cryo-EM
164
+ ```
165
+
166
+ ---
167
+
168
+ ## Confidence Levels
169
+
170
+ Every prediction carries a numeric score mapped to a named level:
171
+
172
+ | Level | Score range | Meaning |
173
+ |-------|-------------|---------|
174
+ | `HIGH` | ≥ 0.80 | Multiple evidence types converge; strong homology or domain hit |
175
+ | `MODERATE` | 0.50 – 0.79 | Some convergence; one strong signal or moderate homology |
176
+ | `LOW` | 0.30 – 0.49 | Weak or single-type evidence; treat as a lead, not a conclusion |
177
+ | `SPECULATIVE` | < 0.30 | Minimal signal; flagged for review |
178
+
179
+ Scores are boosted for evidence diversity (+0.10 for ≥2 types), strong homology (+0.05), and a high-confidence Pfam domain (+0.08). They are penalised for limited evidence (−0.10), hypothetical homologs (−0.15), and conflicting evidence (×0.80 multiplier). Predictions supported by a single evidence type (motif only) are capped at 0.85.
180
+
181
+ ---
182
+
183
+ ## 7-Phage Benchmark
184
+
185
+ Benchmarked on seven classic bacteriophage genomes (offline pipeline: PROSITE + Pfam):
186
+
187
+ | Phage | Genome | Genes | Interpreted | Dark Matter |
188
+ |-------|--------|-------|-------------|-------------|
189
+ | Lambda (λ) | 48.5 kb | 92 | 45 (48.9%) | 44 (47.8%) |
190
+ | T7 | 39.9 kb | 56 | 19 (33.9%) | 36 (64.3%) |
191
+ | PhiX174 | 5.4 kb | 11 | 0 (0.0%) | 11 (100%) |
192
+ | Phi29 | 19.3 kb | 30 | 13 (43.3%) | 16 (53.3%) |
193
+ | Mu | 36.7 kb | 56 | 20 (35.7%) | 35 (62.5%) |
194
+ | P22 | 41.7 kb | 69 | 30 (43.5%) | 38 (55.1%) |
195
+ | T4 | 168.9 kb | 288 | 75 (26.0%) | 213 (74.0%) |
196
+
197
+ **Median characterization rate: 35.7%**
198
+
199
+ > **Lambda with the full 4-layer online pipeline:** 48.9% — identical to offline. The 44 dark-matter genes remain dark not because databases are small, but because these proteins genuinely have no detectable sequence-based signal. That is the correct answer.
200
+
201
+ > **PhiX174 at 0%** is expected and correct. Its proteins (viral jelly-roll β-barrel capsid, DNA pilot tube) are structurally unique folds with no sequence-recognizable motifs. Structural homology search (Foldseek/AlphaFold) is next on the roadmap.
202
+
203
+ All 7 GenBank files and benchmark reports are in [`case_studies/`](case_studies/).
204
+
205
+ ---
206
+
207
+ ## Novel Gene Discovery
208
+
209
+ Every run produces a `Novel Gene Report` — a structured research agenda for genes that couldn't be interpreted:
210
+
211
+ ```
212
+ Novel Gene Discovery
213
+ Dark Matter: 44 (zero evidence from any source)
214
+ Weak Signal: 13 (confidence < 35%)
215
+ Conflicting: 0
216
+
217
+ Top Research Priorities:
218
+ 1. B — 147 aa HIGH PRIORITY dark_matter
219
+ 2. ea22 — 113 aa HIGH PRIORITY dark_matter
220
+ 3. orf — 98 aa HIGH PRIORITY dark_matter
221
+ ```
222
+
223
+ Three novelty categories:
224
+ - **Dark matter** — zero computational evidence from any source
225
+ - **Weak evidence** — some hits, but confidence below threshold (< 35%)
226
+ - **Conflict** — two or more evidence sources contradict each other
227
+
228
+ Each candidate includes suggested experiments scaled to protein length and category.
229
+
230
+ ---
231
+
232
+ ## Python API
233
+
234
+ Use GIAE programmatically for batch processing or integration into pipelines:
235
+
236
+ ```python
237
+ from giae.parsers.genbank import parse_genbank
238
+ from giae.engine.interpreter import Interpreter
239
+
240
+ # Load genome
241
+ genome = parse_genbank("lambda_phage.gb")
242
+
243
+ # Run offline pipeline (fast, no network)
244
+ interpreter = Interpreter(use_uniprot=False, use_interpro=False)
245
+ summary = interpreter.interpret_genome(genome)
246
+
247
+ print(f"Interpreted {summary.interpreted_genes}/{summary.total_genes} genes")
248
+ print(f"Dark matter: {summary.novel_gene_report.dark_matter_count}")
249
+
250
+ # Inspect individual results
251
+ for result in summary.results:
252
+ if result.interpretation:
253
+ print(result.interpretation.get_explanation())
254
+
255
+ # Quick single-sequence interpretation
256
+ interp = interpreter.quick_interpret("MKVLIFFVIALFSSATAAF...", sequence_type="protein")
257
+ print(interp)
258
+ ```
259
+
260
+ ---
261
+
262
+ ## CLI Reference
263
+
264
+ ```
265
+ Usage: giae [OPTIONS] COMMAND [ARGS]...
266
+
267
+ Commands:
268
+ interpret Interpret a genome file (.gb or .fasta)
269
+ db Database management (download databases, check status)
270
+ ```
271
+
272
+ ### `giae interpret`
273
+
274
+ ```
275
+ Options:
276
+ --output, -o PATH Write report to file (default: stdout)
277
+ --format, -f [report|json] Output format (default: report)
278
+ --workers, -w INT Parallel workers, 1–16 (default: 1)
279
+ --no-uniprot Skip UniProt API (offline mode)
280
+ --no-interpro Skip EBI HMMER domain search (offline mode)
281
+ --verbose, -v Show pipeline details
282
+ ```
283
+
284
+ ### `giae db`
285
+
286
+ Manages optional local databases for the plugin layer. The base pipeline (PROSITE) needs no setup — databases here unlock the local HMMER and BLAST plugins.
287
+
288
+ ```bash
289
+ # Check what's installed
290
+ giae db status
291
+
292
+ # Download PROSITE (latest from ExPASy — updates the bundled copy)
293
+ giae db download prosite
294
+
295
+ # Download SwissProt for local BLAST (requires BLAST+ installed)
296
+ giae db download swissprot
297
+
298
+ # Download Pfam for local HMMER (requires HMMER3 installed)
299
+ giae db download pfam
300
+
301
+ # Force re-download
302
+ giae db download prosite --force
303
+ ```
304
+
305
+ > **Do you need to run `giae db`?** Not for the base pipeline. PROSITE (1,298 patterns) is bundled. `giae db` is only needed if you want the local BLAST or HMMER plugins, which bypass the EBI web API by using locally installed tools and larger databases.
306
+
307
+ ---
308
+
309
+ ## Project Structure
310
+
311
+ ```
312
+ GIAE/
313
+ ├── src/giae/
314
+ │ ├── analysis/ # Evidence extraction modules
315
+ │ │ ├── motif.py # PROSITE pattern scanning
316
+ │ │ ├── prosite.py # PROSITE database parser
317
+ │ │ ├── uniprot.py # UniProt REST API client
318
+ │ │ ├── interpro.py # EBI HMMER / InterPro client
319
+ │ │ ├── hmmer.py # Local HMMER plugin
320
+ │ │ ├── blast_local.py # Local BLAST plugin
321
+ │ │ └── ai.py # ESM-2 embedding plugin
322
+ │ ├── cli/
323
+ │ │ ├── main.py # CLI entrypoint
324
+ │ │ └── db.py # Database management commands
325
+ │ ├── engine/
326
+ │ │ ├── interpreter.py # Main orchestrator
327
+ │ │ ├── aggregator.py # Evidence aggregation & weighting
328
+ │ │ ├── hypothesis.py # Hypothesis generation
329
+ │ │ ├── confidence.py # Confidence scoring
330
+ │ │ ├── conflict.py # Conflict detection
331
+ │ │ ├── novelty.py # Novel gene discovery & ranking
332
+ │ │ └── plugin.py # Plugin manager
333
+ │ ├── models/ # Core data models
334
+ │ │ ├── genome.py
335
+ │ │ ├── gene.py
336
+ │ │ ├── protein.py
337
+ │ │ ├── evidence.py
338
+ │ │ └── interpretation.py
339
+ │ ├── output/
340
+ │ │ ├── report.py # Markdown report generator
341
+ │ │ ├── reasoning.py # Reasoning chain formatter
342
+ │ │ └── json_export.py # JSON serialization
343
+ │ └── parsers/ # FASTA / GenBank parsers
344
+ ├── tests/ # pytest test suite
345
+ ├── case_studies/ # 7 phage GenBank files + benchmark reports
346
+ ├── data/prosite/ # Bundled PROSITE database (1,298 patterns)
347
+ ├── pyproject.toml
348
+ └── QUICKSTART.md
349
+ ```
350
+
351
+ ---
352
+
353
+ ## Plugin System
354
+
355
+ GIAE has a plugin architecture for optional heavy-weight evidence sources. Plugins are auto-detected at startup — if the required binary or database path doesn't exist, the plugin is silently skipped. The base pipeline always runs.
356
+
357
+ | Plugin | Requirement | Evidence Type | Weight |
358
+ |--------|-------------|---------------|--------|
359
+ | `HmmerPlugin` | HMMER3 + `~/.giae/hmmer/pfam.hmm` | `DOMAIN_HIT` | 0.90 |
360
+ | `BlastLocalPlugin` | BLAST+ + `~/.giae/blast/swissprot` | `BLAST_HOMOLOGY` | 1.00 |
361
+ | `EsmPlugin` | PyTorch + ESM-2 model | `SEQUENCE_FEATURE` | 0.50 |
362
+
363
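+ Download the plugin databases with `giae db download pfam` / `giae db download swissprot`. The HMMER3 and BLAST+ binaries must be installed separately, and the ESM plugin's Python dependencies (PyTorch, ESM) are available via `pip install "giae[ai]"`.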
364
+
365
+ ---
366
+
367
+ ## Roadmap
368
+
369
+ - [ ] **Foldseek / AlphaFold structural search** — `STRUCTURAL_HOMOLOGY` evidence (already in codebase); resolves PhiX174-class cases where sequence-based methods fail completely
370
+ - [ ] **EBI BLAST async API** — replace text-based UniProt search with real sequence similarity search
371
+ - [ ] **Interactive HTML reports** — evidence network visualization across all genes
372
+ - [ ] **Database caching** — avoid re-running API calls on identical sequences; essential for large genomes
373
+ - [ ] **Bacterial genome scaling** — currently validated on phages; next target is 4–6 Mb bacterial genomes
374
+ - [ ] **Comparison mode** — diff two genome interpretations side by side
375
+
376
+ ---
377
+
378
+ ## Contributing
379
+
380
+ Issues, PRs, and genome challenges welcome.
381
+
382
+ ```bash
383
+ # Run tests
384
+ pytest tests/ -v
385
+
386
+ # Run with coverage
387
+ pytest tests/ --cov=giae --cov-report=term-missing
388
+ ```
389
+
390
+ ---
391
+
392
+ ## Citation
393
+
394
+ If you use GIAE in research, please cite:
395
+
396
+ ```
397
+ GIAE — Genome Interpretation & Annotation Engine (v0.2.0)
398
+ https://github.com/Ayo-Cyber/GIAE
399
+ ```
400
+
401
+ A formal publication is in preparation.
402
+
403
+ ---
404
+
405
+ ## License
406
+
407
+ MIT — see [LICENSE](LICENSE).
408
+
409
+ ---
410
+
411
+ *GIAE v0.2.0 — Benchmarked on Lambda, T7, PhiX174, Phi29, Mu, P22, and T4.*