giae 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- giae-0.2.0/.gitignore +181 -0
- giae-0.2.0/LICENSE +21 -0
- giae-0.2.0/PKG-INFO +411 -0
- giae-0.2.0/README.md +372 -0
- giae-0.2.0/data/prosite/prosite.dat +321000 -0
- giae-0.2.0/pyproject.toml +124 -0
- giae-0.2.0/src/giae/__init__.py +23 -0
- giae-0.2.0/src/giae/analysis/__init__.py +25 -0
- giae-0.2.0/src/giae/analysis/ai.py +94 -0
- giae-0.2.0/src/giae/analysis/blast_local.py +126 -0
- giae-0.2.0/src/giae/analysis/cache.py +171 -0
- giae-0.2.0/src/giae/analysis/hmmer.py +101 -0
- giae-0.2.0/src/giae/analysis/homology.py +307 -0
- giae-0.2.0/src/giae/analysis/interpro.py +238 -0
- giae-0.2.0/src/giae/analysis/motif.py +335 -0
- giae-0.2.0/src/giae/analysis/orf_finder.py +238 -0
- giae-0.2.0/src/giae/analysis/prosite.py +356 -0
- giae-0.2.0/src/giae/analysis/throttle.py +117 -0
- giae-0.2.0/src/giae/analysis/uniprot.py +330 -0
- giae-0.2.0/src/giae/cli/__init__.py +12 -0
- giae-0.2.0/src/giae/cli/db.py +166 -0
- giae-0.2.0/src/giae/cli/main.py +459 -0
- giae-0.2.0/src/giae/engine/__init__.py +16 -0
- giae-0.2.0/src/giae/engine/aggregator.py +208 -0
- giae-0.2.0/src/giae/engine/confidence.py +239 -0
- giae-0.2.0/src/giae/engine/conflict.py +114 -0
- giae-0.2.0/src/giae/engine/hypothesis.py +443 -0
- giae-0.2.0/src/giae/engine/interpreter.py +481 -0
- giae-0.2.0/src/giae/engine/novelty.py +328 -0
- giae-0.2.0/src/giae/engine/plugin.py +88 -0
- giae-0.2.0/src/giae/models/__init__.py +16 -0
- giae-0.2.0/src/giae/models/evidence.py +134 -0
- giae-0.2.0/src/giae/models/gene.py +188 -0
- giae-0.2.0/src/giae/models/genome.py +222 -0
- giae-0.2.0/src/giae/models/interpretation.py +182 -0
- giae-0.2.0/src/giae/models/protein.py +127 -0
- giae-0.2.0/src/giae/output/__init__.py +10 -0
- giae-0.2.0/src/giae/output/json_export.py +184 -0
- giae-0.2.0/src/giae/output/reasoning.py +120 -0
- giae-0.2.0/src/giae/output/report.py +290 -0
- giae-0.2.0/src/giae/parsers/__init__.py +13 -0
- giae-0.2.0/src/giae/parsers/base.py +137 -0
- giae-0.2.0/src/giae/parsers/fasta.py +130 -0
- giae-0.2.0/src/giae/parsers/genbank.py +279 -0
- giae-0.2.0/src/giae/py.typed +0 -0
- giae-0.2.0/tests/__init__.py +0 -0
- giae-0.2.0/tests/fixtures/sample.fasta +6 -0
- giae-0.2.0/tests/fixtures/sample.gb +36 -0
- giae-0.2.0/tests/test_analysis.py +210 -0
- giae-0.2.0/tests/test_cli.py +30 -0
- giae-0.2.0/tests/test_conflict.py +54 -0
- giae-0.2.0/tests/test_engine.py +286 -0
- giae-0.2.0/tests/test_integration_full.py +58 -0
- giae-0.2.0/tests/test_interpro.py +177 -0
- giae-0.2.0/tests/test_models.py +258 -0
- giae-0.2.0/tests/test_novelty.py +229 -0
- giae-0.2.0/tests/test_output.py +157 -0
- giae-0.2.0/tests/test_parsers.py +94 -0
giae-0.2.0/.gitignore
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
|
|
110
|
+
# pdm
|
|
111
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
112
|
+
#pdm.lock
|
|
113
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
114
|
+
# in version control.
|
|
115
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
|
116
|
+
.pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
121
|
+
__pypackages__/
|
|
122
|
+
|
|
123
|
+
# Celery stuff
|
|
124
|
+
celerybeat-schedule
|
|
125
|
+
celerybeat.pid
|
|
126
|
+
|
|
127
|
+
# SageMath parsed files
|
|
128
|
+
*.sage.py
|
|
129
|
+
|
|
130
|
+
# Environments
|
|
131
|
+
.env
|
|
132
|
+
.venv
|
|
133
|
+
env/
|
|
134
|
+
venv/
|
|
135
|
+
ENV/
|
|
136
|
+
env.bak/
|
|
137
|
+
venv.bak/
|
|
138
|
+
|
|
139
|
+
# Spyder project settings
|
|
140
|
+
.spyderproject
|
|
141
|
+
.spyproject
|
|
142
|
+
|
|
143
|
+
# Rope project settings
|
|
144
|
+
.ropeproject
|
|
145
|
+
|
|
146
|
+
# mkdocs documentation
|
|
147
|
+
/site
|
|
148
|
+
|
|
149
|
+
# mypy
|
|
150
|
+
.mypy_cache/
|
|
151
|
+
.dmypy.json
|
|
152
|
+
dmypy.json
|
|
153
|
+
|
|
154
|
+
# Pyre type checker
|
|
155
|
+
.pyre/
|
|
156
|
+
|
|
157
|
+
# pytype static type analyzer
|
|
158
|
+
.pytype/
|
|
159
|
+
|
|
160
|
+
# Cython debug symbols
|
|
161
|
+
cython_debug/
|
|
162
|
+
|
|
163
|
+
# PyCharm
|
|
164
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
165
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
166
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
167
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
168
|
+
#.idea/
|
|
169
|
+
|
|
170
|
+
# Ruff stuff:
|
|
171
|
+
.ruff_cache/
|
|
172
|
+
|
|
173
|
+
# PyPI configuration file
|
|
174
|
+
.pypirc
|
|
175
|
+
|
|
176
|
+
# Cursor
|
|
177
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
178
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
179
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
180
|
+
.cursorignore
|
|
181
|
+
.cursorindexingignore
|
giae-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Atunrase Ayo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
giae-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: giae
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Genome Interpretation & Annotation Engine - An Explainable, Evidence-Centric Framework for Genomic Interpretation
|
|
5
|
+
Project-URL: Homepage, https://github.com/Ayo-Cyber/GIAE
|
|
6
|
+
Project-URL: Documentation, https://github.com/Ayo-Cyber/GIAE#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/Ayo-Cyber/GIAE.git
|
|
8
|
+
Project-URL: Issues, https://github.com/Ayo-Cyber/GIAE/issues
|
|
9
|
+
Author: GIAE Contributors
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: annotation,bioinformatics,biopython,explainable-ai,genomics,interpretation
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Requires-Dist: biopython>=1.83
|
|
25
|
+
Requires-Dist: click>=8.1.0
|
|
26
|
+
Requires-Dist: numpy>=1.20.0
|
|
27
|
+
Requires-Dist: rich>=13.0.0
|
|
28
|
+
Provides-Extra: ai
|
|
29
|
+
Requires-Dist: fair-esm>=2.0.0; extra == 'ai'
|
|
30
|
+
Requires-Dist: torch>=2.0.0; extra == 'ai'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: mypy>=1.8.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: ruff>=0.2.0; extra == 'dev'
|
|
36
|
+
Provides-Extra: hmmer
|
|
37
|
+
Requires-Dist: pyhmmer>=0.10.0; extra == 'hmmer'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# GIAE — Genome Interpretation & Annotation Engine
|
|
41
|
+
|
|
42
|
+
> **Explainability-first genome annotation. Every prediction shows its reasoning.**
|
|
43
|
+
|
|
44
|
+
[Python 3.9+](https://python.org)
|
|
45
|
+
[License: MIT](LICENSE)
|
|
46
|
+
[Version 0.2.0](pyproject.toml)
|
|
47
|
+
|
|
48
|
+
Most genome annotation tools are overconfident. PROKKA, Bakta, and RAST assign a label, hide the evidence, and give you no way to know how certain the prediction is. GIAE takes the opposite approach: every gene interpretation includes the full evidence stack, confidence score, uncertainty sources, and a ranked list of competing hypotheses.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## What Makes GIAE Different
|
|
53
|
+
|
|
54
|
+
| Feature | PROKKA / Bakta / RAST | GIAE |
|
|
55
|
+
|---|---|---|
|
|
56
|
+
| Output | Label only | Label + evidence chain + confidence score |
|
|
57
|
+
| Uncertainty | Hidden | Explicit, calibrated per gene |
|
|
58
|
+
| Conflicting evidence | Silently resolved | Flagged and reported |
|
|
59
|
+
| Unknown genes | "hypothetical protein" | Ranked as research priorities |
|
|
60
|
+
| Reasoning | Opaque | Full reasoning chain in every report |
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## 4-Layer Evidence Pipeline
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
Genome (.gb / .fasta)
|
|
68
|
+
│
|
|
69
|
+
▼
|
|
70
|
+
┌──────────────────────────────────────────────┐
|
|
71
|
+
│ 1. PROSITE Motif Scan weight: 0.80 │ 1,298 curated patterns (bundled)
|
|
72
|
+
│ 2. EBI HMMER / Pfam Domains weight: 0.90 │ Pfam via EBI web API (online)
|
|
73
|
+
│ 3. UniProt API Lookup weight: 1.00 │ Swiss-Prot reviewed entries (online)
|
|
74
|
+
│ 4. Conflict Detection │ flags when sources disagree
|
|
75
|
+
└──────────────────────────────────────────────┘
|
|
76
|
+
│
|
|
77
|
+
▼
|
|
78
|
+
Interpretation + Confidence Score + Novel Gene Report
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Confidence is computed from evidence convergence — when PROSITE, Pfam, and UniProt all agree, confidence is HIGH. When they disagree, the conflict is surfaced explicitly rather than silently resolved.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
Install from source (recommended while in active development):
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
git clone https://github.com/Ayo-Cyber/GIAE.git
|
|
91
|
+
cd GIAE
|
|
92
|
+
pip install -e ".[dev]"
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Or via pip:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pip install giae
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Requirements:** Python 3.9+, BioPython, Click, NumPy, Rich. No local databases required for the base pipeline — PROSITE (1,298 patterns) is bundled.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Quick Start
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Offline mode: PROSITE patterns only — no network, instant startup
|
|
109
|
+
# Lambda phage (92 genes) completes in ~4 seconds
|
|
110
|
+
giae interpret lambda_phage.gb --no-uniprot --no-interpro
|
|
111
|
+
|
|
112
|
+
# Full pipeline: adds EBI HMMER + UniProt API calls
|
|
113
|
+
# Lambda phage takes ~6 minutes (network latency dominates)
|
|
114
|
+
giae interpret lambda_phage.gb
|
|
115
|
+
|
|
116
|
+
# Save Markdown report to file
|
|
117
|
+
giae interpret lambda_phage.gb --output lambda_report.md
|
|
118
|
+
|
|
119
|
+
# JSON output for downstream processing
|
|
120
|
+
giae interpret lambda_phage.gb --format json --output results.json
|
|
121
|
+
|
|
122
|
+
# Large genomes: parallel workers reduce wall time significantly
|
|
123
|
+
# T4 phage (288 genes) — use --no-uniprot --no-interpro for offline speed
|
|
124
|
+
giae interpret T4.gb --workers 4 --no-uniprot --no-interpro
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
> **Runtime guide:** Offline (PROSITE only) runs in seconds for phage-sized genomes. Online mode (full pipeline) adds ~4–5 seconds per gene for API calls — plan for 5–10 minutes per phage, hours for large bacterial genomes. Use `--workers` to parallelise.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Example Output
|
|
132
|
+
|
|
133
|
+
A well-characterised gene with converging evidence:
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
Gene: J — Tail fiber protein
|
|
137
|
+
Hypothesis: Tail fiber / host receptor-binding protein
|
|
138
|
+
Confidence: HIGH (0.87)
|
|
139
|
+
Category: structural_protein
|
|
140
|
+
|
|
141
|
+
Evidence:
|
|
142
|
+
[0.82] PROSITE PS51123 — Phage tail fiber repeat
|
|
143
|
+
[0.94] Pfam PF09255 — Phage_tail_fib (e-value: 2.1e-14)
|
|
144
|
+
[0.90] UniProt P03722 — Tail fiber protein J, Lambda phage (Swiss-Prot reviewed)
|
|
145
|
+
|
|
146
|
+
Uncertainty sources: none
|
|
147
|
+
Competing hypotheses: none above threshold
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
A dark-matter gene with zero detectable signal:
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
Gene: B — hypothetical protein (147 aa)
|
|
154
|
+
Interpretation: NONE
|
|
155
|
+
Novel Gene Category: DARK MATTER
|
|
156
|
+
Priority: HIGH PRIORITY
|
|
157
|
+
Reason: No sequence homology, domains, or motifs detected
|
|
158
|
+
|
|
159
|
+
Suggested experiments:
|
|
160
|
+
• Recombinant expression and biochemical activity screening
|
|
161
|
+
• Deletion mutant phenotyping to assess essentiality
|
|
162
|
+
• Comparative genomics across related strains
|
|
163
|
+
• Structural characterization by cryo-EM
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Confidence Levels
|
|
169
|
+
|
|
170
|
+
Every prediction carries a numeric score mapped to a named level:
|
|
171
|
+
|
|
172
|
+
| Level | Score range | Meaning |
|
|
173
|
+
|-------|-------------|---------|
|
|
174
|
+
| `HIGH` | ≥ 0.80 | Multiple evidence types converge; strong homology or domain hit |
|
|
175
|
+
| `MODERATE` | 0.50 – 0.79 | Some convergence; one strong signal or moderate homology |
|
|
176
|
+
| `LOW` | 0.30 – 0.49 | Weak or single-type evidence; treat as a lead, not a conclusion |
|
|
177
|
+
| `SPECULATIVE` | < 0.30 | Minimal signal; flagged for review |
|
|
178
|
+
|
|
179
|
+
Scores are raised for evidence diversity (+0.10 for ≥2 types), strong homology (+0.05), and a high-confidence Pfam domain (+0.08). They are lowered for limited evidence (−0.10) and hypothetical homologs (−0.15), capped at 0.85 for motif-only predictions backed by a single evidence type, and multiplied by 0.80 when evidence sources conflict.
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## 7-Phage Benchmark
|
|
184
|
+
|
|
185
|
+
Benchmarked on seven classic bacteriophage genomes (offline pipeline: PROSITE + Pfam):
|
|
186
|
+
|
|
187
|
+
| Phage | Genome | Genes | Interpreted | Dark Matter |
|
|
188
|
+
|-------|--------|-------|-------------|-------------|
|
|
189
|
+
| Lambda (λ) | 48.5 kb | 92 | 45 (48.9%) | 44 (47.8%) |
|
|
190
|
+
| T7 | 39.9 kb | 56 | 19 (33.9%) | 36 (64.3%) |
|
|
191
|
+
| PhiX174 | 5.4 kb | 11 | 0 (0.0%) | 11 (100%) |
|
|
192
|
+
| Phi29 | 19.3 kb | 30 | 13 (43.3%) | 16 (53.3%) |
|
|
193
|
+
| Mu | 36.7 kb | 56 | 20 (35.7%) | 35 (62.5%) |
|
|
194
|
+
| P22 | 41.7 kb | 69 | 30 (43.5%) | 38 (55.1%) |
|
|
195
|
+
| T4 | 168.9 kb | 288 | 75 (26.0%) | 213 (74.0%) |
|
|
196
|
+
|
|
197
|
+
**Median characterization rate: 35.7%**
|
|
198
|
+
|
|
199
|
+
> **Lambda with the full 4-layer online pipeline:** 48.9% — identical to offline. The 44 dark-matter genes remain dark not because databases are small, but because these proteins genuinely have no detectable sequence-based signal. That is the correct answer.
|
|
200
|
+
|
|
201
|
+
> **PhiX174 at 0%** is expected and correct. Its proteins (viral jelly-roll β-barrel capsid, DNA pilot tube) are structurally unique folds with no sequence-recognizable motifs. Structural homology search (Foldseek/AlphaFold) is next on the roadmap.
|
|
202
|
+
|
|
203
|
+
All 7 GenBank files and benchmark reports are in [`case_studies/`](case_studies/).
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Novel Gene Discovery
|
|
208
|
+
|
|
209
|
+
Every run produces a `Novel Gene Report` — a structured research agenda for genes that couldn't be interpreted:
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
Novel Gene Discovery
|
|
213
|
+
Dark Matter: 44 (zero evidence from any source)
|
|
214
|
+
Weak Signal: 13 (confidence < 35%)
|
|
215
|
+
Conflicting: 0
|
|
216
|
+
|
|
217
|
+
Top Research Priorities:
|
|
218
|
+
1. B — 147 aa HIGH PRIORITY dark_matter
|
|
219
|
+
2. ea22 — 113 aa HIGH PRIORITY dark_matter
|
|
220
|
+
3. orf — 98 aa HIGH PRIORITY dark_matter
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Three novelty categories:
|
|
224
|
+
- **Dark matter** — zero computational evidence from any source
|
|
225
|
+
- **Weak evidence** — some hits, but confidence below threshold (< 35%)
|
|
226
|
+
- **Conflict** — two or more evidence sources contradict each other
|
|
227
|
+
|
|
228
|
+
Each candidate includes suggested experiments scaled to protein length and category.
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Python API
|
|
233
|
+
|
|
234
|
+
Use GIAE programmatically for batch processing or integration into pipelines:
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from giae.parsers.genbank import parse_genbank
|
|
238
|
+
from giae.engine.interpreter import Interpreter
|
|
239
|
+
|
|
240
|
+
# Load genome
|
|
241
|
+
genome = parse_genbank("lambda_phage.gb")
|
|
242
|
+
|
|
243
|
+
# Run offline pipeline (fast, no network)
|
|
244
|
+
interpreter = Interpreter(use_uniprot=False, use_interpro=False)
|
|
245
|
+
summary = interpreter.interpret_genome(genome)
|
|
246
|
+
|
|
247
|
+
print(f"Interpreted {summary.interpreted_genes}/{summary.total_genes} genes")
|
|
248
|
+
print(f"Dark matter: {summary.novel_gene_report.dark_matter_count}")
|
|
249
|
+
|
|
250
|
+
# Inspect individual results
|
|
251
|
+
for result in summary.results:
|
|
252
|
+
if result.interpretation:
|
|
253
|
+
print(result.interpretation.get_explanation())
|
|
254
|
+
|
|
255
|
+
# Quick single-sequence interpretation
|
|
256
|
+
interp = interpreter.quick_interpret("MKVLIFFVIALFSSATAAF...", sequence_type="protein")
|
|
257
|
+
print(interp)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## CLI Reference
|
|
263
|
+
|
|
264
|
+
```
|
|
265
|
+
Usage: giae [OPTIONS] COMMAND [ARGS]...
|
|
266
|
+
|
|
267
|
+
Commands:
|
|
268
|
+
interpret Interpret a genome file (.gb or .fasta)
|
|
269
|
+
db Database management (download databases, check status)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### `giae interpret`
|
|
273
|
+
|
|
274
|
+
```
|
|
275
|
+
Options:
|
|
276
|
+
--output, -o PATH Write report to file (default: stdout)
|
|
277
|
+
--format, -f [report|json] Output format (default: report)
|
|
278
|
+
--workers, -w INT Parallel workers, 1–16 (default: 1)
|
|
279
|
+
--no-uniprot Skip UniProt API (offline mode)
|
|
280
|
+
--no-interpro Skip EBI HMMER domain search (offline mode)
|
|
281
|
+
--verbose, -v Show pipeline details
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### `giae db`
|
|
285
|
+
|
|
286
|
+
Manages optional local databases for the plugin layer. The base pipeline (PROSITE) needs no setup — databases here unlock the local HMMER and BLAST plugins.
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
# Check what's installed
|
|
290
|
+
giae db status
|
|
291
|
+
|
|
292
|
+
# Download PROSITE (latest from ExPASy — updates the bundled copy)
|
|
293
|
+
giae db download prosite
|
|
294
|
+
|
|
295
|
+
# Download SwissProt for local BLAST (requires BLAST+ installed)
|
|
296
|
+
giae db download swissprot
|
|
297
|
+
|
|
298
|
+
# Download Pfam for local HMMER (requires HMMER3 installed)
|
|
299
|
+
giae db download pfam
|
|
300
|
+
|
|
301
|
+
# Force re-download
|
|
302
|
+
giae db download prosite --force
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
> **Do you need to run `giae db`?** No, for the base pipeline. PROSITE (1,298 patterns) is bundled. `giae db` is only needed if you want local BLAST or HMMER plugins, which bypass the EBI web API with locally-installed tools and larger databases.
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## Project Structure
|
|
310
|
+
|
|
311
|
+
```
|
|
312
|
+
GIAE/
|
|
313
|
+
├── src/giae/
|
|
314
|
+
│ ├── analysis/ # Evidence extraction modules
|
|
315
|
+
│ │ ├── motif.py # PROSITE pattern scanning
|
|
316
|
+
│ │ ├── prosite.py # PROSITE database parser
|
|
317
|
+
│ │ ├── uniprot.py # UniProt REST API client
|
|
318
|
+
│ │ ├── interpro.py # EBI HMMER / InterPro client
|
|
319
|
+
│ │ ├── hmmer.py # Local HMMER plugin
|
|
320
|
+
│ │ ├── blast_local.py # Local BLAST plugin
|
|
321
|
+
│ │ └── ai.py # ESM-2 embedding plugin
|
|
322
|
+
│ ├── cli/
|
|
323
|
+
│ │ ├── main.py # CLI entrypoint
|
|
324
|
+
│ │ └── db.py # Database management commands
|
|
325
|
+
│ ├── engine/
|
|
326
|
+
│ │ ├── interpreter.py # Main orchestrator
|
|
327
|
+
│ │ ├── aggregator.py # Evidence aggregation & weighting
|
|
328
|
+
│ │ ├── hypothesis.py # Hypothesis generation
|
|
329
|
+
│ │ ├── confidence.py # Confidence scoring
|
|
330
|
+
│ │ ├── conflict.py # Conflict detection
|
|
331
|
+
│ │ ├── novelty.py # Novel gene discovery & ranking
|
|
332
|
+
│ │ └── plugin.py # Plugin manager
|
|
333
|
+
│ ├── models/ # Core data models
|
|
334
|
+
│ │ ├── genome.py
|
|
335
|
+
│ │ ├── gene.py
|
|
336
|
+
│ │ ├── protein.py
|
|
337
|
+
│ │ ├── evidence.py
|
|
338
|
+
│ │ └── interpretation.py
|
|
339
|
+
│ ├── output/
|
|
340
|
+
│ │ ├── report.py # Markdown report generator
|
|
341
|
+
│ │ ├── reasoning.py # Reasoning chain formatter
|
|
342
|
+
│ │ └── json_export.py # JSON serialization
|
|
343
|
+
│ └── parsers/ # FASTA / GenBank parsers
|
|
344
|
+
├── tests/ # pytest test suite
|
|
345
|
+
├── case_studies/ # 7 phage GenBank files + benchmark reports
|
|
346
|
+
├── data/prosite/ # Bundled PROSITE database (1,298 patterns)
|
|
347
|
+
├── pyproject.toml
|
|
348
|
+
└── QUICKSTART.md
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## Plugin System
|
|
354
|
+
|
|
355
|
+
GIAE has a plugin architecture for optional heavy-weight evidence sources. Plugins are auto-detected at startup — if the required binary or database path doesn't exist, the plugin is silently skipped. The base pipeline always runs.
|
|
356
|
+
|
|
357
|
+
| Plugin | Requirement | Evidence Type | Weight |
|
|
358
|
+
|--------|-------------|---------------|--------|
|
|
359
|
+
| `HmmerPlugin` | HMMER3 + `~/.giae/hmmer/pfam.hmm` | `DOMAIN_HIT` | 0.90 |
|
|
360
|
+
| `BlastLocalPlugin` | BLAST+ + `~/.giae/blast/swissprot` | `BLAST_HOMOLOGY` | 1.00 |
|
|
361
|
+
| `EsmPlugin` | PyTorch + ESM-2 model | `SEQUENCE_FEATURE` | 0.50 |
|
|
362
|
+
|
|
363
|
+
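Download the plugin databases with `giae db download pfam` / `giae db download swissprot`. The Python-side plugin dependencies are provided by the optional extras: `pip install "giae[hmmer]"` (pyhmmer) and `pip install "giae[ai]"` (PyTorch + ESM).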
|
|
364
|
+
|
|
365
|
+
---
|
|
366
|
+
|
|
367
|
+
## Roadmap
|
|
368
|
+
|
|
369
|
+
- [ ] **Foldseek / AlphaFold structural search** — `STRUCTURAL_HOMOLOGY` evidence (already in codebase); resolves PhiX174-class cases where sequence-based methods fail completely
|
|
370
|
+
- [ ] **EBI BLAST async API** — replace text-based UniProt search with real sequence similarity search
|
|
371
|
+
- [ ] **Interactive HTML reports** — evidence network visualization across all genes
|
|
372
|
+
- [ ] **Database caching** — avoid re-running API calls on identical sequences; essential for large genomes
|
|
373
|
+
- [ ] **Bacterial genome scaling** — currently validated on phages; next target is 4–6 Mb bacterial genomes
|
|
374
|
+
- [ ] **Comparison mode** — diff two genome interpretations side by side
|
|
375
|
+
|
|
376
|
+
---
|
|
377
|
+
|
|
378
|
+
## Contributing
|
|
379
|
+
|
|
380
|
+
Issues, PRs, and genome challenges welcome.
|
|
381
|
+
|
|
382
|
+
```bash
|
|
383
|
+
# Run tests
|
|
384
|
+
pytest tests/ -v
|
|
385
|
+
|
|
386
|
+
# Run with coverage
|
|
387
|
+
pytest tests/ --cov=giae --cov-report=term-missing
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
---
|
|
391
|
+
|
|
392
|
+
## Citation
|
|
393
|
+
|
|
394
|
+
If you use GIAE in research, please cite:
|
|
395
|
+
|
|
396
|
+
```
|
|
397
|
+
GIAE — Genome Interpretation & Annotation Engine (v0.2.0)
|
|
398
|
+
https://github.com/Ayo-Cyber/GIAE
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
A formal publication is in preparation.
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
|
|
405
|
+
## License
|
|
406
|
+
|
|
407
|
+
MIT — see [LICENSE](LICENSE).
|
|
408
|
+
|
|
409
|
+
---
|
|
410
|
+
|
|
411
|
+
*GIAE v0.2.0 — Benchmarked on Lambda, T7, PhiX174, Phi29, Mu, P22, and T4.*
|