debase 0.4.1__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase-0.4.2/.gitignore +177 -0
- {debase-0.4.1/src/debase.egg-info → debase-0.4.2}/PKG-INFO +1 -1
- debase-0.4.2/environment.yml +21 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase/_version.py +1 -1
- {debase-0.4.1 → debase-0.4.2}/src/debase/enzyme_lineage_extractor.py +14 -8
- {debase-0.4.1 → debase-0.4.2}/src/debase/lineage_format.py +335 -56
- {debase-0.4.1 → debase-0.4.2}/src/debase/reaction_info_extractor.py +60 -32
- {debase-0.4.1 → debase-0.4.2}/src/debase/substrate_scope_extractor.py +366 -93
- {debase-0.4.1 → debase-0.4.2}/src/debase/wrapper.py +37 -11
- {debase-0.4.1 → debase-0.4.2/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.4.1 → debase-0.4.2}/src/debase.egg-info/SOURCES.txt +3 -0
- debase-0.4.2/src/debase.egg-info/dependency_links.txt +1 -0
- {debase-0.4.1 → debase-0.4.2}/LICENSE +0 -0
- {debase-0.4.1 → debase-0.4.2}/MANIFEST.in +0 -0
- {debase-0.4.1 → debase-0.4.2}/README.md +0 -0
- {debase-0.4.1 → debase-0.4.2}/pyproject.toml +0 -0
- {debase-0.4.1 → debase-0.4.2}/setup.cfg +0 -0
- {debase-0.4.1 → debase-0.4.2}/setup.py +0 -0
- /debase-0.4.1/src/debase.egg-info/dependency_links.txt → /debase-0.4.2/src/__init__.py +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase/__init__.py +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase/__main__.py +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase/build_db.py +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.4.1 → debase-0.4.2}/src/debase.egg-info/top_level.txt +0 -0
debase-0.4.2/.gitignore
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
2
|
+
__pycache__/
|
3
|
+
*.py[cod]
|
4
|
+
*$py.class
|
5
|
+
|
6
|
+
# C extensions
|
7
|
+
*.so
|
8
|
+
|
9
|
+
# Distribution / packaging
|
10
|
+
.Python
|
11
|
+
build/
|
12
|
+
develop-eggs/
|
13
|
+
dist/
|
14
|
+
downloads/
|
15
|
+
eggs/
|
16
|
+
.eggs/
|
17
|
+
lib/
|
18
|
+
lib64/
|
19
|
+
parts/
|
20
|
+
sdist/
|
21
|
+
var/
|
22
|
+
wheels/
|
23
|
+
share/python-wheels/
|
24
|
+
*.egg-info/
|
25
|
+
.installed.cfg
|
26
|
+
*.egg
|
27
|
+
MANIFEST
|
28
|
+
|
29
|
+
# PyInstaller
|
30
|
+
*.manifest
|
31
|
+
*.spec
|
32
|
+
|
33
|
+
# Installer logs
|
34
|
+
pip-log.txt
|
35
|
+
pip-delete-this-directory.txt
|
36
|
+
|
37
|
+
# Unit test / coverage reports
|
38
|
+
htmlcov/
|
39
|
+
.tox/
|
40
|
+
.nox/
|
41
|
+
.coverage
|
42
|
+
.coverage.*
|
43
|
+
.cache
|
44
|
+
nosetests.xml
|
45
|
+
coverage.xml
|
46
|
+
*.cover
|
47
|
+
*.py,cover
|
48
|
+
.hypothesis/
|
49
|
+
.pytest_cache/
|
50
|
+
cover/
|
51
|
+
|
52
|
+
# Jupyter Notebook
|
53
|
+
.ipynb_checkpoints
|
54
|
+
|
55
|
+
# IPython
|
56
|
+
profile_default/
|
57
|
+
ipython_config.py
|
58
|
+
|
59
|
+
# pyenv
|
60
|
+
.python-version
|
61
|
+
|
62
|
+
# pipenv
|
63
|
+
Pipfile.lock
|
64
|
+
|
65
|
+
# poetry
|
66
|
+
poetry.lock
|
67
|
+
|
68
|
+
# pdm
|
69
|
+
.pdm.toml
|
70
|
+
|
71
|
+
# PEP 582
|
72
|
+
__pypackages__/
|
73
|
+
|
74
|
+
# Celery stuff
|
75
|
+
celerybeat-schedule
|
76
|
+
celerybeat.pid
|
77
|
+
|
78
|
+
# SageMath parsed files
|
79
|
+
*.sage.py
|
80
|
+
|
81
|
+
# Environments
|
82
|
+
.env
|
83
|
+
.venv
|
84
|
+
env/
|
85
|
+
venv/
|
86
|
+
ENV/
|
87
|
+
env.bak/
|
88
|
+
venv.bak/
|
89
|
+
|
90
|
+
# Spyder project settings
|
91
|
+
.spyderproject
|
92
|
+
.spyproject
|
93
|
+
|
94
|
+
# Rope project settings
|
95
|
+
.ropeproject
|
96
|
+
|
97
|
+
# mkdocs documentation
|
98
|
+
/site
|
99
|
+
|
100
|
+
# mypy
|
101
|
+
.mypy_cache/
|
102
|
+
.dmypy.json
|
103
|
+
dmypy.json
|
104
|
+
|
105
|
+
# Pyre type checker
|
106
|
+
.pyre/
|
107
|
+
|
108
|
+
# pytype static type analyzer
|
109
|
+
.pytype/
|
110
|
+
|
111
|
+
# Cython debug symbols
|
112
|
+
cython_debug/
|
113
|
+
|
114
|
+
# PyCharm
|
115
|
+
.idea/
|
116
|
+
|
117
|
+
# VS Code
|
118
|
+
.vscode/
|
119
|
+
|
120
|
+
# macOS
|
121
|
+
.DS_Store
|
122
|
+
.AppleDouble
|
123
|
+
.LSOverride
|
124
|
+
|
125
|
+
# Windows
|
126
|
+
Thumbs.db
|
127
|
+
Thumbs.db:encryptable
|
128
|
+
ehthumbs.db
|
129
|
+
ehthumbs_vista.db
|
130
|
+
*.stackdump
|
131
|
+
[Dd]esktop.ini
|
132
|
+
$RECYCLE.BIN/
|
133
|
+
*.cab
|
134
|
+
*.msi
|
135
|
+
*.msix
|
136
|
+
*.msm
|
137
|
+
*.msp
|
138
|
+
*.lnk
|
139
|
+
|
140
|
+
# Linux
|
141
|
+
*~
|
142
|
+
|
143
|
+
# Temporary files
|
144
|
+
*.tmp
|
145
|
+
*.temp
|
146
|
+
*.log
|
147
|
+
.temp_*/
|
148
|
+
.cache/
|
149
|
+
|
150
|
+
# DEBase specific
|
151
|
+
enzyme_pipeline*.log
|
152
|
+
temp_merged_input.csv
|
153
|
+
*.egg-info/
|
154
|
+
|
155
|
+
# Project data and examples
|
156
|
+
data/
|
157
|
+
examples/*
|
158
|
+
# Keep test.csv as example output
!examples/test.csv
|
159
|
+
|
160
|
+
# Cache files
|
161
|
+
*.pkl
|
162
|
+
*_cache.pkl
|
163
|
+
|
164
|
+
# Large database files
|
165
|
+
*.db
|
166
|
+
|
167
|
+
# PDFs and Excel files
|
168
|
+
*.pdf
|
169
|
+
*.xlsx
|
170
|
+
|
171
|
+
# Backup files
|
172
|
+
*_backup.py
|
173
|
+
lineage_format_backup.py
|
174
|
+
|
175
|
+
# Temporary directories
|
176
|
+
.temp_*
|
177
|
+
enzyme_analysis_*
|
@@ -0,0 +1,21 @@
|
|
1
|
+
name: debase
|
2
|
+
channels:
|
3
|
+
- conda-forge
|
4
|
+
- defaults
|
5
|
+
dependencies:
|
6
|
+
- python=3.9
|
7
|
+
- pandas>=1.0.0
|
8
|
+
- numpy>=1.19.0
|
9
|
+
- matplotlib>=3.3.0
|
10
|
+
- seaborn>=0.11.0
|
11
|
+
- jupyter>=1.0.0
|
12
|
+
- jupyterlab>=3.0.0
|
13
|
+
- openpyxl>=3.0.0
|
14
|
+
- biopython>=1.78
|
15
|
+
- requests>=2.25.0
|
16
|
+
- tqdm>=4.60.0
|
17
|
+
- rdkit>=2020.03.1
|
18
|
+
- pip
|
19
|
+
- pip:
|
20
|
+
    - PyMuPDF>=1.18.0
|
21
|
+
    - google-generativeai>=0.3.0
|
@@ -645,11 +645,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
|
|
645
645
|
came from which parent and what mutations were introduced).
|
646
646
|
|
647
647
|
Respond with a JSON array of objects, each containing:
|
648
|
-
- "location": the identifier (e.g. "Table S1", "Figure 2B", "
|
648
|
+
- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
|
649
649
|
- "type": one of "table", "figure", "text", "section"
|
650
650
|
- "confidence": your confidence score (0-100) that this location contains lineage data
|
651
651
|
- "reason": brief explanation of why this location likely contains lineage
|
652
652
|
|
653
|
+
IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
|
654
|
+
|
653
655
|
Order by confidence score (highest first). Tables showing complete variant lineages or
|
654
656
|
mutation lists should be ranked higher than figures showing complete variant lineages.
|
655
657
|
Text sections are used when no suitable tables/figures exist.
|
@@ -747,7 +749,7 @@ def identify_campaigns(
|
|
747
749
|
debug_dir: str | Path | None = None,
|
748
750
|
) -> List[Campaign]:
|
749
751
|
"""Identify distinct directed evolution campaigns in the manuscript."""
|
750
|
-
prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text
|
752
|
+
prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
|
751
753
|
campaigns_data: List[dict] = []
|
752
754
|
try:
|
753
755
|
campaigns_data = generate_json_with_retry(
|
@@ -825,7 +827,7 @@ def identify_evolution_locations(
|
|
825
827
|
|
826
828
|
# Include TOC before the main text
|
827
829
|
combined_text = toc_text + text if toc_text else text
|
828
|
-
prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
|
830
|
+
prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
|
829
831
|
locs: List[dict] = []
|
830
832
|
try:
|
831
833
|
locs = generate_json_with_retry(
|
@@ -1306,7 +1308,7 @@ def get_lineage(
|
|
1306
1308
|
5. Return both variants and campaigns.
|
1307
1309
|
"""
|
1308
1310
|
# First, identify campaigns in the manuscript
|
1309
|
-
campaigns = identify_campaigns(full_text
|
1311
|
+
campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
|
1310
1312
|
|
1311
1313
|
if campaigns:
|
1312
1314
|
log.info(f"Identified {len(campaigns)} distinct campaigns")
|
@@ -1364,7 +1366,7 @@ def get_lineage(
|
|
1364
1366
|
context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
|
1365
1367
|
locations_with_context.append({
|
1366
1368
|
'location': loc,
|
1367
|
-
'context': context_text
|
1369
|
+
'context': context_text # Full extracted context
|
1368
1370
|
})
|
1369
1371
|
|
1370
1372
|
# For each campaign, ask Gemini to select the best location
|
@@ -1554,13 +1556,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
|
|
1554
1556
|
Look for table of contents entries or section listings that mention sequences.
|
1555
1557
|
Return a JSON array where each element has:
|
1556
1558
|
- "section": the section heading or description
|
1557
|
-
- "page": the page number
|
1559
|
+
- "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
|
1558
1560
|
|
1559
1561
|
Focus on:
|
1560
1562
|
- Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
|
1561
|
-
-
|
1563
|
+
- For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
|
1562
1564
|
- Prioritize sections that mention "protein" or "amino acid" sequences
|
1563
1565
|
|
1566
|
+
CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
|
1567
|
+
- Correct: "53", "S12", "147"
|
1568
|
+
- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
|
1569
|
+
|
1564
1570
|
Return [] if no sequence sections are found.
|
1565
1571
|
Absolutely don't include nucleotide or primer sequences; it is better to return nothing than an incomplete sequence. Use your best judgement.
|
1566
1572
|
|
@@ -1572,7 +1578,7 @@ TEXT (truncated):
|
|
1572
1578
|
|
1573
1579
|
def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
|
1574
1580
|
"""Ask Gemini for promising places to look for sequences."""
|
1575
|
-
prompt = _SEQ_LOC_PROMPT.format(chunk=text
|
1581
|
+
prompt = _SEQ_LOC_PROMPT.format(chunk=text)
|
1576
1582
|
try:
|
1577
1583
|
locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
|
1578
1584
|
return locs if isinstance(locs, list) else []
|