debase 0.4.1__tar.gz → 0.4.3__tar.gz
This diff shows the content changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- debase-0.4.3/.gitignore +177 -0
- {debase-0.4.1/src/debase.egg-info → debase-0.4.3}/PKG-INFO +1 -1
- debase-0.4.3/environment.yml +21 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase/_version.py +1 -1
- {debase-0.4.1 → debase-0.4.3}/src/debase/cleanup_sequence.py +151 -1
- {debase-0.4.1 → debase-0.4.3}/src/debase/enzyme_lineage_extractor.py +114 -20
- {debase-0.4.1 → debase-0.4.3}/src/debase/lineage_format.py +335 -56
- {debase-0.4.1 → debase-0.4.3}/src/debase/reaction_info_extractor.py +60 -32
- {debase-0.4.1 → debase-0.4.3}/src/debase/substrate_scope_extractor.py +366 -93
- {debase-0.4.1 → debase-0.4.3}/src/debase/wrapper.py +37 -11
- {debase-0.4.1 → debase-0.4.3/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/SOURCES.txt +3 -0
- debase-0.4.3/src/debase.egg-info/dependency_links.txt +1 -0
- {debase-0.4.1 → debase-0.4.3}/LICENSE +0 -0
- {debase-0.4.1 → debase-0.4.3}/MANIFEST.in +0 -0
- {debase-0.4.1 → debase-0.4.3}/README.md +0 -0
- {debase-0.4.1 → debase-0.4.3}/pyproject.toml +0 -0
- {debase-0.4.1 → debase-0.4.3}/setup.cfg +0 -0
- {debase-0.4.1 → debase-0.4.3}/setup.py +0 -0
- /debase-0.4.1/src/debase.egg-info/dependency_links.txt → /debase-0.4.3/src/__init__.py +0 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase/__init__.py +0 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase/__main__.py +0 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase/build_db.py +0 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.4.1 → debase-0.4.3}/src/debase.egg-info/top_level.txt +0 -0
debase-0.4.3/.gitignore
ADDED
@@ -0,0 +1,177 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+Pipfile.lock
+
+# poetry
+poetry.lock
+
+# pdm
+.pdm.toml
+
+# PEP 582
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+.idea/
+
+# VS Code
+.vscode/
+
+# macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Windows
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+*.stackdump
+[Dd]esktop.ini
+$RECYCLE.BIN/
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+*.lnk
+
+# Linux
+*~
+
+# Temporary files
+*.tmp
+*.temp
+*.log
+.temp_*/
+.cache/
+
+# DEBase specific
+enzyme_pipeline*.log
+temp_merged_input.csv
+*.egg-info/
+
+# Project data and examples
+data/
+examples/
+!examples/test.csv  # Keep test.csv as example output
+
+# Cache files
+*.pkl
+*_cache.pkl
+
+# Large database files
+*.db
+
+# PDFs and Excel files
+*.pdf
+*.xlsx
+
+# Backup files
+*_backup.py
+lineage_format_backup.py
+
+# Temporary directories
+.temp_*
+enzyme_analysis_*
debase-0.4.3/environment.yml
ADDED
@@ -0,0 +1,21 @@
+name: debase
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.9
+  - pandas>=1.0.0
+  - numpy>=1.19.0
+  - matplotlib>=3.3.0
+  - seaborn>=0.11.0
+  - jupyter>=1.0.0
+  - jupyterlab>=3.0.0
+  - openpyxl>=3.0.0
+  - biopython>=1.78
+  - requests>=2.25.0
+  - tqdm>=4.60.0
+  - rdkit>=2020.03.1
+  - pip
+  - pip:
+    - PyMuPDF>=1.18.0
+    - google-generativeai>=0.3.0
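Note: the new environment file pins Python 3.9 and pulls the two PDF/LLM dependencies in through pip. A minimal sketch for inspecting it programmatically, assuming PyYAML is available (it is not itself listed in the file):

import yaml

with open("environment.yml") as fh:
    env = yaml.safe_load(fh)

print(env["name"])  # debase
conda_deps = [d for d in env["dependencies"] if isinstance(d, str)]
pip_deps = next((d["pip"] for d in env["dependencies"] if isinstance(d, dict)), [])
print(f"{len(conda_deps)} conda packages, {len(pip_deps)} pip packages")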
{debase-0.4.1 → debase-0.4.3}/src/debase/cleanup_sequence.py
CHANGED
@@ -11,6 +11,7 @@ Usage:
 
 import argparse
 import logging
+import os
 import re
 import sys
 from dataclasses import dataclass, field
@@ -19,11 +20,20 @@ from typing import Dict, List, Optional, Set, Tuple, Union
 
 import pandas as pd
 
+try:
+    import google.generativeai as genai  # type: ignore
+    GEMINI_OK = True
+except ImportError:  # pragma: no cover
+    GEMINI_OK = False
+
 
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
 
+# Gemini API configuration
+GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
+
 # Configure module logger
 log = logging.getLogger(__name__)
 
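Note: the two hunks above make google-generativeai a soft dependency — the import is wrapped in try/except and every Gemini call is gated on both GEMINI_OK and the GEMINI_API_KEY environment variable. A condensed sketch of the same gating pattern (the helper function is ours, for illustration only):

import logging
import os

log = logging.getLogger(__name__)

try:
    import google.generativeai as genai  # optional dependency
    GEMINI_OK = True
except ImportError:
    GEMINI_OK = False

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

def ask_gemini(prompt):
    # Degrade gracefully instead of raising when the optional stack is absent.
    if not (GEMINI_OK and GEMINI_API_KEY):
        log.warning("Gemini unavailable; skipping.")
        return None
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel("gemini-1.5-flash")
    return model.generate_content(prompt).text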
@@ -565,7 +575,136 @@ class SequenceGenerator:
         return None
 
 
-# === 7.
+# === 7. GEMINI PARENT IDENTIFICATION === ------------------------------------
+
+def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
+    """Use Gemini API to identify parent enzymes for entries with missing parent information."""
+    if not GEMINI_OK:
+        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
+        return df
+
+    if not GEMINI_API_KEY:
+        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
+        return df
+
+    try:
+        genai.configure(api_key=GEMINI_API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+    except Exception as e:
+        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
+        return df
+
+    # Find entries with empty sequences but missing parent information
+    entries_needing_parents = []
+    for idx, row in df.iterrows():
+        protein_seq = str(row.get("protein_sequence", "")).strip()
+        parent_id = str(row.get("parent_enzyme_id", "")).strip()
+
+        # Only process entries that have empty sequences AND no parent info
+        if (not protein_seq or protein_seq == "nan") and (not parent_id or parent_id == "nan"):
+            enzyme_id = str(row.get("enzyme_id", ""))
+            campaign_id = str(row.get("campaign_id", ""))
+            generation = str(row.get("generation", ""))
+
+            entries_needing_parents.append({
+                "idx": idx,
+                "enzyme_id": enzyme_id,
+                "campaign_id": campaign_id,
+                "generation": generation
+            })
+
+    if not entries_needing_parents:
+        log.info("No entries need parent identification from Gemini")
+        return df
+
+    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
+
+    # Create a lookup of all available enzyme IDs for context
+    available_enzymes = {}
+    for idx, row in df.iterrows():
+        enzyme_id = str(row.get("enzyme_id", ""))
+        campaign_id = str(row.get("campaign_id", ""))
+        protein_seq = str(row.get("protein_sequence", "")).strip()
+        generation = str(row.get("generation", ""))
+
+        if enzyme_id and enzyme_id != "nan":
+            available_enzymes[enzyme_id] = {
+                "campaign_id": campaign_id,
+                "has_sequence": bool(protein_seq and protein_seq != "nan"),
+                "generation": generation
+            }
+
+    identified_count = 0
+    for entry in entries_needing_parents:
+        enzyme_id = entry["enzyme_id"]
+        campaign_id = entry["campaign_id"]
+        generation = entry["generation"]
+
+        # Create context for Gemini
+        context_info = []
+        context_info.append(f"Enzyme ID: {enzyme_id}")
+        context_info.append(f"Campaign ID: {campaign_id}")
+        if generation:
+            context_info.append(f"Generation: {generation}")
+
+        # Add available enzymes from the same campaign for context
+        campaign_enzymes = []
+        for enz_id, enz_data in available_enzymes.items():
+            if enz_data["campaign_id"] == campaign_id:
+                status = "with sequence" if enz_data["has_sequence"] else "without sequence"
+                gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
+                campaign_enzymes.append(f"  - {enz_id} {status} {gen_info}")
+
+        if campaign_enzymes:
+            context_info.append("Available enzymes in same campaign:")
+            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context
+
+        context_text = "\n".join(context_info)
+
+        prompt = f"""
+Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
+
+{context_text}
+
+This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
+
+Please provide your response in this format:
+Parent: [parent_enzyme_id or "Unknown"]
+
+If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
+"""
+
+        try:
+            response = model.generate_content(prompt)
+            response_text = response.text.strip()
+
+            # Parse the response
+            parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
+
+            if parent_match:
+                parent = parent_match.group(1).strip()
+                if parent and parent != "Unknown" and parent != "No parent identified":
+                    # Verify the parent exists in our available enzymes
+                    if parent in available_enzymes:
+                        df.at[entry["idx"], "parent_enzyme_id"] = parent
+                        identified_count += 1
+                        log.info(f"Identified parent for {enzyme_id}: {parent}")
+                    else:
+                        log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
+
+        except Exception as e:
+            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
+            continue
+
+    if identified_count > 0:
+        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
+    else:
+        log.info("No parent enzymes were identified using Gemini API")
+
+    return df
+
+
+# === 8. MAIN PROCESSOR === ---------------------------------------------------
 
 class SequenceProcessor:
     """Main processor for handling the complete workflow."""
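Note: since identify_parents_with_gemini takes and returns a plain DataFrame, it can be exercised in isolation. A toy example — the enzyme IDs and sequence below are invented; only the column names come from the code above. With GEMINI_API_KEY unset it logs a warning and returns the frame unchanged:

import pandas as pd

df = pd.DataFrame([
    {"enzyme_id": "WT", "campaign_id": "c1", "generation": "0",
     "protein_sequence": "MKVL...", "parent_enzyme_id": ""},
    {"enzyme_id": "Round2-A7", "campaign_id": "c1", "generation": "2",
     "protein_sequence": "", "parent_enzyme_id": ""},
])

df = identify_parents_with_gemini(df)
print(df[["enzyme_id", "parent_enzyme_id"]])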
@@ -866,6 +1005,17 @@ class SequenceProcessor:
         self.process_remaining()
         self.backward_pass()
 
+        # Use Gemini to identify parent enzymes for entries with missing sequences
+        log.info(f"Identifying parents with Gemini for campaign: {campaign_id}")
+        self.df = identify_parents_with_gemini(self.df)
+
+        # Rebuild relationships after parent identification
+        self.generator = SequenceGenerator(self.df)
+
+        # Try to fill sequences again after parent identification
+        log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
+        self.process_remaining()
+
         # Update the original dataframe with results
         original_df.loc[campaign_mask, :] = self.df
 
{debase-0.4.1 → debase-0.4.3}/src/debase/enzyme_lineage_extractor.py
CHANGED
@@ -142,21 +142,36 @@ def extract_text(pdf_path: str | Path | bytes) -> str:
 
 
 def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
-    """Extract figure/table captions
+    """Extract ALL figure/table captions with extensive surrounding context.
 
     The function scans every text line on every page and keeps lines whose first
     token matches `_CAPTION_PREFIX_RE`. This covers labels such as:
-      * Fig. 1, Figure 2A,
+      * Fig. 1, Figure 2A, Figure 2B, Figure 2C (ALL sub-captions)
       * Table S1, Table 4, Scheme 2, Chart 1B
-      * Supplementary Fig.
+      * Supplementary Fig. S5A, S5B, S5C (ALL variations)
+
+    For SI documents, includes extensive context since understanding what each
+    section contains is crucial for accurate location identification.
     """
 
     doc = _open_doc(pdf_path)
     captions: list[str] = []
     try:
-        for page in doc:
+        for page_num, page in enumerate(doc):
             page_dict = page.get_text("dict")
+
+            # Get all text blocks on this page for broader context
+            page_text_blocks = []
             for block in page_dict.get("blocks", []):
+                block_text = ""
+                for line in block.get("lines", []):
+                    text_line = "".join(span["text"] for span in line.get("spans", []))
+                    if text_line.strip():
+                        block_text += text_line.strip() + " "
+                if block_text.strip():
+                    page_text_blocks.append(block_text.strip())
+
+            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                 # Get all lines in this block
                 block_lines = []
                 for line in block.get("lines", []):
@@ -166,21 +181,94 @@ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -
                 # Check if any line starts with a caption prefix
                 for i, line in enumerate(block_lines):
                     if _CAPTION_PREFIX_RE.match(line):
-
-
+                        context_parts = []
+
+                        # Add page context for SI documents (more critical there)
+                        context_parts.append(f"Page {page_num + 1}")
+
+                        # Add extensive context before the caption (5-7 lines for SI context)
+                        context_before = []
+
+                        # First try to get context from current block
+                        for k in range(max(0, i-7), i):
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_before.append(block_lines[k])
+
+                        # If not enough context, look at previous text blocks on the page
+                        if len(context_before) < 3 and block_idx > 0:
+                            prev_block_text = page_text_blocks[block_idx - 1] if block_idx < len(page_text_blocks) else ""
+                            if prev_block_text:
+                                # Get last few sentences from previous block
+                                sentences = prev_block_text.split('. ')
+                                context_before = sentences[-2:] + context_before if len(sentences) > 1 else [prev_block_text] + context_before
+
+                        if context_before:
+                            # Include more extensive context for better understanding
+                            context_text = " ".join(context_before[-5:])  # Last 5 lines/sentences of context
+                            context_parts.append("Context: " + context_text)
+
+                        # Extract the COMPLETE caption including all sub-parts
                         caption_parts = [line]
-
+                        j = i + 1
+
+                        # Continue collecting caption text until we hit a clear break
+                        while j < len(block_lines):
                             next_line = block_lines[j]
-
-
-
+
+                            # Stop if we hit an empty line followed by non-caption text
+                            if not next_line:
+                                # Check if the line after empty is a new caption
+                                if j + 1 < len(block_lines) and _CAPTION_PREFIX_RE.match(block_lines[j + 1]):
+                                    break
+                                # If next non-empty line is not a caption, continue collecting
+                                elif j + 1 < len(block_lines):
+                                    j += 1
+                                    continue
+                                else:
+                                    break
+
+                            # Stop if we hit a new caption
                             if _CAPTION_PREFIX_RE.match(next_line):
                                 break
+
+                            # Include this line as part of the caption
                             caption_parts.append(next_line)
+                            j += 1
 
-                        # Join the parts
+                        # Join the caption parts
                         full_caption = " ".join(caption_parts)
-
+                        context_parts.append("Caption: " + full_caption)
+
+                        # Add extensive context after the caption (especially important for SI)
+                        context_after = []
+
+                        # Look for descriptive text following the caption
+                        for k in range(j, min(len(block_lines), j + 10)):  # Look ahead up to 10 lines
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_after.append(block_lines[k])
+
+                        # If not enough context, look at next text blocks
+                        if len(context_after) < 3 and block_idx + 1 < len(page_text_blocks):
+                            next_block_text = page_text_blocks[block_idx + 1]
+                            if next_block_text:
+                                # Get first few sentences from next block
+                                sentences = next_block_text.split('. ')
+                                context_after.extend(sentences[:3] if len(sentences) > 1 else [next_block_text])
+
+                        if context_after:
+                            # Include extensive following context
+                            following_text = " ".join(context_after[:7])  # First 7 lines of following context
+                            context_parts.append("Following: " + following_text)
+
+                        # For SI documents, add section context if this appears to be a section header
+                        if any(keyword in full_caption.lower() for keyword in ['supplementary', 'supporting', 'si ', 's1', 's2', 's3']):
+                            context_parts.append("SI_SECTION: This appears to be supplementary material content")
+
+                        # Combine all parts with proper separation
+                        full_caption_with_context = " | ".join(context_parts)
+                        captions.append(full_caption_with_context)
     finally:
         doc.close()
 
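Note: each harvested caption now leaves extract_captions as a single pipe-delimited record rather than a bare label line. A sketch of the shape of one emitted entry, per the joins above (values illustrative, not real output):

context_parts = [
    "Page 12",
    "Context: ...last five lines before the label...",
    "Caption: Figure 2A. Lineage of evolved variants ...",
    "Following: ...up to seven lines of trailing description...",
]
print(" | ".join(context_parts))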
@@ -645,11 +733,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
 came from which parent and what mutations were introduced).
 
 Respond with a JSON array of objects, each containing:
-- "location": the identifier (e.g. "Table S1", "Figure 2B", "
+- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
 
+IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
+
 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figure showing complete variant lineages.
 Text sections is used when no suitable tables/figurews exist.
@@ -747,7 +837,7 @@ def identify_campaigns(
     debug_dir: str | Path | None = None,
 ) -> List[Campaign]:
     """Identify distinct directed evolution campaigns in the manuscript."""
-    prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text
+    prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
     campaigns_data: List[dict] = []
     try:
         campaigns_data = generate_json_with_retry(
@@ -825,7 +915,7 @@ def identify_evolution_locations(
 
     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
+    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
@@ -1306,7 +1396,7 @@ def get_lineage(
     5. Return both variants and campaigns.
     """
     # First, identify campaigns in the manuscript
-    campaigns = identify_campaigns(full_text
+    campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
 
     if campaigns:
         log.info(f"Identified {len(campaigns)} distinct campaigns")
@@ -1364,7 +1454,7 @@ def get_lineage(
             context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
             locations_with_context.append({
                 'location': loc,
-                'context': context_text
+                'context': context_text  # Full extracted context
             })
 
         # For each campaign, ask Gemini to select the best location
@@ -1554,13 +1644,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
 - "section": the section heading or description
-- "page": the page number
+- "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
 
 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
--
+- For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences
 
+CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
+- Correct: "53", "S12", "147"
+- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+
 Return [] if no sequence sections are found.
 Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
 
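Note: the prompt now fixes an exact page-token grammar ("53", "S12"), which makes the model's output mechanically checkable. A small validator one could run over the returned JSON (the helper name is ours, not part of the package):

import re

_PAGE_TOKEN_RE = re.compile(r"^S?\d+$")  # "53" and "S12" pass; "p. 53" and "page 53" fail

def is_valid_page_token(token):
    return bool(_PAGE_TOKEN_RE.match(token.strip()))

assert is_valid_page_token("53") and is_valid_page_token("S12")
assert not is_valid_page_token("p. 53") and not is_valid_page_token("page 53")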
@@ -1572,7 +1666,7 @@ TEXT (truncated):
 
 def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
     """Ask Gemini for promising places to look for sequences."""
-    prompt = _SEQ_LOC_PROMPT.format(chunk=text
+    prompt = _SEQ_LOC_PROMPT.format(chunk=text)
     try:
         locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
         return locs if isinstance(locs, list) else []