debase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/__init__.py +18 -0
- debase/__main__.py +9 -0
- debase/_version.py +3 -0
- debase/build_db.py +190 -0
- debase/cleanup_sequence.py +905 -0
- debase/enzyme_lineage_extractor.py +2169 -0
- debase/lineage_format.py +808 -0
- debase/reaction_info_extractor.py +2331 -0
- debase/substrate_scope_extractor.py +2039 -0
- debase/wrapper.py +303 -0
- debase-0.1.0.dist-info/METADATA +299 -0
- debase-0.1.0.dist-info/RECORD +17 -0
- debase-0.1.0.dist-info/WHEEL +5 -0
- debase-0.1.0.dist-info/entry_points.txt +2 -0
- debase-0.1.0.dist-info/licenses/LICENSE +21 -0
- debase-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2039 @@
"""substrate_scope_extractor.py

Single-file, maintainable CLI tool that extracts the *substrate scope* table
from one or two PDFs (manuscript + SI) using Google Gemini (or compatible).

The file mirrors the *section layout* and logging/debug philosophy of
`enzyme_lineage_extractor.py` so that both tools share a consistent developer
experience and can even live in the same package.

Navigate quickly by jumping to the numbered headers:

    # === 1. CONFIG & CONSTANTS ===
    # === 2. DOMAIN MODELS ===
    # === 3. LOGGING HELPERS ===
    # === 4. PDF HELPERS ===
    # === 5. LLM (GEMINI) HELPERS ===
    # === 6. SCOPE EXTRACTION ===
    # === 7. VALIDATION & MERGE ===
    # === 8. PIPELINE ORCHESTRATOR ===
    # === 9. CLI ENTRYPOINT ===
"""

# === 1. CONFIG & CONSTANTS ===
from __future__ import annotations

import os
import re
import json
import time
import logging
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Union

MODEL_NAME: str = "gemini-2.5-flash"
MAX_CHARS: int = 150_000  # Max characters sent to LLM
BATCH_SIZE: int = 10  # Batch size when extracting reactions
MAX_RETRIES: int = 4  # LLM retry loop
CACHE_DIR: Path = Path.home() / ".cache" / "substrate_scope"

# Ensure cache directory exists
CACHE_DIR.mkdir(parents=True, exist_ok=True)

# === 2. DOMAIN MODELS ===
@dataclass
class SubstrateProduct:
    """Chemical entity in a substrate scope reaction."""
    name: str
    iupac_name: Optional[str] = None

@dataclass
class Cofactor:
    """Cofactor with optional IUPAC name and role."""
    name: str
    iupac_name: Optional[str] = None
    role: Optional[str] = None

@dataclass
class ReactionConditions:
    """Reaction conditions for substrate scope."""
    temperature: Optional[str] = None
    ph: Optional[str] = None
    substrate_concentration: Optional[str] = None
    buffer: Optional[str] = None
    other_conditions: Optional[str] = None

@dataclass
class ScopeEntry:
    """Single substrate scope reaction data point."""
    enzyme_id: str
    substrates: List[SubstrateProduct] = field(default_factory=list)
    products: List[SubstrateProduct] = field(default_factory=list)
    cofactors: List[Cofactor] = field(default_factory=list)

    # Performance metrics
    yield_percent: Optional[float] = None
    ttn: Optional[float] = None
    ee: Optional[float] = None

    # Reaction conditions
    conditions: ReactionConditions = field(default_factory=ReactionConditions)

    # Metadata
    data_location: Optional[str] = None
    data_source_type: Dict[str, str] = field(default_factory=dict)

    # Lineage information (populated during merge)
    parent_id: Optional[str] = None
    mutations: Optional[str] = None
    generation: Optional[int] = None
    aa_seq: Optional[str] = None
    dna_seq: Optional[str] = None
    confidence: Optional[float] = None
    notes: str = ""

@dataclass
class CompoundMapping:
    """Mapping between compound identifiers and IUPAC names."""
    identifiers: List[str]
    iupac_name: str
    common_names: List[str] = field(default_factory=list)
    compound_type: str = "unknown"
    source_location: Optional[str] = None
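
# Illustrative sketch (editorial addition, not part of the original source): the
# dataclasses above are plain containers, so a single extracted reaction could be
# assembled roughly as below. All values are invented placeholders.
#
#   entry = ScopeEntry(
#       enzyme_id="variant-X",
#       substrates=[SubstrateProduct(name="6a")],
#       products=[SubstrateProduct(name="7a")],
#       yield_percent=53.0,
#       ee=91.0,
#       conditions=ReactionConditions(temperature="room temperature", ph="7.4"),
#       data_location="Figure 3",
#   )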

# === 3. LOGGING HELPERS ===

# --- Debug dump helper ----------------------------------------------------
def _dump(text: str | bytes, path: Path | str) -> None:
    """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    mode = "wb" if isinstance(text, (bytes, bytearray)) else "w"
    with p.open(mode) as fh:
        fh.write(text)

def get_logger(name: str = __name__) -> logging.Logger:
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        fmt = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
        handler.setFormatter(logging.Formatter(fmt=fmt, datefmt="%Y-%m-%d %H:%M:%S"))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger

log = get_logger(__name__)

# === 4. PDF HELPERS ===
try:
    import fitz  # PyMuPDF
except ImportError as exc:  # pragma: no cover
    raise ImportError(
        "PyMuPDF is required for PDF parsing. Install with `pip install pymupdf`."
    ) from exc

from base64 import b64encode

# Improved caption prefix regex - captures most journal variants
# Simplified pattern: match any line starting with Table, Figure, Scheme, Chart, etc.
# This catches all variations including "Table S 2", "Figure.", etc.
_CAPTION_PREFIX_RE = re.compile(
    r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|Supplementary\s+(?:Table|Figure)).*",
    re.I | re.M
)

def _open_doc(pdf_path: str | Path | bytes):
    if isinstance(pdf_path, (str, Path)):
        return fitz.open(pdf_path)  # type: ignore[arg-type]
    return fitz.open(stream=pdf_path, filetype="pdf")  # type: ignore[arg-type]

def extract_text(pdf_path: str | Path | bytes) -> str:
    """Extract raw text from a PDF file (all blocks)."""
    doc = _open_doc(pdf_path)
    try:
        return "\n".join(page.get_text() for page in doc)
    finally:
        doc.close()

def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
    """Extract figure/table captions using the improved regex."""
    doc = _open_doc(pdf_path)
    captions: list[str] = []
    try:
        for page in doc:
            page_dict = page.get_text("dict")
            for block in page_dict.get("blocks", []):
                # Get all lines in this block
                block_lines = []
                for line in block.get("lines", []):
                    text_line = "".join(span["text"] for span in line.get("spans", []))
                    block_lines.append(text_line.strip())

                # Check if any line starts with a caption prefix
                for i, line in enumerate(block_lines):
                    if _CAPTION_PREFIX_RE.match(line):
                        # Found a caption start - collect lines
                        caption_parts = [line]
                        for j in range(i + 1, len(block_lines)):
                            next_line = block_lines[j]
                            if not next_line:  # Empty line signals end
                                break
                            if _CAPTION_PREFIX_RE.match(next_line):
                                break
                            caption_parts.append(next_line)

                        full_caption = " ".join(caption_parts)
                        captions.append(full_caption)
    finally:
        doc.close()

    joined = "\n".join(captions)
    return joined[:max_chars]
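
# Usage sketch (editorial addition; assumes a local "manuscript.pdf"):
#
#   captions = extract_captions("manuscript.pdf")
#   print(captions.splitlines()[:5])
#
# extract_captions() walks the PyMuPDF text blocks and keeps only lines matching
# _CAPTION_PREFIX_RE, so the result is a newline-joined list of captions trimmed
# to `max_chars`.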

def limited_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -> str:
    """Concatenate **all text** from PDFs, trimmed to `max_chars`."""
    total = 0
    chunks: list[str] = []
    for p in pdf_paths:
        t = extract_text(p)
        if total + len(t) > max_chars:
            t = t[: max_chars - total]
        chunks.append(t)
        total += len(t)
        if total >= max_chars:
            break
    return "\n".join(chunks)
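
# Editorial note: limited_concat() trims the *combined* text of all PDFs to the
# character budget, so with the default MAX_CHARS a long SI can be cut off.
# A hedged example call:
#
#   full_text = limited_concat("manuscript.pdf", "si.pdf", max_chars=50_000)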

def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -> str:
    """Concatenate caption text and SI table of contents from PDFs, trimmed to `max_chars`."""
    total = 0
    chunks: list[str] = []

    for idx, p in enumerate(pdf_paths):
        # For SI (second PDF), first extract table of contents pages
        if idx == 1:  # SI document
            doc = _open_doc(p)
            try:
                # Extract first few pages which typically contain TOC
                toc_text = []
                for page_num in range(min(5, doc.page_count)):
                    if total >= max_chars:
                        break
                    page = doc.load_page(page_num)
                    page_text = page.get_text()

                    # Look for TOC indicators
                    if any(indicator in page_text.lower() for indicator in
                           ['table of contents', 'supporting information', 'contents', 'page']):
                        toc_text.append(f"\n[SI TOC Page {page_num + 1}]\n{page_text}")
                        total += len(page_text)

                if toc_text:
                    chunks.extend(toc_text)
            finally:
                doc.close()

        # Extract captions
        if total < max_chars:
            t = extract_captions(p)
            if total + len(t) > max_chars:
                t = t[: max_chars - total]
            chunks.append(t)
            total += len(t)
            if total >= max_chars:
                break

    return "\n".join(chunks)

def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
    """Extract figure as a page region when embedded images aren't available.

    Args:
        pdf_paths: List of PDF paths to search
        figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")

    Returns:
        Base64-encoded PNG string or None if not found
    """
    if not pdf_paths:
        return None

    # Always extract the base figure number, removing sub-letters like (a), (b), c, etc.
    import re
    # Match patterns like "Figure 1", "Figure 1c", "Figure 1(c)", "Fig. 1", etc.
    base_figure_match = re.match(r'((?:Figure|Fig\.?)\s*\d+)', figure_ref, re.IGNORECASE)
    if base_figure_match:
        base_figure_ref = base_figure_match.group(1)
        log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
    else:
        base_figure_ref = figure_ref

    for pdf_path in pdf_paths:
        doc = _open_doc(pdf_path)
        try:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text()

                # Check if this page contains the figure caption
                found = False
                caption_instances = None

                # Look for figure caption
                variations = [
                    f"{base_figure_ref}.",  # "Figure 1." - most reliable
                    f"{base_figure_ref} ",  # "Figure 1 "
                    base_figure_ref,
                ]

                for variation in variations:
                    caption_instances = page.search_for(variation, quads=False)
                    if caption_instances:
                        # Check if this is likely a caption (not a reference in text)
                        for rect in caption_instances:
                            # Get text around this location
                            x0, y0, x1, y1 = rect
                            text_around = page.get_textbox(fitz.Rect(x0-50, y0-5, x1+300, y1+20))
                            # Check if it looks like a figure caption
                            if any(keyword in text_around.lower() for keyword in
                                   ['directed evolution', 'substrate scope', '(a)', '(b)', '(c)']):
                                found = True
                                caption_rect = rect
                                break
                    if found:
                        break

                if not found:
                    continue

                log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)

                # Extract a region of the page above the caption
                # The figure should be between the top of the viewable area and the caption
                page_rect = page.rect

                # Define the region to extract
                # Extract everything above the caption
                top_margin = 0  # Start from the very top of the page
                bottom_margin = 5  # Small margin above caption
                left_margin = 0  # Use full page width
                right_margin = 0

                # Calculate the figure region - everything from top to caption
                fig_top = top_margin
                fig_bottom = caption_rect.y0 - bottom_margin
                fig_left = left_margin
                fig_right = page_rect.width - right_margin

                # Create the clip rectangle
                clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)

                # Extract the region as an image
                mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
                pix = page.get_pixmap(clip=clip_rect, matrix=mat)

                # Convert to PNG
                img_bytes = pix.tobytes("png")
                log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
                         clip_rect.width * 2, clip_rect.height * 2, page_num + 1)

                return b64encode(img_bytes).decode()

        finally:
            doc.close()

    log.warning("Could not find figure caption for '%s'", figure_ref)
    return None
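
# Usage sketch (editorial addition): the caller passes Path objects and a free-form
# figure reference; sub-panel letters are stripped before searching, so "Figure 3(a)"
# and "Figure 3" return the same page region. File names below are placeholders.
#
#   img_b64 = extract_figure_image([Path("manuscript.pdf")], "Figure 3(a)")
#   if img_b64 is not None:
#       Path("figure3.png").write_bytes(base64.b64decode(img_b64))  # assumes `import base64`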


def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
    """Extract text around a specific reference (e.g., 'Figure 3')."""
    import re
    extracted_sections = []

    # Try to extract base figure/table reference
    base_ref_match = re.match(r'((?:Figure|Fig|Table)\s*\d+)', ref, re.IGNORECASE)
    base_ref = base_ref_match.group(1) if base_ref_match else ref

    for pdf_path in pdf_paths:
        doc = _open_doc(pdf_path)
        try:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)
                page_text = page.get_text()

                # Try different variations of the reference
                ref_variations = [
                    ref,  # Original (e.g., "Figure 3(a)")
                    base_ref,  # Base reference (e.g., "Figure 3")
                    ref.replace("(", " ").replace(")", ""),  # "Figure 3 a"
                    ref.replace("(", "").replace(")", ""),  # "Figure 3a"
                    ref.replace("Figure", "Fig"),  # "Fig 3(a)"
                    base_ref.replace("Figure", "Fig"),  # "Fig 3"
                ]

                # Find the reference in the page
                found = False
                pos = -1
                used_ref = ref

                for ref_var in ref_variations:
                    if ref_var.lower() in page_text.lower():
                        pos = page_text.lower().find(ref_var.lower())
                        used_ref = ref_var
                        found = True
                        break

                if found and pos >= 0:
                    # Extract context around it
                    start = max(0, pos - context_chars)
                    end = min(len(page_text), pos + len(used_ref) + context_chars)

                    section = page_text[start:end]
                    extracted_sections.append(
                        f"\n=== Context around '{ref}' (found as '{used_ref}') in {pdf_path.name}, page {page_num + 1} ===\n{section}"
                    )
                    log.debug("Found '%s' as '%s' on page %d of %s", ref, used_ref, page_num + 1, pdf_path.name)
        finally:
            doc.close()

    if not extracted_sections:
        log.warning("Could not find reference '%s' or base reference '%s' in any PDF", ref, base_ref)

    return "\n".join(extracted_sections)

def _extract_sections_by_title(pdf_paths: List[Path], section_titles: List[str], max_chars_per_section: int = 10000) -> str:
    """Extract sections by their titles from PDFs."""
    import re
    extracted_sections = []

    for pdf_path in pdf_paths:
        doc = _open_doc(pdf_path)
        try:
            # Build full text with page markers
            pages_text = []
            for i, page in enumerate(doc):
                page_text = page.get_text()
                pages_text.append(f"\n[PAGE {i+1}]\n{page_text}")
            full_text = "".join(pages_text)

            for title in section_titles:
                # Find section start
                title_pattern = re.escape(title)
                match = re.search(rf'{title_pattern}', full_text, re.IGNORECASE)

                if match:
                    start_pos = match.start()

                    # Find the page number
                    page_match = re.search(r'\[PAGE (\d+)\]', full_text[:start_pos][::-1])
                    page_num = "unknown"
                    if page_match:
                        page_num = page_match.group(1)[::-1]

                    # Try to find the next section header
                    next_section_patterns = [
                        r'\n[A-Z][A-Za-z\s]+:\s*\n',  # "Section Title:\n"
                        r'\n\d+\.\s+[A-Z]',  # "1. Next Section"
                        r'\n[A-Z]{2,}[A-Z\s]*\n',  # "SECTION HEADER\n"
                        r'\nReferences\s*\n',
                        r'\nAcknowledg',
                        r'\n\[PAGE \d+\]',  # Next page
                    ]

                    end_pos = len(full_text)
                    for pattern in next_section_patterns:
                        next_match = re.search(pattern, full_text[start_pos + 100:], re.IGNORECASE)
                        if next_match:
                            end_pos = min(end_pos, start_pos + 100 + next_match.start())

                    # Extract section with size limit
                    section_text = full_text[start_pos:min(start_pos + max_chars_per_section, end_pos)]

                    # Clean up page markers
                    section_text = re.sub(r'\[PAGE \d+\]', '', section_text)

                    extracted_sections.append(
                        f"\n=== Section: '{title}' from {pdf_path.name} (starting page {page_num}) ===\n{section_text}"
                    )
                    log.info("Extracted section '%s' (%d chars) from %s",
                             title, len(section_text), pdf_path.name)
        finally:
            doc.close()

    return "\n".join(extracted_sections)

def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
    """Extract text from a specific page number in the PDFs."""
    # Convert page number to int and handle S-prefix
    page_str = str(page_num).strip().upper()
    if page_str.startswith('S'):
        # Supplementary page - look in the SI PDF (second PDF)
        actual_page = int(page_str[1:]) - 1  # 0-indexed
        pdf_index = 1 if len(pdf_paths) > 1 else 0
    else:
        # Regular page - look in the main PDF
        actual_page = int(page_str) - 1  # 0-indexed
        pdf_index = 0

    if pdf_index >= len(pdf_paths):
        log.warning("Page %s requested but not enough PDFs provided", page_str)
        return ""

    try:
        doc = _open_doc(pdf_paths[pdf_index])
        if 0 <= actual_page < len(doc):
            page = doc[actual_page]
            page_text = page.get_text()
            doc.close()
            log.info("Extracted %d chars from page %s of %s",
                     len(page_text), page_str, pdf_paths[pdf_index].name)
            return page_text
        else:
            log.warning("Page %s (index %d) out of range for %s (has %d pages)",
                        page_str, actual_page, pdf_paths[pdf_index].name, len(doc))
            doc.close()
            return ""
    except Exception as e:
        log.error("Failed to extract page %s: %s", page_str, e)
        return ""

def _extract_text_from_pages(pdf_paths: List[Path], page_nums: List[Union[str, int]], max_pages: int = 10) -> str:
    """Extract text from multiple page numbers."""
    all_text = []
    pages_extracted = 0

    for page_num in page_nums[:max_pages]:
        page_text = _extract_text_from_page(pdf_paths, page_num)
        if page_text:
            all_text.append(f"\n[PAGE {page_num}]\n{page_text}")
            pages_extracted += 1

    if pages_extracted == 0:
        log.warning("No pages extracted from requested pages: %s", page_nums[:5])
    else:
        log.info("Extracted text from %d pages", pages_extracted)
    return "\n".join(all_text)
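
# Editorial note: page references use the same convention as the prompts below,
# i.e. an "S" prefix selects the SI PDF (the second path). A hedged example with
# placeholder file names:
#
#   si_text = _extract_text_from_pages([Path("manuscript.pdf"), Path("si.pdf")],
#                                      ["S22", "S23", "S24"])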

# === 5. LLM (GEMINI) HELPERS === ---------------------------------------------
from typing import Tuple

_BACKOFF_BASE = 2.0  # exponential back-off base (seconds)

# -- 5.1 Import whichever SDK is installed -----------------------------------

def _import_gemini_sdk() -> Tuple[str, Any]:
    """Return (flavor, module) where flavor in {"new", "legacy"}."""
    try:
        import google.generativeai as genai  # official SDK >= 1.0
        return "new", genai
    except ImportError:
        try:
            import google_generativeai as genai  # legacy prerelease name
            return "legacy", genai
        except ImportError as exc:
            raise ImportError(
                "Neither 'google-generativeai' (>=1.0) nor 'google_generativeai'\n"
                "is installed. Run: pip install --upgrade google-generativeai"
            ) from exc

_SDK_FLAVOR, _genai = _import_gemini_sdk()

# -- 5.2 Model factory --------------------------------------------------------

def get_model():
    """Configure API key and return a `GenerativeModel` instance."""
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
    _genai.configure(api_key=api_key)
    # Positional constructor arg works for both SDK flavors
    return _genai.GenerativeModel(MODEL_NAME)
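
# Usage sketch (editorial addition): the key is only read from the environment,
# so a typical setup is
#
#   os.environ["GEMINI_API_KEY"] = "<your key>"   # or export it in the shell
#   model = get_model()
#
# get_model() raises EnvironmentError when the variable is missing.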

# === 5.3 Unified call helper ----------------------------------------------

def _extract_text(resp) -> str:
    """
    Pull the *first* textual part out of a GenerativeAI response, handling both
    the old prerelease SDK and the >=1.0 SDK.

    Returns an empty string if no textual content is found.
    """
    # 1) Legacy SDK (<= 0.4) - still has nice `.text`
    if getattr(resp, "text", None):
        return resp.text

    # 2) >= 1.0 SDK
    if getattr(resp, "candidates", None):
        cand = resp.candidates[0]

        # 2a) Some beta builds still expose `.text`
        if getattr(cand, "text", None):
            return cand.text

        # 2b) Official path: candidate.content.parts[*].text
        if getattr(cand, "content", None):
            parts = [
                part.text  # Part objects have .text
                for part in cand.content.parts
                if getattr(part, "text", None)
            ]
            if parts:
                return "".join(parts)

    # 3) As a last resort fall back to str()
    return str(resp)

def generate_json_with_retry(
    model,
    prompt: str,
    schema_hint: str | None = None,
    *,
    max_retries: int = MAX_RETRIES,
    debug_dir: str | Path | None = None,
    tag: str = 'gemini',
):
    """
    Call Gemini with retries & exponential back-off, returning parsed JSON.

    Also strips Markdown fences that the model may wrap around its JSON.
    """
    # Log prompt details
    log.info("=== GEMINI API CALL: %s ===", tag.upper())
    log.info("Prompt length: %d characters", len(prompt))
    log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])

    # Save full prompt to debug directory if provided
    if debug_dir:
        debug_path = Path(debug_dir)
        debug_path.mkdir(parents=True, exist_ok=True)
        prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
        with open(prompt_file, 'w') as f:
            f.write(f"=== PROMPT FOR {tag.upper()} ===\n")
            f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Length: {len(prompt)} characters\n")
            f.write("="*80 + "\n\n")
            f.write(prompt)
        log.info("Full prompt saved to: %s", prompt_file)

    fence_re = re.compile(r"```json|```", re.I)
    for attempt in range(1, max_retries + 1):
        try:
            log.info("Calling Gemini API (attempt %d/%d)...", attempt, max_retries)
            resp = model.generate_content(prompt)
            raw = _extract_text(resp).strip()

            # Log response
            log.info("Gemini response length: %d characters", len(raw))
            log.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])

            # Save full response to debug directory
            if debug_dir:
                response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
                with open(response_file, 'w') as f:
                    f.write(f"=== RESPONSE FOR {tag.upper()} ===\n")
                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write(f"Length: {len(raw)} characters\n")
                    f.write("="*80 + "\n\n")
                    f.write(raw)
                log.info("Full response saved to: %s", response_file)

            # Remove common Markdown fences
            if raw.startswith("```"):
                raw = fence_re.sub("", raw).strip()

            # Try to find JSON in the response
            # First, try to parse as-is
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                # If that fails, look for JSON array or object
                # Find the first '[' or '{' and the matching closing bracket
                json_start = -1
                json_end = -1
                bracket_stack = []
                in_string = False
                escape_next = False

                for i, char in enumerate(raw):
                    if escape_next:
                        escape_next = False
                        continue

                    if char == '\\':
                        escape_next = True
                        continue

                    if char == '"' and not escape_next:
                        in_string = not in_string
                        continue

                    if in_string:
                        continue

                    if char in '[{':
                        if json_start == -1:
                            json_start = i
                        bracket_stack.append(char)
                    elif char in ']}':
                        if bracket_stack:
                            opening = bracket_stack.pop()
                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
                                if not bracket_stack:  # Found complete JSON
                                    json_end = i + 1
                                    break

                if json_start >= 0 and json_end > json_start:
                    # Extract the JSON portion
                    json_str = raw[json_start:json_end]
                    parsed = json.loads(json_str)
                else:
                    # Look for simple [] in the response
                    if '[]' in raw:
                        parsed = []
                    else:
                        # No JSON structure found, re-raise the original error
                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
            log.info("Successfully parsed JSON response")
            return parsed
        except Exception as exc:  # broad except OK here
            log.warning(
                "Gemini call failed (attempt %d/%d): %s",
                attempt, max_retries, exc,
            )
            if attempt == max_retries:
                raise
            time.sleep(_BACKOFF_BASE ** attempt)
# -------------------------------------------------------------------- end 5 ---
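
# Editorial sketch of the retry helper in isolation (assumes GEMINI_API_KEY is set;
# the prompt and tag are placeholders, not part of the original file):
#
#   model = get_model()
#   data = generate_json_with_retry(
#       model,
#       "Return a JSON array with the numbers 1 to 3.",
#       tag="smoke_test",
#   )
#   # expected: [1, 2, 3] once fences are stripped and the JSON is parsed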

# === 6. SCOPE EXTRACTION ===
"""
Substrate scope extraction with compound mapping, enzyme-substrate pairing,
and individual reaction extraction.
"""

# ---- 6.1 Prompt templates -------------------------------------------------

_SCOPE_LOC_PROMPT = """
You are an expert reader of biocatalysis manuscripts.
Analyze this paper and identify all locations containing substrate scope data.

Your task is to:
1. Identify all locations (tables, figures, text) containing substrate scope reaction data
2. Distinguish substrate scope studies from model reactions used for evolution
3. Determine which enzyme variants were tested in substrate scope studies
4. Note if multiple substrates are tested with the same enzyme variant

Return your analysis as JSON array (max {max_results} locations):
[
  {{
    "location": "e.g., SI Table 4, Figure 3, etc.",
    "type": "table/figure/text",
    "confidence": 0-100,
    "reason": "why this contains substrate scope",
    "enzyme_variants_tested": ["list", "of", "enzyme", "variants"],
    "number_of_substrates": "approximate number"
  }}
]
""".strip()

_IUPAC_SECTION_PROMPT = """
Analyze the provided Supporting Information table of contents pages to identify sections containing compound IUPAC names.

Look for sections that contain:
1. "Synthesis" of compounds with numbered identifiers
2. "Characterization" data for compounds
3. "General procedure" sections listing compounds
4. "NMR spectra" or "MS data" sections

IMPORTANT:
- Return the EXACT page range where compounds are characterized
- Use S prefix for supplementary pages (e.g., "S22-S30" not "22-30")
- Include both starting AND ending page numbers

Return JSON:
{
  "iupac_sections": [
    {
      "section": "exact section title as written",
      "page_range": "page range (e.g., 'S22-S45')",
      "description": "what compounds are described"
    }
  ]
}
""".strip()

_COMPOUND_MAPPING_PROMPT = """
Extract compound identifiers and their chemical names EXACTLY as they appear in the text.

STRICT RULES:
1. ONLY extract what is explicitly written in the text
2. Look for patterns where compound IDs are paired with chemical names
3. DO NOT infer, generate, or guess any chemical names
4. If a compound ID appears without a chemical name, return null for iupac_name
5. If a product was "not detected" or "not formed", return null for iupac_name

For each compound:
- identifier: The exact compound ID as written (e.g., "1", "2a", "SM-1")
- iupac_name: The chemical name if explicitly provided, otherwise null
- common_names: Any alternative names mentioned
- compound_type: substrate/product/reagent/catalyst/other
- source_location: The exact text excerpt where this information was found

Return as JSON:
{
  "compound_mappings": [
    {
      "identifier": "string",
      "iupac_name": "string or null",
      "common_names": ["array of strings"],
      "compound_type": "string",
      "source_location": "string"
    }
  ]
}

Note: It is better to return null than to hallucinate or infer chemical structures.
""".strip()

_SUBSTRATE_SCOPE_PROMPT = """
Extract ALL substrate scope data from the primary sources in one complete extraction.
{extraction_hints}

For EACH reaction, extract:
1. Enzyme variant ID
2. Substrate identifiers (e.g., "6a", "5")
3. Product identifiers (e.g., "7a", "7b", "7d", "7e") - ALWAYS include even if no yield
4. Performance metrics (yield%, ee%, dr, TTN)
5. Reaction conditions (temperature, pH, buffer, substrate concentrations - NOT dithionite/reducing agents)
6. Data location (which figure/table this comes from)

CRITICAL - NO HALLUCINATION OR MODIFICATION:
- Extract values EXACTLY as written in the primary source - NO CHANGES WHATSOEVER
- DO NOT round, estimate, convert, or modify any numbers
- If the text shows "53%", report 53.0, not 53 or 53.00
- If the text shows "<5%", report exactly "<5" as a string in notes, yield_percent=null
- If the text shows "trace", report exactly "trace" in notes, yield_percent=null
- If the text shows "n.d.", report exactly "n.d." in notes, yield_percent=null
- If the text shows "80:20 er", calculate ee as 60.0 (|80-20|)
- If the text shows "91% ee", report ee_percent as 91.0
- If no value is shown, return null, not 0 or empty string
- Extract ALL reactions from ALL identified locations
- Use compound identifiers EXACTLY as shown (not IUPAC names)
- For every entry, there needs to be identifier for both substrates and products, even if yield is null or activity is 0.
- Extract reaction conditions EXACTLY as written - NO PARAPHRASING
- IMPORTANT: Substrate concentration refers to the concentration of the actual chemical substrates being transformed in the reaction, NOT reducing agents (e.g., dithionite, NADH) or other additives

IMPORTANT: Each substrate should have a corresponding product identifier. Even when there is no yield, return
the exact identifier as seen in the reaction.

Return as JSON:
{{
  "substrate_scope_data": [
    {{
      "enzyme_id": "enzyme variant name",
      "substrate_ids": ["list of substrate identifiers"],
      "product_ids": ["list of product identifiers"],
      "yield_percent": null or number,
      "ee_percent": null or number,
      "dr": "ratio if reported",
      "ttn": null or number,
      "reaction_conditions": {{
        "temperature": "",
        "ph": "",
        "buffer": "",
        "substrate_concentration": "concentration of actual substrates/reagents, NOT reducing agents like dithionite",
        "other_conditions": "including enzyme loading, reducing agents (e.g., dithionite), time, etc."
      }},
      "data_location": "specific figure/table",
      "notes": "any special notes (e.g., 'no product detected')"
    }}
  ]
}}
""".strip()
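
# Editorial note: _SCOPE_LOC_PROMPT and _SUBSTRATE_SCOPE_PROMPT are rendered with
# str.format() (see identify_scope_locations() and extract_all_substrate_scope_data()),
# which is why their literal JSON braces are doubled as "{{" / "}}". The other two
# templates are concatenated with "+" and therefore keep single braces.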

# ---- 6.2 Helper functions -------------------------------------------------

def identify_scope_locations(
    text: str,
    model,
    *,
    max_results: int = 5,
    debug_dir: str | Path | None = None,
) -> List[dict]:
    """Ask Gemini where substrate scope data is located."""
    prompt = _SCOPE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
    locs: List[dict] = []
    try:
        locs = generate_json_with_retry(
            model,
            prompt,
            debug_dir=debug_dir,
            tag="scope_locate",
        )
    except Exception as exc:  # pragma: no cover
        log.warning("identify_scope_locations(): %s", exc)
    return locs if isinstance(locs, list) else []

def identify_iupac_sections(
    text: str,
    model,
    *,
    pdf_paths: List[Path] = None,
    debug_dir: str | Path | None = None,
) -> List[dict]:
    """Identify sections containing IUPAC names from SI table of contents."""
    # Extract only SI TOC pages (first 5 pages of SI)
    si_toc_text = ""
    if pdf_paths and len(pdf_paths) > 1:
        si_pdf = pdf_paths[1]  # Second PDF is SI
        doc = _open_doc(si_pdf)
        try:
            for page_num in range(min(5, doc.page_count)):
                page = doc.load_page(page_num)
                page_text = page.get_text()
                si_toc_text += f"\n[SI Page {page_num + 1}]\n{page_text}"
        finally:
            doc.close()

    if not si_toc_text:
        # Fallback to caption text
        si_toc_text = text[:15_000]

    prompt = _IUPAC_SECTION_PROMPT + "\n\nTEXT:\n" + si_toc_text

    try:
        data = generate_json_with_retry(
            model,
            prompt,
            debug_dir=debug_dir,
            tag="iupac_sections",
        )

        sections = data.get("iupac_sections", []) if isinstance(data, dict) else []
        log.info("Identified %d sections containing IUPAC names", len(sections))
        return sections

    except Exception as exc:
        log.warning("Failed to identify IUPAC sections: %s", exc)
        return []

def _extract_compound_mappings_from_text(
    extraction_text: str,
    model,
    compound_ids: List[str] = None,
    debug_dir: str | Path | None = None,
    tag_suffix: str = "",
) -> Dict[str, CompoundMapping]:
    """Helper function to extract compound mappings from provided text."""
    prompt = _COMPOUND_MAPPING_PROMPT
    if compound_ids:
        prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
    prompt += "\n\nTEXT:\n" + extraction_text

    tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"

    try:
        data = generate_json_with_retry(
            model,
            prompt,
            debug_dir=debug_dir,
            tag=tag,
        )

        mappings = {}
        compound_mappings_data = data.get("compound_mappings") or []
        for item in compound_mappings_data:
            # Handle both old format (with identifiers list) and new format (with identifier string)
            identifiers = item.get("identifiers") or []
            if not identifiers and item.get("identifier"):
                identifiers = [item.get("identifier")]

            mapping = CompoundMapping(
                identifiers=identifiers,
                iupac_name=item.get("iupac_name", ""),
                common_names=item.get("common_names") or [],
                compound_type=item.get("compound_type", "unknown"),
                source_location=item.get("source_location")
            )

            # Create lookup entries for all identifiers and common names
            for identifier in mapping.identifiers + mapping.common_names:
                if identifier:
                    mappings[identifier.lower().strip()] = mapping

        return mappings

    except Exception as exc:
        log.error("Failed to extract compound mappings: %s", exc)
        return {}

def _extract_compound_mappings_with_figures(
    text: str,
    model,
    compound_ids: List[str],
    figure_images: Dict[str, str],
    pdf_paths: List[Path],
    debug_dir: str | Path | None = None,
    tag_suffix: str = "",
) -> Dict[str, CompoundMapping]:
    """Extract compound mappings using multimodal approach with figures."""
    # Enhanced prompt for figure-based extraction
    prompt = """You are an expert chemist analyzing chemical figures and manuscript text to identify compound IUPAC names.

TASK: Find the IUPAC names for these specific compound identifiers: """ + ", ".join(sorted(compound_ids)) + """

APPROACH (in order of preference):
1. First, look for explicitly written IUPAC names in text or captions
2. If not found, look for common/trivial names that you can convert to IUPAC
3. As last resort, carefully analyze chemical structures in figures to derive IUPAC names

CRITICAL ACCURACY REQUIREMENTS:
When deriving IUPAC names from structures:
- Count ALL atoms and bonds carefully - do not miss any substituents
- Verify the COMPLETE structure matches your IUPAC name
- For cyclopropanes: use "cyclopropane-1-carboxylate" NOT "cyclopropanecarboxylate"
- Include stereochemistry only if clearly shown (trans-, cis-, R/S)
- Double-check ring sizes, substituent positions, and functional groups
- If a structure is unclear or ambiguous, return null rather than guess

VALIDATION CHECKLIST before providing an IUPAC name:
□ Have I accounted for EVERY atom in the structure?
□ Have I identified ALL functional groups correctly?
□ Is the parent chain/ring correctly identified?
□ Are substituent positions numbered correctly?
□ Is the name formatted with proper punctuation (hyphens, commas)?
□ Would this IUPAC name regenerate EXACTLY the structure shown?

Common mistakes to avoid:
- Missing substituents (e.g., forgetting a methoxy group)
- Wrong ring size (e.g., calling a benzene ring a cyclohexane)
- Incorrect substituent positions
- Using "benzyl" vs "phenyl" incorrectly
- Missing or incorrect stereochemistry

Return as JSON:
{
  "compound_mappings": [
    {
      "identifier": "compound identifier",
      "iupac_name": "valid IUPAC systematic name or null if uncertain",
      "common_names": ["common names found in text"],
      "compound_type": "substrate/product/reagent",
      "source_location": "where found/how determined"
    }
  ]
}

TEXT FROM MANUSCRIPT:
""" + text

    # Prepare multimodal content
    content_parts = [prompt]

    # Add figure images
    if figure_images:
        import PIL.Image
        import io
        import base64

        for fig_ref, fig_base64 in figure_images.items():
            try:
                img_bytes = base64.b64decode(fig_base64)
                image = PIL.Image.open(io.BytesIO(img_bytes))
                content_parts.append(f"\n[Figure: {fig_ref}]")
                content_parts.append(image)
                log.info("Added figure %s to multimodal compound mapping", fig_ref)
            except Exception as e:
                log.warning("Failed to add figure %s: %s", fig_ref, e)

    tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"

    try:
        # Log multimodal call
        log.info("=== GEMINI MULTIMODAL API CALL: COMPOUND_MAPPING_WITH_FIGURES ===")
        log.info("Text prompt length: %d characters", len(prompt))
        log.info("Number of images: %d", len(content_parts) - 1)
        log.info("Compounds to find: %s", ", ".join(sorted(compound_ids)))

        # Save debug info
        if debug_dir:
            debug_path = Path(debug_dir)
            prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
            with open(prompt_file, 'w') as f:
                f.write(f"=== PROMPT FOR {tag.upper()} ===\n")
                f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Text length: {len(prompt)} characters\n")
                f.write(f"Images included: {len(content_parts) - 1}\n")
                for fig_ref in figure_images.keys():
                    f.write(f" - {fig_ref}\n")
                f.write("="*80 + "\n\n")
                f.write(prompt)
            log.info("Full prompt saved to: %s", prompt_file)

        # Make multimodal API call
        response = model.generate_content(content_parts)
        raw_text = _extract_text(response).strip()

        # Log response
        log.info("Gemini multimodal response length: %d characters", len(raw_text))

        if debug_dir:
            response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
            with open(response_file, 'w') as f:
                f.write(f"=== RESPONSE FOR {tag.upper()} ===\n")
                f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Length: {len(raw_text)} characters\n")
                f.write("="*80 + "\n\n")
                f.write(raw_text)
            log.info("Full response saved to: %s", response_file)

        # Parse JSON
        import json
        data = json.loads(raw_text.strip('```json').strip('```').strip())

        mappings = {}
        compound_mappings_data = data.get("compound_mappings") or []
        for item in compound_mappings_data:
            identifiers = item.get("identifiers") or []
            if not identifiers and item.get("identifier"):
                identifiers = [item.get("identifier")]

            mapping = CompoundMapping(
                identifiers=identifiers,
                iupac_name=item.get("iupac_name", ""),
                common_names=item.get("common_names") or [],
                compound_type=item.get("compound_type", "unknown"),
                source_location=item.get("source_location")
            )

            for identifier in mapping.identifiers + mapping.common_names:
                if identifier:
                    mappings[identifier.lower().strip()] = mapping

        return mappings

    except Exception as exc:
        log.error("Failed to extract compound mappings with figures: %s", exc)
        return {}

def _extract_text_for_compound_mapping(
    pdf_paths: List[Path],
    iupac_sections: List[dict],
    text_fallback: str,
) -> str:
    """Extract text from identified IUPAC sections for compound mapping."""
    extraction_text = ""

    if iupac_sections and pdf_paths:
        log.info("Extracting text from %d identified IUPAC sections", len(iupac_sections))

        # Use page-based extraction for each section
        for section in iupac_sections:
            section_title = section.get('section', '')
            page_range = section.get('page_range') or section.get('page', '')

            if page_range:
                log.info("Extracting section '%s' from page range %s", section_title, page_range)

                # Extract multiple pages starting from the given page
                pages_to_extract = []

                if '-' in str(page_range):
                    # Handle ranges like "S22-S45"
                    parts = page_range.split('-')
                    start_page = parts[0].strip()
                    end_page = parts[1].strip() if len(parts) > 1 else None

                    # Extract all pages in the range
                    if start_page.startswith('S') and end_page and end_page.startswith('S'):
                        try:
                            start_num = int(start_page[1:])
                            end_num = int(end_page[1:])
                            for i in range(start_num, min(end_num + 1, start_num + 15)):  # Max 15 pages
                                pages_to_extract.append(f"S{i}")
                        except:
                            pages_to_extract.append(start_page)
                    else:
                        pages_to_extract.append(start_page)
                else:
                    # Single page - extract it plus next 10 pages
                    start_page = str(page_range).strip()

                    # Ensure S prefix for SI pages
                    if not start_page.startswith('S') and start_page.isdigit():
                        start_page = 'S' + start_page

                    pages_to_extract.append(start_page)

                    # Add next 10 pages
                    try:
                        if start_page.startswith('S'):
                            base_num = int(start_page[1:])
                            for i in range(1, 11):  # Extract 10 more pages
                                pages_to_extract.append(f"S{base_num + i}")
                        else:
                            base_num = int(start_page)
                            for i in range(1, 11):
                                pages_to_extract.append(str(base_num + i))
                    except:
                        pass

                # Extract the pages
                page_text = _extract_text_from_pages(pdf_paths, pages_to_extract, max_pages=15)
                if page_text:
                    extraction_text += f"\n\n=== Section: '{section_title}' starting from page {page_range} ===\n{page_text}"
                else:
                    # Try title-based extraction as fallback
                    section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=10000)
                    if section_text:
                        extraction_text += section_text

        if not extraction_text:
            log.warning("No text extracted from IUPAC sections, falling back to limited text")
            extraction_text = text_fallback[:30_000]
    else:
        # Fallback to limited text
        extraction_text = text_fallback[:30_000]

    return extraction_text

def extract_compound_mappings(
    text: str,
    model,
    *,
    pdf_paths: List[Path] = None,
    iupac_sections: List[dict] = None,
    compound_ids: List[str] = None,
    debug_dir: str | Path | None = None,
) -> Dict[str, CompoundMapping]:
    """Extract compound ID to IUPAC name mappings from identified sections.

    Uses an adaptive strategy:
    1. First attempts extraction from identified IUPAC sections
    2. Checks for missing compounds
    3. Expands search to additional sections if compounds are missing
    """
    # Step 1: Extract text from initially identified sections
    extraction_text = _extract_text_for_compound_mapping(pdf_paths, iupac_sections, text)

    # Step 2: First extraction attempt
    mappings = _extract_compound_mappings_from_text(
        extraction_text, model, compound_ids, debug_dir, tag_suffix="initial"
    )
    log.info("Initial extraction found %d compound mappings", len(mappings))

    # Step 3: Check for missing compounds
    missing_compounds = []
    if compound_ids:
        for cid in compound_ids:
            mapping = mappings.get(cid.lower().strip())
            if not mapping or not mapping.iupac_name:
                missing_compounds.append(cid)

    # Step 4: Adaptive expansion if compounds are missing
    if missing_compounds and pdf_paths:
        log.info("Found %d compounds without IUPAC names: %s",
                 len(missing_compounds), sorted(missing_compounds))
        log.info("Expanding search to additional sections...")

        # Define additional sections that might contain compound definitions
        additional_sections = [
            "Engineering strategy",
            "Screening for benzyl acrylate cyclopropanation",
            "Evolution campaign",
            "General procedure",
            "Experimental procedures",
            "Materials and methods",
            "Substrate synthesis"
        ]

        # Extract text from additional sections
        additional_text = _extract_sections_by_title(
            pdf_paths, additional_sections, max_chars_per_section=5000
        )

        if additional_text:
            log.info("Extracted %d chars from additional sections", len(additional_text))

            # Second extraction attempt with expanded text
            expanded_mappings = _extract_compound_mappings_from_text(
                additional_text, model, missing_compounds, debug_dir, tag_suffix="expanded"
            )

            # Merge new mappings
            new_found = 0
            for key, mapping in expanded_mappings.items():
                if key not in mappings or not mappings[key].iupac_name:
                    if mapping.iupac_name:  # Only add if we found an IUPAC name
                        mappings[key] = mapping
                        new_found += 1
                        log.info("Found IUPAC name for '%s': %s",
                                 key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)

            log.info("Expanded search found %d additional compound mappings", new_found)
        else:
            log.warning("No additional text found in expanded sections")

    # Step 5: Check again for still missing compounds
    still_missing = []
    for cid in missing_compounds:
        mapping = mappings.get(cid.lower().strip())
        if not mapping or not mapping.iupac_name:
            still_missing.append(cid)

    # Step 6: Final fallback - use figures and full manuscript if compounds are still missing
    # COMMENTED OUT: Figure-based IUPAC extraction is unreliable
    # Generating IUPAC names from visual structures leads to errors
    # Only use text-based extraction for reliability

    # if still_missing:
    #     log.info("Still missing IUPAC names for %d compounds: %s",
    #              len(still_missing), sorted(still_missing))
    #     log.info("Attempting final extraction using figures and full manuscript...")
    #
    #     # Extract figure images if available
    #     figure_images = {}
    #     if hasattr(extract_compound_mappings, '_figure_images_cache'):
    #         figure_images = extract_compound_mappings._figure_images_cache
    #
    #     # Use multimodal approach with figures and manuscript text
    #     final_mappings = _extract_compound_mappings_with_figures(
    #         text[:50_000], model, still_missing, figure_images,
    #         pdf_paths, debug_dir, tag_suffix="figures"
    #     )
    #
    #     # Merge final mappings
    #     final_found = 0
    #     for key, mapping in final_mappings.items():
    #         if key not in mappings or not mappings[key].iupac_name:
    #             if mapping.iupac_name:
    #                 mappings[key] = mapping
    #                 final_found += 1
    #                 log.info("Found IUPAC name for '%s' using figures: %s",
    #                          key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
    #
    #     log.info("Figure-based search found %d additional compound mappings", final_found)

    if still_missing:
        log.info("Still missing IUPAC names for %d compounds: %s",
                 len(still_missing), sorted(still_missing))
        log.info("Note: Figure-based IUPAC extraction is disabled for reliability")

    log.info("Total compound mappings extracted: %d", len(mappings))
    return mappings
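
# Editorial sketch: the returned dict is keyed by the lowercased identifier (and any
# lowercased common name), so lookups should normalise the same way. `pdfs` below is
# a placeholder list of Path objects.
#
#   mappings = extract_compound_mappings(text, model, pdf_paths=pdfs, compound_ids=["6a", "7a"])
#   hit = mappings.get("6a".lower().strip())
#   iupac = hit.iupac_name if hit else None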
|
1327
|
+
|
1328
|
+
def extract_all_substrate_scope_data(
|
1329
|
+
text: str,
|
1330
|
+
model,
|
1331
|
+
locations: List[dict],
|
1332
|
+
*,
|
1333
|
+
pdf_paths: List[Path] = None,
|
1334
|
+
figure_images: Dict[str, str] = None,
|
1335
|
+
debug_dir: str | Path | None = None,
|
1336
|
+
) -> List[dict]:
|
1337
|
+
"""Extract all substrate scope data at once from all primary sources."""
|
1338
|
+
extraction_hints = ""
|
1339
|
+
all_refs = []
|
1340
|
+
|
1341
|
+
if locations:
|
1342
|
+
# Include ALL locations, not just primary
|
1343
|
+
location_strs = []
|
1344
|
+
for loc in locations[:3]: # Up to 3 locations
|
1345
|
+
loc_str = loc.get('location', '')
|
1346
|
+
location_strs.append(loc_str)
|
1347
|
+
all_refs.append(loc_str)
|
1348
|
+
|
1349
|
+
extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
|
1350
|
+
|
1351
|
+
# Collect all enzyme variants
|
1352
|
+
all_variants = []
|
1353
|
+
for loc in locations:
|
1354
|
+
variants = loc.get('enzyme_variants_tested', [])
|
1355
|
+
all_variants.extend(variants)
|
1356
|
+
|
1357
|
+
if all_variants:
|
1358
|
+
unique_variants = sorted(set(all_variants))  # deterministic order for the prompt
|
1359
|
+
extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
|
1360
|
+
|
1361
|
+
# Extract text from ALL identified locations
|
1362
|
+
extraction_texts = []
|
1363
|
+
|
1364
|
+
for ref in all_refs:
|
1365
|
+
if ref and pdf_paths:
|
1366
|
+
ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
|
1367
|
+
if ref_text:
|
1368
|
+
# Add figure image notation if available
|
1369
|
+
if figure_images and ref in figure_images:
|
1370
|
+
ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
|
1371
|
+
extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
|
1372
|
+
|
1373
|
+
if not extraction_texts:
|
1374
|
+
extraction_texts = [text[:50_000]]
|
1375
|
+
|
1376
|
+
extraction_text = "\n\n".join(extraction_texts)
|
1377
|
+
|
1378
|
+
prompt = _SUBSTRATE_SCOPE_PROMPT.format(extraction_hints=extraction_hints)
|
1379
|
+
prompt += "\n\nTEXT:\n" + extraction_text
|
1380
|
+
|
1381
|
+
# Prepare multimodal content with images
|
1382
|
+
content_parts = [prompt]
|
1383
|
+
|
1384
|
+
# Add figure images to the prompt
|
1385
|
+
if figure_images:
|
1386
|
+
import PIL.Image
|
1387
|
+
import io
|
1388
|
+
import base64
|
1389
|
+
|
1390
|
+
for fig_ref, fig_base64 in figure_images.items():
|
1391
|
+
try:
|
1392
|
+
# Convert base64 to PIL Image
|
1393
|
+
img_bytes = base64.b64decode(fig_base64)
|
1394
|
+
image = PIL.Image.open(io.BytesIO(img_bytes))
|
1395
|
+
content_parts.append(f"\n[Figure: {fig_ref}]")
|
1396
|
+
content_parts.append(image)
|
1397
|
+
log.info("Added figure %s to multimodal prompt", fig_ref)
|
1398
|
+
except Exception as e:
|
1399
|
+
log.warning("Failed to add figure %s: %s", fig_ref, e)
|
1400
|
+
|
1401
|
+
try:
|
1402
|
+
# Use multimodal content if we have images
|
1403
|
+
if len(content_parts) > 1:
|
1404
|
+
# Log multimodal API call
|
1405
|
+
log.info("=== GEMINI MULTIMODAL API CALL: SUBSTRATE_SCOPE_WITH_FIGURES ===")
|
1406
|
+
log.info("Text prompt length: %d characters", len(prompt))
|
1407
|
+
log.info("Number of images: %d", len(content_parts) - 1)
|
1408
|
+
log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
|
1409
|
+
|
1410
|
+
# Save prompt and image info to debug directory
|
1411
|
+
if debug_dir:
|
1412
|
+
debug_path = Path(debug_dir)
|
1413
|
+
debug_path.mkdir(parents=True, exist_ok=True)
|
1414
|
+
prompt_file = debug_path / f"substrate_scope_multimodal_prompt_{int(time.time())}.txt"
|
1415
|
+
|
1416
|
+
# Build prompt info including image references
|
1417
|
+
prompt_info = f"=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
|
1418
|
+
prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
|
1419
|
+
prompt_info += f"Text length: {len(prompt)} characters\n"
|
1420
|
+
prompt_info += f"Images included: {len(content_parts) - 1}\n"
|
1421
|
+
for fig_ref in figure_images.keys():
|
1422
|
+
prompt_info += f" - {fig_ref}\n"
|
1423
|
+
prompt_info += "="*80 + "\n\n"
|
1424
|
+
prompt_info += prompt
|
1425
|
+
|
1426
|
+
_dump(prompt_info, prompt_file)
|
1427
|
+
log.info("Full prompt saved to: %s", prompt_file)
|
1428
|
+
|
1429
|
+
log.info("Calling Gemini Multimodal API...")
|
1430
|
+
response = model.generate_content(content_parts)
|
1431
|
+
raw_text = _extract_text(response).strip()
|
1432
|
+
|
1433
|
+
# Log and save response
|
1434
|
+
log.info("Gemini multimodal response length: %d characters", len(raw_text))
|
1435
|
+
log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
|
1436
|
+
|
1437
|
+
if debug_dir:
|
1438
|
+
debug_path = Path(debug_dir)
|
1439
|
+
response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
|
1440
|
+
with open(response_file, 'w') as f:
|
1441
|
+
f.write(f"=== RESPONSE FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n")
|
1442
|
+
f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
1443
|
+
f.write(f"Length: {len(raw_text)} characters\n")
|
1444
|
+
f.write("="*80 + "\n\n")
|
1445
|
+
f.write(raw_text)
|
1446
|
+
log.info("Full response saved to: %s", response_file)
|
1447
|
+
|
1448
|
+
# Parse JSON from response
|
1449
|
+
import json
|
1450
|
+
data = json.loads(re.sub(r"^```(?:json)?\s*|\s*```$", "", raw_text.strip()))
|
1451
|
+
else:
|
1452
|
+
data = generate_json_with_retry(
|
1453
|
+
model,
|
1454
|
+
prompt,
|
1455
|
+
debug_dir=debug_dir,
|
1456
|
+
tag="substrate_scope",
|
1457
|
+
)
|
1458
|
+
|
1459
|
+
scope_data = data.get("substrate_scope_data", [])
|
1460
|
+
log.info("Extracted %d substrate scope entries", len(scope_data))
|
1461
|
+
return scope_data
|
1462
|
+
|
1463
|
+
except Exception as exc:
|
1464
|
+
log.error("Failed to extract substrate scope data: %s", exc)
|
1465
|
+
return []
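# Hedged sketch of the JSON this function expects back from the model. The field
# names mirror what _parse_scope_entries consumes below; the values are purely
# illustrative:
#   {
#     "substrate_scope_data": [
#       {
#         "enzyme_id": "P411-A1",
#         "substrate_ids": ["1a"],
#         "product_ids": ["2a"],
#         "cofactors": [{"name": "NADPH", "iupac_name": null, "role": "reductant"}],
#         "yield_percent": "85",
#         "ttn": "1200",
#         "ee_percent": "94",
#         "reaction_conditions": {"temperature": "25 C", "ph": "7.4",
#                                 "substrate_concentration": "10 mM",
#                                 "buffer": "KPi", "other_conditions": "anaerobic"},
#         "data_location": "Table S2",
#         "notes": ""
#       }
#     ]
#   }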
|
1466
|
+
|
1467
|
+
def _extract_single_reaction(
|
1468
|
+
text: str,
|
1469
|
+
model,
|
1470
|
+
enzyme_id: str,
|
1471
|
+
substrate_name: str,
|
1472
|
+
data_location: str,
|
1473
|
+
context_pairs: Optional[List[Tuple[str, str, str]]] = None,
|
1474
|
+
*,
|
1475
|
+
pdf_paths: Optional[List[Path]] = None,
|
1476
|
+
debug_dir: str | Path | None = None,
|
1477
|
+
) -> Optional[dict]:
|
1478
|
+
"""Extract data for a single enzyme-substrate pair."""
|
1479
|
+
# Build context
|
1480
|
+
context_info = ""
|
1481
|
+
if context_pairs:
|
1482
|
+
context_info = "\nCONTEXT - NEIGHBORING ENTRIES:\n"
|
1483
|
+
for ctx_enzyme, ctx_substrate, _ in context_pairs[:4]:
|
1484
|
+
if ctx_enzyme == enzyme_id and ctx_substrate != substrate_name:
|
1485
|
+
context_info += f"- {ctx_substrate} (same enzyme, different substrate)\n"
|
1486
|
+
|
1487
|
+
# Extract focused text for this specific reaction
|
1488
|
+
if data_location and pdf_paths:
|
1489
|
+
# Extract text around the data location and reaction conditions
|
1490
|
+
extraction_text = _extract_text_around_reference(pdf_paths, data_location, context_chars=2000)
|
1491
|
+
|
1492
|
+
# Also extract reaction conditions section if available
|
1493
|
+
conditions_sections = ["General procedure", "Reaction conditions", "Standard conditions"]
|
1494
|
+
conditions_text = _extract_sections_by_title(pdf_paths, conditions_sections, max_chars_per_section=2000)
|
1495
|
+
|
1496
|
+
if conditions_text:
|
1497
|
+
extraction_text += "\n\n=== REACTION CONDITIONS ===\n" + conditions_text
|
1498
|
+
else:
|
1499
|
+
extraction_text = text[:20_000]
|
1500
|
+
|
1501
|
+
prompt = _SINGLE_REACTION_PROMPT.format(
|
1502
|
+
enzyme_id=enzyme_id,
|
1503
|
+
substrate_name=substrate_name,
|
1504
|
+
data_location=data_location,
|
1505
|
+
context_info=context_info
|
1506
|
+
)
|
1507
|
+
prompt += "\n\nTEXT:\n" + extraction_text
|
1508
|
+
|
1509
|
+
try:
|
1510
|
+
return generate_json_with_retry(
|
1511
|
+
model,
|
1512
|
+
prompt,
|
1513
|
+
debug_dir=debug_dir,
|
1514
|
+
tag=f"reaction_{enzyme_id[:10]}_{substrate_name[:10]}",
|
1515
|
+
)
|
1516
|
+
except Exception as exc:
|
1517
|
+
log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
|
1518
|
+
return None
|
1519
|
+
|
1520
|
+
def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
|
1521
|
+
"""Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
|
1522
|
+
entries: List[ScopeEntry] = []
|
1523
|
+
|
1524
|
+
for item in data:
|
1525
|
+
try:
|
1526
|
+
# Parse substrate IDs
|
1527
|
+
substrates = []
|
1528
|
+
substrate_ids = item.get("substrate_ids") or []
|
1529
|
+
# Also handle old format
|
1530
|
+
if not substrate_ids and item.get("substrates"):
|
1531
|
+
substrates_data = item.get("substrates") or []
|
1532
|
+
for s in substrates_data:
|
1533
|
+
if isinstance(s, dict):
|
1534
|
+
substrate_ids.append(s.get("identifier") or s.get("name", ""))
|
1535
|
+
else:
|
1536
|
+
substrate_ids.append(str(s))
|
1537
|
+
|
1538
|
+
for sid in substrate_ids:
|
1539
|
+
# Look up IUPAC name
|
1540
|
+
iupac_name = None
|
1541
|
+
mapping = compound_mappings.get(str(sid).lower())
|
1542
|
+
if mapping:
|
1543
|
+
iupac_name = mapping.iupac_name
|
1544
|
+
|
1545
|
+
substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
|
1546
|
+
|
1547
|
+
# Parse product IDs
|
1548
|
+
products = []
|
1549
|
+
product_ids = item.get("product_ids") or []
|
1550
|
+
# Also handle old format
|
1551
|
+
if not product_ids and item.get("products"):
|
1552
|
+
products_data = item.get("products") or []
|
1553
|
+
for p in products_data:
|
1554
|
+
if isinstance(p, dict):
|
1555
|
+
product_ids.append(p.get("identifier") or p.get("name", ""))
|
1556
|
+
else:
|
1557
|
+
product_ids.append(str(p))
|
1558
|
+
|
1559
|
+
for pid in product_ids:
|
1560
|
+
# Look up IUPAC name
|
1561
|
+
iupac_name = None
|
1562
|
+
mapping = compound_mappings.get(str(pid).lower())
|
1563
|
+
if mapping:
|
1564
|
+
iupac_name = mapping.iupac_name
|
1565
|
+
|
1566
|
+
products.append(SubstrateProduct(name=str(pid), iupac_name=iupac_name))
|
1567
|
+
|
1568
|
+
# Parse cofactors
|
1569
|
+
cofactors = []
|
1570
|
+
cofactors_data = item.get("cofactors") or []
|
1571
|
+
for c in cofactors_data:
|
1572
|
+
if isinstance(c, dict):
|
1573
|
+
cofactors.append(Cofactor(
|
1574
|
+
name=c.get("name", ""),
|
1575
|
+
iupac_name=c.get("iupac_name"),
|
1576
|
+
role=c.get("role")
|
1577
|
+
))
|
1578
|
+
|
1579
|
+
# Parse conditions
|
1580
|
+
cond_data = item.get("reaction_conditions", {})
|
1581
|
+
conditions = ReactionConditions(
|
1582
|
+
temperature=cond_data.get("temperature"),
|
1583
|
+
ph=cond_data.get("ph"),
|
1584
|
+
substrate_concentration=cond_data.get("substrate_concentration"),
|
1585
|
+
buffer=cond_data.get("buffer"),
|
1586
|
+
other_conditions=cond_data.get("other_conditions")
|
1587
|
+
)
|
1588
|
+
|
1589
|
+
# Parse numeric values
|
1590
|
+
def parse_numeric(val):
|
1591
|
+
if not val or val in ["", "n.d.", "N/A", None]:
|
1592
|
+
return None
|
1593
|
+
try:
|
1594
|
+
# Extract numeric part
|
1595
|
+
match = re.search(r'(\d+\.?\d*)', str(val))
|
1596
|
+
return float(match.group(1)) if match else None
|
1597
|
+
except Exception:
|
1598
|
+
return None
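# Illustrative behaviour of parse_numeric on values typical of SI tables
# (hypothetical inputs): "85%" -> 85.0, "12.5 mM" -> 12.5, ">99" -> 99.0,
# "n.d." -> None, None -> None.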
|
1599
|
+
|
1600
|
+
# Parse ee - handle both percentage and ratio formats
|
1601
|
+
ee_value = item.get("ee_percent")
|
1602
|
+
if ee_value is None and item.get("ee"):
|
1603
|
+
# Try to extract from ratio format like "80:20 er"
|
1604
|
+
ee_str = str(item.get("ee"))
|
1605
|
+
match = re.search(r'(\d+):(\d+)', ee_str)
|
1606
|
+
if match:
|
1607
|
+
major = float(match.group(1))
|
1608
|
+
minor = float(match.group(2))
|
1609
|
+
# Convert ratio to ee%
|
1610
|
+
ee_value = abs(major - minor) / (major + minor) * 100 if (major + minor) else None
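# e.g. an er reported as "97:3" corresponds to ee = (97 - 3) / (97 + 3) * 100 = 94%.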
|
1611
|
+
|
1612
|
+
entry = ScopeEntry(
|
1613
|
+
enzyme_id=item.get("enzyme_id", ""),
|
1614
|
+
substrates=substrates,
|
1615
|
+
products=products,
|
1616
|
+
cofactors=cofactors,
|
1617
|
+
yield_percent=parse_numeric(item.get("yield_percent")),
|
1618
|
+
ttn=parse_numeric(item.get("ttn")),
|
1619
|
+
ee=parse_numeric(ee_value),
|
1620
|
+
conditions=conditions,
|
1621
|
+
data_location=item.get("data_location", ""),
|
1622
|
+
data_source_type={"all": "text/figure"},
|
1623
|
+
notes=item.get("notes", "")
|
1624
|
+
)
|
1625
|
+
|
1626
|
+
entries.append(entry)
|
1627
|
+
|
1628
|
+
except Exception as exc: # pragma: no cover
|
1629
|
+
log.debug("Skipping malformed scope entry %s: %s", item, exc)
|
1630
|
+
|
1631
|
+
return entries
|
1632
|
+
|
1633
|
+
# ---- 6.3 Public API -------------------------------------------------------
|
1634
|
+
|
1635
|
+
def get_substrate_scope(
|
1636
|
+
caption_text: str,
|
1637
|
+
full_text: str,
|
1638
|
+
model,
|
1639
|
+
*,
|
1640
|
+
pdf_paths: Optional[List[Path]] = None,
|
1641
|
+
debug_dir: str | Path | None = None,
|
1642
|
+
) -> List[ScopeEntry]:
|
1643
|
+
"""
|
1644
|
+
High-level wrapper used by the pipeline.
|
1645
|
+
|
1646
|
+
1. Use captions to identify substrate scope locations
|
1647
|
+
2. Identify sections containing IUPAC names
|
1648
|
+
3. Extract compound mappings from identified sections
|
1649
|
+
4. Identify enzyme-substrate pairs
|
1650
|
+
5. Extract individual reactions with context
|
1651
|
+
"""
|
1652
|
+
# Step 1: Find locations using captions
|
1653
|
+
locations = identify_scope_locations(caption_text, model, debug_dir=debug_dir)
|
1654
|
+
if locations:
|
1655
|
+
location_summary = []
|
1656
|
+
for loc in locations[:3]:
|
1657
|
+
location_summary.append(
|
1658
|
+
f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
|
1659
|
+
f"confidence: {loc.get('confidence', 0)})"
|
1660
|
+
)
|
1661
|
+
log.info("Identified %d substrate scope locations: %s",
|
1662
|
+
len(locations), ", ".join(location_summary))
|
1663
|
+
else:
|
1664
|
+
log.warning("No substrate scope locations identified")
|
1665
|
+
return []
|
1666
|
+
|
1667
|
+
# Step 2: Extract all substrate scope data first
|
1668
|
+
# (This gets us the compound IDs we need to map)
|
1669
|
+
time.sleep(2) # Rate limiting
|
1670
|
+
log.info("Extracting all substrate scope data from all identified sources...")
|
1671
|
+
|
1672
|
+
# Extract images for all figure locations
|
1673
|
+
figure_images = {}
|
1674
|
+
for loc in locations:
|
1675
|
+
location_str = loc.get('location', '')
|
1676
|
+
# Extract if it's marked as figure type OR if location contains "Figure" or "Fig"
|
1677
|
+
if pdf_paths and ('figure' in location_str.lower() or 'fig' in location_str.lower() or loc.get('type') == 'figure'):
|
1678
|
+
figure_ref = location_str
|
1679
|
+
confidence = loc.get('confidence', 0)
|
1680
|
+
log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, loc.get('type', 'unknown'))
|
1681
|
+
figure_image = extract_figure_image(pdf_paths, figure_ref)
|
1682
|
+
if figure_image:
|
1683
|
+
log.info("Successfully extracted figure image for %s (%d bytes)",
|
1684
|
+
figure_ref, len(figure_image))
|
1685
|
+
figure_images[figure_ref] = figure_image
|
1686
|
+
|
1687
|
+
# Save figure image if debug_dir is enabled
|
1688
|
+
if debug_dir:
|
1689
|
+
import base64
|
1690
|
+
debug_path = Path(debug_dir)
|
1691
|
+
image_path = debug_path / f"figure_image_{figure_ref.replace(' ', '_')}.png"
|
1692
|
+
with open(image_path, 'wb') as f:
|
1693
|
+
f.write(base64.b64decode(figure_image))
|
1694
|
+
log.info("Saved figure image to %s", image_path)
|
1695
|
+
else:
|
1696
|
+
log.warning("Failed to extract figure image for %s", figure_ref)
|
1697
|
+
|
1698
|
+
# Extract all substrate scope data in one call
|
1699
|
+
raw_entries = extract_all_substrate_scope_data(
|
1700
|
+
full_text, model, locations,
|
1701
|
+
pdf_paths=pdf_paths,
|
1702
|
+
figure_images=figure_images,
|
1703
|
+
debug_dir=debug_dir
|
1704
|
+
)
|
1705
|
+
|
1706
|
+
if not raw_entries:
|
1707
|
+
log.warning("No substrate scope data found")
|
1708
|
+
return []
|
1709
|
+
|
1710
|
+
# Step 3: Now identify IUPAC sections using SI TOC pages
|
1711
|
+
log.info("Identifying sections containing IUPAC names from SI table of contents...")
|
1712
|
+
iupac_sections = identify_iupac_sections(caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
|
1713
|
+
|
1714
|
+
# Step 4: Extract compound mappings from identified sections
|
1715
|
+
# Now we know which compound IDs to look for from the substrate scope data
|
1716
|
+
log.info("Extracting compound ID to IUPAC mappings...")
|
1717
|
+
|
1718
|
+
# Collect all compound IDs from substrate scope data
|
1719
|
+
all_compound_ids = set()
|
1720
|
+
for entry in raw_entries:
|
1721
|
+
substrate_ids = entry.get('substrate_ids') or []
|
1722
|
+
for sid in substrate_ids:
|
1723
|
+
all_compound_ids.add(str(sid))
|
1724
|
+
product_ids = entry.get('product_ids') or []
|
1725
|
+
for pid in product_ids:
|
1726
|
+
all_compound_ids.add(str(pid))
|
1727
|
+
|
1728
|
+
log.info("Found %d unique compound IDs to map: %s", len(all_compound_ids), sorted(all_compound_ids))
|
1729
|
+
|
1730
|
+
# Store figure images in the function for later use
|
1731
|
+
extract_compound_mappings._figure_images_cache = figure_images
|
1732
|
+
|
1733
|
+
compound_mappings = extract_compound_mappings(full_text, model,
|
1734
|
+
pdf_paths=pdf_paths,
|
1735
|
+
iupac_sections=iupac_sections,
|
1736
|
+
compound_ids=list(all_compound_ids),
|
1737
|
+
debug_dir=debug_dir)
|
1738
|
+
|
1739
|
+
# Step 5: Parse all entries with compound mappings
|
1740
|
+
entries = _parse_scope_entries(raw_entries, compound_mappings)
|
1741
|
+
log.info("Successfully parsed %d substrate scope entries", len(entries))
|
1742
|
+
|
1743
|
+
return entries
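# Hedged usage sketch (PDF paths are placeholders; get_model, limited_caption_concat
# and limited_concat are the same helpers used by run_pipeline below):
#   pdfs = [Path("manuscript.pdf"), Path("si.pdf")]
#   captions = limited_caption_concat(*pdfs)
#   full_text = limited_concat(*pdfs)
#   entries = get_substrate_scope(captions, full_text, get_model(),
#                                 pdf_paths=pdfs, debug_dir="debug/")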
|
1744
|
+
|
1745
|
+
# === 7. VALIDATION & MERGE ===
|
1746
|
+
"""Validation, duplicate detection, and merging with lineage data."""
|
1747
|
+
|
1748
|
+
def validate_scope_entries(entries: List[ScopeEntry]) -> List[str]:
|
1749
|
+
"""Validate for suspicious patterns like duplicate values."""
|
1750
|
+
warnings = []
|
1751
|
+
|
1752
|
+
# Track values
|
1753
|
+
ttn_values: Dict[float, List[str]] = {}
|
1754
|
+
yield_values: Dict[float, List[str]] = {}
|
1755
|
+
ee_values: Dict[float, List[str]] = {}
|
1756
|
+
|
1757
|
+
for entry in entries:
|
1758
|
+
substrate_name = entry.substrates[0].name if entry.substrates else "Unknown"
|
1759
|
+
key = f"{entry.enzyme_id}-{substrate_name}"
|
1760
|
+
|
1761
|
+
if entry.ttn is not None:
|
1762
|
+
if entry.ttn not in ttn_values:
|
1763
|
+
ttn_values[entry.ttn] = []
|
1764
|
+
ttn_values[entry.ttn].append(key)
|
1765
|
+
|
1766
|
+
if entry.yield_percent is not None:
|
1767
|
+
if entry.yield_percent not in yield_values:
|
1768
|
+
yield_values[entry.yield_percent] = []
|
1769
|
+
yield_values[entry.yield_percent].append(key)
|
1770
|
+
|
1771
|
+
if entry.ee is not None:
|
1772
|
+
if entry.ee not in ee_values:
|
1773
|
+
ee_values[entry.ee] = []
|
1774
|
+
ee_values[entry.ee].append(key)
|
1775
|
+
|
1776
|
+
# Check for suspicious duplicates
|
1777
|
+
for value, items in ttn_values.items():
|
1778
|
+
if len(items) > 1:
|
1779
|
+
warnings.append(f"Multiple entries have TTN={value}: {', '.join(items[:3])}")
|
1780
|
+
|
1781
|
+
for value, items in yield_values.items():
|
1782
|
+
if len(items) > 1:
|
1783
|
+
warnings.append(f"Multiple entries have yield={value}%: {', '.join(items[:3])}")
|
1784
|
+
|
1785
|
+
for value, items in ee_values.items():
|
1786
|
+
if len(items) > 1:
|
1787
|
+
warnings.append(f"Multiple entries have ee={value}%: {', '.join(items[:3])}")
|
1788
|
+
|
1789
|
+
if warnings:
|
1790
|
+
log.warning("Validation warnings found - possible extraction errors")
|
1791
|
+
for warning in warnings:
|
1792
|
+
log.warning(" %s", warning)
|
1793
|
+
|
1794
|
+
return warnings
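# A warning produced above looks like (hypothetical values; keys are
# "<enzyme_id>-<substrate>"):
#   "Multiple entries have TTN=1200.0: P411-A1-1a, P411-B2-1a"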
|
1795
|
+
|
1796
|
+
def merge_with_lineage(
|
1797
|
+
entries: List[ScopeEntry],
|
1798
|
+
lineage_csv: Optional[Path]
|
1799
|
+
) -> List[ScopeEntry]:
|
1800
|
+
"""Merge substrate scope entries with enzyme lineage data."""
|
1801
|
+
if not lineage_csv or not lineage_csv.exists():
|
1802
|
+
return entries
|
1803
|
+
|
1804
|
+
try:
|
1805
|
+
import pandas as pd
|
1806
|
+
lineage_df = pd.read_csv(lineage_csv)
|
1807
|
+
log.info("Loading lineage data from %s (%d enzymes)", lineage_csv, len(lineage_df))
|
1808
|
+
|
1809
|
+
# Create lookup map (case-insensitive)
|
1810
|
+
lineage_map = {}
|
1811
|
+
for _, row in lineage_df.iterrows():
|
1812
|
+
enzyme_id = str(row.get('enzyme_id', ''))
|
1813
|
+
lineage_map[enzyme_id.lower()] = {
|
1814
|
+
'parent_id': row.get('parent_id'),
|
1815
|
+
'mutations': row.get('mutations'),
|
1816
|
+
'generation': row.get('generation'),
|
1817
|
+
'aa_seq': row.get('aa_seq'),
|
1818
|
+
'dna_seq': row.get('dna_seq'),
|
1819
|
+
'confidence': row.get('confidence')
|
1820
|
+
}
|
1821
|
+
|
1822
|
+
# Merge
|
1823
|
+
merged_count = 0
|
1824
|
+
for entry in entries:
|
1825
|
+
key = entry.enzyme_id.lower()
|
1826
|
+
if key in lineage_map:
|
1827
|
+
data = lineage_map[key]
|
1828
|
+
entry.parent_id = data['parent_id']
|
1829
|
+
entry.mutations = data['mutations']
|
1830
|
+
entry.generation = data['generation']
|
1831
|
+
entry.aa_seq = data['aa_seq']
|
1832
|
+
entry.dna_seq = data['dna_seq']
|
1833
|
+
entry.confidence = data['confidence']
|
1834
|
+
merged_count += 1
|
1835
|
+
|
1836
|
+
log.info("Merged lineage data for %d/%d entries", merged_count, len(entries))
|
1837
|
+
|
1838
|
+
except Exception as exc:
|
1839
|
+
log.error("Failed to merge with lineage: %s", exc)
|
1840
|
+
|
1841
|
+
return entries
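# The lineage CSV is assumed (per the row.get calls above) to provide at least:
#   enzyme_id, parent_id, mutations, generation, aa_seq, dna_seq, confidence
# A minimal illustrative row (hypothetical values):
#   P411-B2,P411-A1,"A82L;F263L",2,MSTETL...,ATGTCT...,0.95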
|
1842
|
+
|
1843
|
+
# === 8. PIPELINE ORCHESTRATOR ===
|
1844
|
+
"""High-level function that ties everything together."""
|
1845
|
+
|
1846
|
+
import pandas as pd
|
1847
|
+
|
1848
|
+
def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
|
1849
|
+
"""Convert ScopeEntry objects to tidy DataFrame."""
|
1850
|
+
rows = []
|
1851
|
+
|
1852
|
+
for entry in entries:
|
1853
|
+
row = {
|
1854
|
+
'enzyme_id': entry.enzyme_id,
|
1855
|
+
'parent_enzyme_id': entry.parent_id or '',
|
1856
|
+
'mutations': entry.mutations or '',
|
1857
|
+
'generation': entry.generation if entry.generation is not None else '',
|
1858
|
+
'protein_sequence': entry.aa_seq or '',
|
1859
|
+
'nucleotide_sequence': entry.dna_seq or '',
|
1860
|
+
'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
|
1861
|
+
'flag': '',
|
1862
|
+
|
1863
|
+
'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
|
1864
|
+
'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
|
1865
|
+
'product_list': '; '.join(p.name for p in entry.products if p.name),
|
1866
|
+
'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
|
1867
|
+
|
1868
|
+
'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
|
1869
|
+
'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),
|
1870
|
+
'cofactor_roles': '; '.join(c.role or '' for c in entry.cofactors),
|
1871
|
+
|
1872
|
+
'yield': str(entry.yield_percent) if entry.yield_percent is not None else '',
|
1873
|
+
'ttn': str(entry.ttn) if entry.ttn is not None else '',
|
1874
|
+
'ee': str(entry.ee) if entry.ee is not None else '',
|
1875
|
+
|
1876
|
+
'reaction_temperature': entry.conditions.temperature or '',
|
1877
|
+
'reaction_ph': entry.conditions.ph or '',
|
1878
|
+
'reaction_substrate_concentration': entry.conditions.substrate_concentration or '',
|
1879
|
+
'reaction_buffer': entry.conditions.buffer or '',
|
1880
|
+
'reaction_other_conditions': entry.conditions.other_conditions or '',
|
1881
|
+
|
1882
|
+
'data_location': entry.data_location or ''
|
1883
|
+
}
|
1884
|
+
rows.append(row)
|
1885
|
+
|
1886
|
+
df = pd.DataFrame(rows)
|
1887
|
+
|
1888
|
+
# Define column order
|
1889
|
+
column_order = [
|
1890
|
+
'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
|
1891
|
+
'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
|
1892
|
+
'substrate_list', 'substrate_iupac_list',
|
1893
|
+
'product_list', 'product_iupac_list',
|
1894
|
+
'cofactor_list', 'cofactor_iupac_list', 'cofactor_roles',
|
1895
|
+
'yield', 'ttn', 'ee',
|
1896
|
+
'reaction_temperature', 'reaction_ph', 'reaction_substrate_concentration',
|
1897
|
+
'reaction_buffer', 'reaction_other_conditions',
|
1898
|
+
'data_location'
|
1899
|
+
]
|
1900
|
+
|
1901
|
+
# Ensure all columns exist
|
1902
|
+
for col in column_order:
|
1903
|
+
if col not in df.columns:
|
1904
|
+
df[col] = ''
|
1905
|
+
|
1906
|
+
# Reorder
|
1907
|
+
df = df[column_order]
|
1908
|
+
|
1909
|
+
return df
|
1910
|
+
|
1911
|
+
def run_pipeline(
|
1912
|
+
manuscript: Union[str, Path],
|
1913
|
+
si: Optional[Union[str, Path]] = None,
|
1914
|
+
output_csv: Optional[Union[str, Path]] = None,
|
1915
|
+
*,
|
1916
|
+
lineage_csv: Optional[Union[str, Path]] = None,
|
1917
|
+
debug_dir: str | Path | None = None,
|
1918
|
+
) -> pd.DataFrame:
|
1919
|
+
"""Execute the end-to-end substrate scope extraction pipeline.
|
1920
|
+
|
1921
|
+
Parameters
|
1922
|
+
----------
|
1923
|
+
manuscript : str | Path
|
1924
|
+
Path to the main PDF file.
|
1925
|
+
si : str | Path | None, optional
|
1926
|
+
Path to the Supplementary Information PDF, if available.
|
1927
|
+
output_csv : str | Path | None, optional
|
1928
|
+
If provided, the substrate scope table will be written here.
|
1929
|
+
lineage_csv : str | Path | None, optional
|
1930
|
+
Path to enzyme lineage CSV for sequence merging.
debug_dir : str | Path | None, optional
    Directory for intermediate artefacts (prompts and raw model replies).
|
1931
|
+
|
1932
|
+
Returns
|
1933
|
+
-------
|
1934
|
+
pandas.DataFrame
|
1935
|
+
One row per substrate-enzyme combination with all data.
|
1936
|
+
"""
|
1937
|
+
t0 = time.perf_counter()
|
1938
|
+
manuscript = Path(manuscript)
|
1939
|
+
si_path = Path(si) if si else None
|
1940
|
+
|
1941
|
+
# 1. Prepare raw text ------------------------------------------------------
|
1942
|
+
pdf_paths = [p for p in (manuscript, si_path) if p]
|
1943
|
+
caption_text = limited_caption_concat(*pdf_paths)
|
1944
|
+
full_text = limited_concat(*pdf_paths)
|
1945
|
+
|
1946
|
+
log.info("Loaded %d chars of captions and %d chars of full text",
|
1947
|
+
len(caption_text), len(full_text))
|
1948
|
+
|
1949
|
+
# 2. Connect to Gemini -----------------------------------------------------
|
1950
|
+
model = get_model()
|
1951
|
+
|
1952
|
+
# 3. Extract substrate scope -----------------------------------------------
|
1953
|
+
entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
|
1954
|
+
|
1955
|
+
if not entries:
|
1956
|
+
raise RuntimeError("Pipeline aborted: failed to extract any substrate scope data")
|
1957
|
+
|
1958
|
+
# 4. Merge with lineage if available ---------------------------------------
|
1959
|
+
if lineage_csv:
|
1960
|
+
entries = merge_with_lineage(entries, Path(lineage_csv))
|
1961
|
+
|
1962
|
+
# 5. Validate entries ------------------------------------------------------
|
1963
|
+
warnings = validate_scope_entries(entries)
|
1964
|
+
if warnings:
|
1965
|
+
log.warning("Found %d validation warnings", len(warnings))
|
1966
|
+
|
1967
|
+
# 6. Convert to DataFrame --------------------------------------------------
|
1968
|
+
df_final = _entries_to_dataframe(entries)
|
1969
|
+
|
1970
|
+
# 7. Write CSV if requested ------------------------------------------------
|
1971
|
+
if output_csv:
|
1972
|
+
output_path = Path(output_csv)
|
1973
|
+
df_final.to_csv(output_path, index=False)
|
1974
|
+
log.info(
|
1975
|
+
"Saved substrate scope CSV -> %s (%.1f kB)",
|
1976
|
+
output_path,
|
1977
|
+
output_path.stat().st_size / 1024,
|
1978
|
+
)
|
1979
|
+
|
1980
|
+
log.info(
|
1981
|
+
"Pipeline finished in %.2f s (entries: %d)",
|
1982
|
+
time.perf_counter() - t0,
|
1983
|
+
len(df_final),
|
1984
|
+
)
|
1985
|
+
return df_final
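# Hedged example of programmatic use (all paths are placeholders):
#   df = run_pipeline("manuscript.pdf", si="si.pdf",
#                     output_csv="substrate_scope.csv",
#                     lineage_csv="lineage.csv", debug_dir="debug/")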
|
1986
|
+
|
1987
|
+
# === 9. CLI ENTRYPOINT ===
|
1988
|
+
"""Simple argparse wrapper matching enzyme_lineage_extractor.py style."""
|
1989
|
+
|
1990
|
+
import argparse
|
1991
|
+
|
1992
|
+
# -- 9.1 Argument parser ----------------------------------------------------
|
1993
|
+
|
1994
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
|
1995
|
+
p = argparse.ArgumentParser(
|
1996
|
+
prog="substrate_scope_extractor",
|
1997
|
+
description="Extract substrate scope data from PDFs using Google Gemini",
|
1998
|
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
1999
|
+
)
|
2000
|
+
p.add_argument("--manuscript", required=True, help="Path to main manuscript PDF")
|
2001
|
+
p.add_argument("--si", help="Path to Supplementary Information PDF")
|
2002
|
+
p.add_argument("-o", "--output", help="CSV file for extracted data")
|
2003
|
+
p.add_argument("--lineage-csv", help="Path to enzyme lineage CSV for merging")
|
2004
|
+
p.add_argument(
|
2005
|
+
"-v",
|
2006
|
+
"--verbose",
|
2007
|
+
action="count",
|
2008
|
+
default=0,
|
2009
|
+
help="Increase verbosity; repeat (-vv) for DEBUG logging",
|
2010
|
+
)
|
2011
|
+
p.add_argument(
|
2012
|
+
"--debug-dir",
|
2013
|
+
metavar="DIR",
|
2014
|
+
help="Write ALL intermediate artefacts (prompts, raw Gemini replies) to DIR",
|
2015
|
+
)
|
2016
|
+
return p
|
2017
|
+
|
2018
|
+
# -- 9.2 main() -------------------------------------------------------------
|
2019
|
+
|
2020
|
+
def main(argv: Optional[List[str]] = None) -> None:
|
2021
|
+
parser = _build_arg_parser()
|
2022
|
+
args = parser.parse_args(argv)
|
2023
|
+
|
2024
|
+
# Configure logging early so everything respects the chosen level.
|
2025
|
+
level = logging.DEBUG if args.verbose >= 2 else logging.INFO if args.verbose else logging.WARNING
|
2026
|
+
logging.basicConfig(level=level, format="%(levelname)s: %(message)s")
|
2027
|
+
|
2028
|
+
run_pipeline(
|
2029
|
+
manuscript=args.manuscript,
|
2030
|
+
si=args.si,
|
2031
|
+
output_csv=args.output,
|
2032
|
+
lineage_csv=args.lineage_csv,
|
2033
|
+
debug_dir=args.debug_dir,
|
2034
|
+
)
|
2035
|
+
|
2036
|
+
if __name__ == "__main__":
|
2037
|
+
main()
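# Example invocation (sketch; the module path assumes the debase package layout):
#   python -m debase.substrate_scope_extractor --manuscript manuscript.pdf \
#       --si si.pdf -o substrate_scope.csv --lineage-csv lineage.csv \
#       --debug-dir debug/ -v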
|
2038
|
+
|
2039
|
+
# -------------------------------------------------------------------- end 9 ---
|