debase-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2039 @@
+ """substrate_scope_extractor.py
+
+ Single-file, maintainable CLI tool that extracts the *substrate scope* table
+ from one or two PDFs (manuscript + SI) using Google Gemini (or compatible).
+
+ The file mirrors the *section layout* and logging/debug philosophy of
+ `enzyme_lineage_extractor.py` so that both tools share a consistent developer
+ experience and can even live in the same package.
+
+ Navigate quickly by jumping to the numbered headers:
+
+ # === 1. CONFIG & CONSTANTS ===
+ # === 2. DOMAIN MODELS ===
+ # === 3. LOGGING HELPERS ===
+ # === 4. PDF HELPERS ===
+ # === 5. LLM (GEMINI) HELPERS ===
+ # === 6. SCOPE EXTRACTION ===
+ # === 7. VALIDATION & MERGE ===
+ # === 8. PIPELINE ORCHESTRATOR ===
+ # === 9. CLI ENTRYPOINT ===
+ """
+
+ # === 1. CONFIG & CONSTANTS ===
+ from __future__ import annotations
+
+ import os
+ import re
+ import json
+ import time
+ import logging
+ from pathlib import Path
+ from dataclasses import dataclass, field
+ from typing import List, Optional, Dict, Any, Union
+
+ MODEL_NAME: str = "gemini-2.5-flash"
+ MAX_CHARS: int = 150_000   # Max characters sent to LLM
+ BATCH_SIZE: int = 10       # Batch size when extracting reactions
+ MAX_RETRIES: int = 4       # LLM retry loop
+ CACHE_DIR: Path = Path.home() / ".cache" / "substrate_scope"
+
+ # Ensure the cache directory exists
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+ # === 2. DOMAIN MODELS ===
+ @dataclass
+ class SubstrateProduct:
+     """Chemical entity in a substrate scope reaction."""
+     name: str
+     iupac_name: Optional[str] = None
+
+ @dataclass
+ class Cofactor:
+     """Cofactor with optional IUPAC name and role."""
+     name: str
+     iupac_name: Optional[str] = None
+     role: Optional[str] = None
+
+ @dataclass
+ class ReactionConditions:
+     """Reaction conditions for substrate scope."""
+     temperature: Optional[str] = None
+     ph: Optional[str] = None
+     substrate_concentration: Optional[str] = None
+     buffer: Optional[str] = None
+     other_conditions: Optional[str] = None
+
+ @dataclass
+ class ScopeEntry:
+     """Single substrate scope reaction data point."""
+     enzyme_id: str
+     substrates: List[SubstrateProduct] = field(default_factory=list)
+     products: List[SubstrateProduct] = field(default_factory=list)
+     cofactors: List[Cofactor] = field(default_factory=list)
+
+     # Performance metrics
+     yield_percent: Optional[float] = None
+     ttn: Optional[float] = None
+     ee: Optional[float] = None
+
+     # Reaction conditions
+     conditions: ReactionConditions = field(default_factory=ReactionConditions)
+
+     # Metadata
+     data_location: Optional[str] = None
+     data_source_type: Dict[str, str] = field(default_factory=dict)
+
+     # Lineage information (populated during merge)
+     parent_id: Optional[str] = None
+     mutations: Optional[str] = None
+     generation: Optional[int] = None
+     aa_seq: Optional[str] = None
+     dna_seq: Optional[str] = None
+     confidence: Optional[float] = None
+     notes: str = ""
+
+ @dataclass
+ class CompoundMapping:
+     """Mapping between compound identifiers and IUPAC names."""
+     identifiers: List[str]
+     iupac_name: str
+     common_names: List[str] = field(default_factory=list)
+     compound_type: str = "unknown"
+     source_location: Optional[str] = None
+
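+ # A minimal illustration (not part of the pipeline) of how the domain models
+ # fit together; the enzyme and compound IDs below are hypothetical:
+ #
+ #     entry = ScopeEntry(
+ #         enzyme_id="VariantA",
+ #         substrates=[SubstrateProduct(name="6a")],
+ #         products=[SubstrateProduct(name="7a")],
+ #         yield_percent=53.0,
+ #         conditions=ReactionConditions(temperature="25 C", ph="7.0"),
+ #         data_location="Table S4",
+ #     )
+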
+ # === 3. LOGGING HELPERS ===
+
+ # --- Debug dump helper ----------------------------------------------------
+ def _dump(text: str | bytes, path: Path | str) -> None:
+     """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
+     p = Path(path)
+     p.parent.mkdir(parents=True, exist_ok=True)
+     mode = "wb" if isinstance(text, (bytes, bytearray)) else "w"
+     with p.open(mode) as fh:
+         fh.write(text)
+
+ def get_logger(name: str = __name__) -> logging.Logger:
+     logger = logging.getLogger(name)
+     if not logger.handlers:
+         handler = logging.StreamHandler()
+         fmt = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+         handler.setFormatter(logging.Formatter(fmt=fmt, datefmt="%Y-%m-%d %H:%M:%S"))
+         logger.addHandler(handler)
+     logger.setLevel(logging.INFO)
+     return logger
+
+ log = get_logger(__name__)
+
+ # === 4. PDF HELPERS ===
+ try:
+     import fitz  # PyMuPDF
+ except ImportError as exc:  # pragma: no cover
+     raise ImportError(
+         "PyMuPDF is required for PDF parsing. Install with `pip install pymupdf`."
+     ) from exc
+
+ from base64 import b64encode
+
+ # Caption prefix regex - intentionally permissive so it covers most journal
+ # variants: any line starting with Table, Figure, Fig, Scheme, Chart, etc.,
+ # including forms such as "Table S 2" or "Figure.".
+ _CAPTION_PREFIX_RE = re.compile(
+     r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|Supplementary\s+(?:Table|Figure)).*",
+     re.I | re.M,
+ )
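+
+ # Illustrative matches for the pattern above (with re.M, `^` anchors at each
+ # line start):
+ #     "Table S 2 ..."                 -> match
+ #     "Figure 3. Substrate scope"     -> match
+ #     "Extended Data Fig 1 ..."       -> match
+ #     "... see Figure 3 for details"  -> no match (not at line start)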
+
+ def _open_doc(pdf_path: str | Path | bytes):
+     if isinstance(pdf_path, (str, Path)):
+         return fitz.open(pdf_path)  # type: ignore[arg-type]
+     return fitz.open(stream=pdf_path, filetype="pdf")  # type: ignore[arg-type]
+
+ def extract_text(pdf_path: str | Path | bytes) -> str:
+     """Extract raw text from a PDF file (all blocks)."""
+     doc = _open_doc(pdf_path)
+     try:
+         return "\n".join(page.get_text() for page in doc)
+     finally:
+         doc.close()
+
+ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
+     """Extract figure/table captions using the caption prefix regex."""
+     doc = _open_doc(pdf_path)
+     captions: list[str] = []
+     try:
+         for page in doc:
+             page_dict = page.get_text("dict")
+             for block in page_dict.get("blocks", []):
+                 # Get all lines in this block
+                 block_lines = []
+                 for line in block.get("lines", []):
+                     text_line = "".join(span["text"] for span in line.get("spans", []))
+                     block_lines.append(text_line.strip())
+
+                 # Check if any line starts with a caption prefix
+                 for i, line in enumerate(block_lines):
+                     if _CAPTION_PREFIX_RE.match(line):
+                         # Found a caption start - collect lines until a blank
+                         # line or the next caption begins
+                         caption_parts = [line]
+                         for j in range(i + 1, len(block_lines)):
+                             next_line = block_lines[j]
+                             if not next_line:  # Empty line signals end
+                                 break
+                             if _CAPTION_PREFIX_RE.match(next_line):
+                                 break
+                             caption_parts.append(next_line)
+
+                         full_caption = " ".join(caption_parts)
+                         captions.append(full_caption)
+     finally:
+         doc.close()
+
+     joined = "\n".join(captions)
+     return joined[:max_chars]
+
+ def limited_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -> str:
+     """Concatenate **all text** from PDFs, trimmed to `max_chars`."""
+     total = 0
+     chunks: list[str] = []
+     for p in pdf_paths:
+         t = extract_text(p)
+         if total + len(t) > max_chars:
+             t = t[: max_chars - total]
+         chunks.append(t)
+         total += len(t)
+         if total >= max_chars:
+             break
+     return "\n".join(chunks)
+
+ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -> str:
+     """Concatenate caption text and the SI table of contents from PDFs, trimmed to `max_chars`."""
+     total = 0
+     chunks: list[str] = []
+
+     for idx, p in enumerate(pdf_paths):
+         # For the SI (second PDF), first extract the table-of-contents pages
+         if idx == 1:  # SI document
+             doc = _open_doc(p)
+             try:
+                 # Extract the first few pages, which typically contain the TOC
+                 toc_text = []
+                 for page_num in range(min(5, doc.page_count)):
+                     if total >= max_chars:
+                         break
+                     page = doc.load_page(page_num)
+                     page_text = page.get_text()
+
+                     # Look for TOC indicators
+                     if any(indicator in page_text.lower() for indicator in
+                            ['table of contents', 'supporting information', 'contents', 'page']):
+                         toc_text.append(f"\n[SI TOC Page {page_num + 1}]\n{page_text}")
+                         total += len(page_text)
+
+                 if toc_text:
+                     chunks.extend(toc_text)
+             finally:
+                 doc.close()
+
+         # Extract captions
+         if total < max_chars:
+             t = extract_captions(p)
+             if total + len(t) > max_chars:
+                 t = t[: max_chars - total]
+             chunks.append(t)
+             total += len(t)
+         if total >= max_chars:
+             break
+
+     return "\n".join(chunks)
+
+ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
+     """Extract figure as a page region when embedded images aren't available.
+
+     Args:
+         pdf_paths: List of PDF paths to search
+         figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")
+
+     Returns:
+         Base64-encoded PNG string or None if not found
+     """
+     if not pdf_paths:
+         return None
+
+     # Always extract the base figure number, removing sub-letters like (a), (b), c, etc.
+     # Match patterns like "Figure 1", "Figure 1c", "Figure 1(c)", "Fig. 1", etc.
+     base_figure_match = re.match(r'((?:Figure|Fig\.?)\s*\d+)', figure_ref, re.IGNORECASE)
+     if base_figure_match:
+         base_figure_ref = base_figure_match.group(1)
+         log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
+     else:
+         base_figure_ref = figure_ref
+
+     for pdf_path in pdf_paths:
+         doc = _open_doc(pdf_path)
+         try:
+             for page_num in range(doc.page_count):
+                 page = doc.load_page(page_num)
+                 page_text = page.get_text()
+
+                 # Check if this page contains the figure caption
+                 found = False
+                 caption_instances = None
+
+                 # Look for the figure caption
+                 variations = [
+                     f"{base_figure_ref}.",  # "Figure 1." - most reliable
+                     f"{base_figure_ref} ",  # "Figure 1 "
+                     base_figure_ref,
+                 ]
+
+                 for variation in variations:
+                     caption_instances = page.search_for(variation, quads=False)
+                     if caption_instances:
+                         # Check if this is likely a caption (not a reference in text)
+                         for rect in caption_instances:
+                             # Get text around this location
+                             x0, y0, x1, y1 = rect
+                             text_around = page.get_textbox(fitz.Rect(x0-50, y0-5, x1+300, y1+20))
+                             # Check if it looks like a figure caption
+                             if any(keyword in text_around.lower() for keyword in
+                                    ['directed evolution', 'substrate scope', '(a)', '(b)', '(c)']):
+                                 found = True
+                                 caption_rect = rect
+                                 break
+                     if found:
+                         break
+
+                 if not found:
+                     continue
+
+                 log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
+
+                 # Extract a region of the page above the caption.
+                 # The figure should be between the top of the viewable area and the caption.
+                 page_rect = page.rect
+
+                 # Define the region to extract: everything above the caption
+                 top_margin = 0     # Start from the very top of the page
+                 bottom_margin = 5  # Small margin above the caption
+                 left_margin = 0    # Use the full page width
+                 right_margin = 0
+
+                 # Calculate the figure region - everything from top to caption
+                 fig_top = top_margin
+                 fig_bottom = caption_rect.y0 - bottom_margin
+                 fig_left = left_margin
+                 fig_right = page_rect.width - right_margin
+
+                 # Create the clip rectangle
+                 clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
+
+                 # Extract the region as an image
+                 mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
+                 pix = page.get_pixmap(clip=clip_rect, matrix=mat)
+
+                 # Convert to PNG
+                 img_bytes = pix.tobytes("png")
+                 log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
+                          clip_rect.width * 2, clip_rect.height * 2, page_num + 1)
+
+                 return b64encode(img_bytes).decode()
+
+         finally:
+             doc.close()
+
+     log.warning("Could not find figure caption for '%s'", figure_ref)
+     return None
+
+
+ def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
+     """Extract text around a specific reference (e.g., 'Figure 3')."""
+     extracted_sections = []
+
+     # Try to extract the base figure/table reference
+     base_ref_match = re.match(r'((?:Figure|Fig|Table)\s*\d+)', ref, re.IGNORECASE)
+     base_ref = base_ref_match.group(1) if base_ref_match else ref
+
+     for pdf_path in pdf_paths:
+         doc = _open_doc(pdf_path)
+         try:
+             for page_num in range(doc.page_count):
+                 page = doc.load_page(page_num)
+                 page_text = page.get_text()
+
+                 # Try different variations of the reference
+                 ref_variations = [
+                     ref,                                     # Original (e.g., "Figure 3(a)")
+                     base_ref,                                # Base reference (e.g., "Figure 3")
+                     ref.replace("(", " ").replace(")", ""),  # "Figure 3 a"
+                     ref.replace("(", "").replace(")", ""),   # "Figure 3a"
+                     ref.replace("Figure", "Fig"),            # "Fig 3(a)"
+                     base_ref.replace("Figure", "Fig"),       # "Fig 3"
+                 ]
+
+                 # Find the reference in the page
+                 found = False
+                 pos = -1
+                 used_ref = ref
+
+                 for ref_var in ref_variations:
+                     if ref_var.lower() in page_text.lower():
+                         pos = page_text.lower().find(ref_var.lower())
+                         used_ref = ref_var
+                         found = True
+                         break
+
+                 if found and pos >= 0:
+                     # Extract context around it
+                     start = max(0, pos - context_chars)
+                     end = min(len(page_text), pos + len(used_ref) + context_chars)
+
+                     section = page_text[start:end]
+                     extracted_sections.append(
+                         f"\n=== Context around '{ref}' (found as '{used_ref}') in {pdf_path.name}, page {page_num + 1} ===\n{section}"
+                     )
+                     log.debug("Found '%s' as '%s' on page %d of %s", ref, used_ref, page_num + 1, pdf_path.name)
+         finally:
+             doc.close()
+
+     if not extracted_sections:
+         log.warning("Could not find reference '%s' or base reference '%s' in any PDF", ref, base_ref)
+
+     return "\n".join(extracted_sections)
+
+ def _extract_sections_by_title(pdf_paths: List[Path], section_titles: List[str], max_chars_per_section: int = 10000) -> str:
+     """Extract sections by their titles from PDFs."""
+     extracted_sections = []
+
+     for pdf_path in pdf_paths:
+         doc = _open_doc(pdf_path)
+         try:
+             # Build the full text with page markers
+             pages_text = []
+             for i, page in enumerate(doc):
+                 page_text = page.get_text()
+                 pages_text.append(f"\n[PAGE {i+1}]\n{page_text}")
+             full_text = "".join(pages_text)
+
+             for title in section_titles:
+                 # Find the section start
+                 title_pattern = re.escape(title)
+                 match = re.search(rf'{title_pattern}', full_text, re.IGNORECASE)
+
+                 if match:
+                     start_pos = match.start()
+
+                     # Find the page number of the nearest preceding page marker
+                     page_matches = re.findall(r'\[PAGE (\d+)\]', full_text[:start_pos])
+                     page_num = page_matches[-1] if page_matches else "unknown"
+
+                     # Try to find the next section header
+                     next_section_patterns = [
+                         r'\n[A-Z][A-Za-z\s]+:\s*\n',  # "Section Title:\n"
+                         r'\n\d+\.\s+[A-Z]',           # "1. Next Section"
+                         r'\n[A-Z]{2,}[A-Z\s]*\n',     # "SECTION HEADER\n"
+                         r'\nReferences\s*\n',
+                         r'\nAcknowledg',
+                         r'\n\[PAGE \d+\]',            # Next page
+                     ]
+
+                     end_pos = len(full_text)
+                     for pattern in next_section_patterns:
+                         next_match = re.search(pattern, full_text[start_pos + 100:], re.IGNORECASE)
+                         if next_match:
+                             end_pos = min(end_pos, start_pos + 100 + next_match.start())
+
+                     # Extract the section with a size limit
+                     section_text = full_text[start_pos:min(start_pos + max_chars_per_section, end_pos)]
+
+                     # Clean up page markers
+                     section_text = re.sub(r'\[PAGE \d+\]', '', section_text)
+
+                     extracted_sections.append(
+                         f"\n=== Section: '{title}' from {pdf_path.name} (starting page {page_num}) ===\n{section_text}"
+                     )
+                     log.info("Extracted section '%s' (%d chars) from %s",
+                              title, len(section_text), pdf_path.name)
+         finally:
+             doc.close()
+
+     return "\n".join(extracted_sections)
+
+ def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
+     """Extract text from a specific page number in the PDFs."""
+     # Convert the page number to an int and handle the S-prefix
+     page_str = str(page_num).strip().upper()
+     if page_str.startswith('S'):
+         # Supplementary page - look in the SI PDF (second PDF)
+         actual_page = int(page_str[1:]) - 1  # 0-indexed
+         pdf_index = 1 if len(pdf_paths) > 1 else 0
+     else:
+         # Regular page - look in the main PDF
+         actual_page = int(page_str) - 1  # 0-indexed
+         pdf_index = 0
+
+     if pdf_index >= len(pdf_paths):
+         log.warning("Page %s requested but not enough PDFs provided", page_str)
+         return ""
+
+     try:
+         doc = _open_doc(pdf_paths[pdf_index])
+         if 0 <= actual_page < len(doc):
+             page = doc[actual_page]
+             page_text = page.get_text()
+             doc.close()
+             log.info("Extracted %d chars from page %s of %s",
+                      len(page_text), page_str, pdf_paths[pdf_index].name)
+             return page_text
+         else:
+             log.warning("Page %s (index %d) out of range for %s (has %d pages)",
+                         page_str, actual_page, pdf_paths[pdf_index].name, len(doc))
+             doc.close()
+             return ""
+     except Exception as e:
+         log.error("Failed to extract page %s: %s", page_str, e)
+         return ""
+
+ def _extract_text_from_pages(pdf_paths: List[Path], page_nums: List[Union[str, int]], max_pages: int = 10) -> str:
+     """Extract text from multiple page numbers."""
+     all_text = []
+     pages_extracted = 0
+
+     for page_num in page_nums[:max_pages]:
+         page_text = _extract_text_from_page(pdf_paths, page_num)
+         if page_text:
+             all_text.append(f"\n[PAGE {page_num}]\n{page_text}")
+             pages_extracted += 1
+
+     if pages_extracted == 0:
+         log.warning("No pages extracted from requested pages: %s", page_nums[:5])
+     else:
+         log.info("Extracted text from %d pages", pages_extracted)
+     return "\n".join(all_text)
+
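+ # Page addressing sketch: an "S"-prefixed page reads from the SI PDF (the
+ # second path), while a plain number reads from the manuscript PDF. The Path
+ # objects here are hypothetical:
+ #
+ #     si_text = _extract_text_from_page([ms_pdf, si_pdf], "S12")  # SI page 12
+ #     ms_text = _extract_text_from_page([ms_pdf, si_pdf], 7)      # manuscript page 7
+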
+ # === 5. LLM (GEMINI) HELPERS === ---------------------------------------------
+ from typing import Tuple
+
+ _BACKOFF_BASE = 2.0  # exponential back-off base (seconds)
+
+ # -- 5.1 Import whichever SDK is installed -----------------------------------
+
+ def _import_gemini_sdk() -> Tuple[str, Any]:
+     """Return (flavor, module) where flavor in {"new", "legacy"}."""
+     try:
+         import google.generativeai as genai  # official SDK >= 1.0
+         return "new", genai
+     except ImportError:
+         try:
+             import google_generativeai as genai  # legacy prerelease name
+             return "legacy", genai
+         except ImportError as exc:
+             raise ImportError(
+                 "Neither 'google-generativeai' (>=1.0) nor 'google_generativeai'\n"
+                 "is installed. Run: pip install --upgrade google-generativeai"
+             ) from exc
+
+ _SDK_FLAVOR, _genai = _import_gemini_sdk()
+
+ # -- 5.2 Model factory --------------------------------------------------------
+
+ def get_model():
+     """Configure the API key and return a `GenerativeModel` instance."""
+     api_key = os.getenv("GEMINI_API_KEY")
+     if not api_key:
+         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
+     _genai.configure(api_key=api_key)
+     # Positional constructor arg works for both SDK flavors
+     return _genai.GenerativeModel(MODEL_NAME)
+
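+ # Sketch of the expected setup (the variable name is the one read above):
+ #
+ #     $ export GEMINI_API_KEY="<your key>"
+ #     model = get_model()  # returns a GenerativeModel for MODEL_NAME
+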
+ # === 5.3 Unified call helper ----------------------------------------------
+
+ def _extract_text(resp) -> str:
+     """
+     Pull the *first* textual part out of a GenerativeAI response, handling both
+     the old prerelease SDK and the >=1.0 SDK.
+
+     Falls back to ``str(resp)`` if no textual content is found.
+     """
+     # 1) Legacy SDK (<= 0.4) - still has the nice `.text`
+     if getattr(resp, "text", None):
+         return resp.text
+
+     # 2) >= 1.0 SDK
+     if getattr(resp, "candidates", None):
+         cand = resp.candidates[0]
+
+         # 2a) Some beta builds still expose `.text`
+         if getattr(cand, "text", None):
+             return cand.text
+
+         # 2b) Official path: candidate.content.parts[*].text
+         if getattr(cand, "content", None):
+             parts = [
+                 part.text  # Part objects have .text
+                 for part in cand.content.parts
+                 if getattr(part, "text", None)
+             ]
+             if parts:
+                 return "".join(parts)
+
+     # 3) As a last resort fall back to str()
+     return str(resp)
+
+ def generate_json_with_retry(
+     model,
+     prompt: str,
+     schema_hint: str | None = None,
+     *,
+     max_retries: int = MAX_RETRIES,
+     debug_dir: str | Path | None = None,
+     tag: str = 'gemini',
+ ):
+     """
+     Call Gemini with retries & exponential back-off, returning parsed JSON.
+
+     Also strips Markdown fences that the model may wrap around its JSON.
+     """
+     # Log prompt details
+     log.info("=== GEMINI API CALL: %s ===", tag.upper())
+     log.info("Prompt length: %d characters", len(prompt))
+     log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
+
+     # Save the full prompt to the debug directory if provided
+     if debug_dir:
+         debug_path = Path(debug_dir)
+         debug_path.mkdir(parents=True, exist_ok=True)
+         prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
+         with open(prompt_file, 'w') as f:
+             f.write(f"=== PROMPT FOR {tag.upper()} ===\n")
+             f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+             f.write(f"Length: {len(prompt)} characters\n")
+             f.write("="*80 + "\n\n")
+             f.write(prompt)
+         log.info("Full prompt saved to: %s", prompt_file)
+
+     fence_re = re.compile(r"```json|```", re.I)
+     for attempt in range(1, max_retries + 1):
+         try:
+             log.info("Calling Gemini API (attempt %d/%d)...", attempt, max_retries)
+             resp = model.generate_content(prompt)
+             raw = _extract_text(resp).strip()
+
+             # Log the response
+             log.info("Gemini response length: %d characters", len(raw))
+             log.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
+
+             # Save the full response to the debug directory
+             if debug_dir:
+                 response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+                 with open(response_file, 'w') as f:
+                     f.write(f"=== RESPONSE FOR {tag.upper()} ===\n")
+                     f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                     f.write(f"Length: {len(raw)} characters\n")
+                     f.write("="*80 + "\n\n")
+                     f.write(raw)
+                 log.info("Full response saved to: %s", response_file)
+
+             # Remove common Markdown fences
+             if raw.startswith("```"):
+                 raw = fence_re.sub("", raw).strip()
+
+             # Try to find JSON in the response.
+             # First, try to parse as-is
+             try:
+                 parsed = json.loads(raw)
+             except json.JSONDecodeError:
+                 # If that fails, look for a JSON array or object: find the
+                 # first '[' or '{' and the matching closing bracket
+                 json_start = -1
+                 json_end = -1
+                 bracket_stack = []
+                 in_string = False
+                 escape_next = False
+
+                 for i, char in enumerate(raw):
+                     if escape_next:
+                         escape_next = False
+                         continue
+
+                     if char == '\\':
+                         escape_next = True
+                         continue
+
+                     if char == '"' and not escape_next:
+                         in_string = not in_string
+                         continue
+
+                     if in_string:
+                         continue
+
+                     if char in '[{':
+                         if json_start == -1:
+                             json_start = i
+                         bracket_stack.append(char)
+                     elif char in ']}':
+                         if bracket_stack:
+                             opening = bracket_stack.pop()
+                             if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                                 if not bracket_stack:  # Found complete JSON
+                                     json_end = i + 1
+                                     break
+
+                 if json_start >= 0 and json_end > json_start:
+                     # Extract the JSON portion
+                     json_str = raw[json_start:json_end]
+                     parsed = json.loads(json_str)
+                 else:
+                     # Look for a bare [] in the response
+                     if '[]' in raw:
+                         parsed = []
+                     else:
+                         # No JSON structure found, re-raise the original error
+                         raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
+             log.info("Successfully parsed JSON response")
+             return parsed
+         except Exception as exc:  # broad except OK here
+             log.warning(
+                 "Gemini call failed (attempt %d/%d): %s",
+                 attempt, max_retries, exc,
+             )
+             if attempt == max_retries:
+                 raise
+             time.sleep(_BACKOFF_BASE ** attempt)
+ # -------------------------------------------------------------------- end 5 ---
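+
+ # Minimal usage sketch (the prompt and tag values are illustrative):
+ #
+ #     model = get_model()
+ #     locations = generate_json_with_retry(
+ #         model, prompt, tag="scope_locate", debug_dir="./debug"
+ #     )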
+
+ # === 6. SCOPE EXTRACTION ===
+ """
+ Substrate scope extraction with compound mapping, enzyme-substrate pairing,
+ and individual reaction extraction.
+ """
+
+ # ---- 6.1 Prompt templates -------------------------------------------------
+
+ _SCOPE_LOC_PROMPT = """
+ You are an expert reader of biocatalysis manuscripts.
+ Analyze this paper and identify all locations containing substrate scope data.
+
+ Your task is to:
+ 1. Identify all locations (tables, figures, text) containing substrate scope reaction data
+ 2. Distinguish substrate scope studies from model reactions used for evolution
+ 3. Determine which enzyme variants were tested in substrate scope studies
+ 4. Note if multiple substrates are tested with the same enzyme variant
+
+ Return your analysis as a JSON array (max {max_results} locations):
+ [
+   {{
+     "location": "e.g., SI Table 4, Figure 3, etc.",
+     "type": "table/figure/text",
+     "confidence": 0-100,
+     "reason": "why this contains substrate scope",
+     "enzyme_variants_tested": ["list", "of", "enzyme", "variants"],
+     "number_of_substrates": "approximate number"
+   }}
+ ]
+ """.strip()
+
+ _IUPAC_SECTION_PROMPT = """
+ Analyze the provided Supporting Information table of contents pages to identify sections containing compound IUPAC names.
+
+ Look for sections that contain:
+ 1. "Synthesis" of compounds with numbered identifiers
+ 2. "Characterization" data for compounds
+ 3. "General procedure" sections listing compounds
+ 4. "NMR spectra" or "MS data" sections
+
+ IMPORTANT:
+ - Return the EXACT page range where compounds are characterized
+ - Use the S prefix for supplementary pages (e.g., "S22-S30", not "22-30")
+ - Include both starting AND ending page numbers
+
+ Return JSON:
+ {
+   "iupac_sections": [
+     {
+       "section": "exact section title as written",
+       "page_range": "page range (e.g., 'S22-S45')",
+       "description": "what compounds are described"
+     }
+   ]
+ }
+ """.strip()
+
+ _COMPOUND_MAPPING_PROMPT = """
+ Extract compound identifiers and their chemical names EXACTLY as they appear in the text.
+
+ STRICT RULES:
+ 1. ONLY extract what is explicitly written in the text
+ 2. Look for patterns where compound IDs are paired with chemical names
+ 3. DO NOT infer, generate, or guess any chemical names
+ 4. If a compound ID appears without a chemical name, return null for iupac_name
+ 5. If a product was "not detected" or "not formed", return null for iupac_name
+
+ For each compound:
+ - identifier: The exact compound ID as written (e.g., "1", "2a", "SM-1")
+ - iupac_name: The chemical name if explicitly provided, otherwise null
+ - common_names: Any alternative names mentioned
+ - compound_type: substrate/product/reagent/catalyst/other
+ - source_location: The exact text excerpt where this information was found
+
+ Return as JSON:
+ {
+   "compound_mappings": [
+     {
+       "identifier": "string",
+       "iupac_name": "string or null",
+       "common_names": ["array of strings"],
+       "compound_type": "string",
+       "source_location": "string"
+     }
+   ]
+ }
+
+ Note: It is better to return null than to hallucinate or infer chemical structures.
+ """.strip()
+
+ _SUBSTRATE_SCOPE_PROMPT = """
+ Extract ALL substrate scope data from the primary sources in one complete extraction.
+ {extraction_hints}
+
+ For EACH reaction, extract:
+ 1. Enzyme variant ID
+ 2. Substrate identifiers (e.g., "6a", "5")
+ 3. Product identifiers (e.g., "7a", "7b", "7d", "7e") - ALWAYS include even if no yield
+ 4. Performance metrics (yield%, ee%, dr, TTN)
+ 5. Reaction conditions (temperature, pH, buffer, substrate concentrations - NOT dithionite/reducing agents)
+ 6. Data location (which figure/table this comes from)
+
+ CRITICAL - NO HALLUCINATION OR MODIFICATION:
+ - Extract values EXACTLY as written in the primary source - NO CHANGES WHATSOEVER
+ - DO NOT round, estimate, convert, or modify any numbers
+ - If the text shows "53%", report 53.0, not 53 or 53.00
+ - If the text shows "<5%", report exactly "<5" as a string in notes, yield_percent=null
+ - If the text shows "trace", report exactly "trace" in notes, yield_percent=null
+ - If the text shows "n.d.", report exactly "n.d." in notes, yield_percent=null
+ - If the text shows "80:20 er", calculate ee as 60.0 (|80-20|)
+ - If the text shows "91% ee", report ee_percent as 91.0
+ - If no value is shown, return null, not 0 or an empty string
+ - Extract ALL reactions from ALL identified locations
+ - Use compound identifiers EXACTLY as shown (not IUPAC names)
+ - For every entry, there needs to be an identifier for both substrates and products, even if the yield is null or the activity is 0
+ - Extract reaction conditions EXACTLY as written - NO PARAPHRASING
+ - IMPORTANT: Substrate concentration refers to the concentration of the actual chemical substrates being transformed in the reaction, NOT reducing agents (e.g., dithionite, NADH) or other additives
+
+ IMPORTANT: Each substrate should have a corresponding product identifier. Even when there is no yield, return the exact identifier as seen in the reaction.
+
+ Return as JSON:
+ {{
+   "substrate_scope_data": [
+     {{
+       "enzyme_id": "enzyme variant name",
+       "substrate_ids": ["list of substrate identifiers"],
+       "product_ids": ["list of product identifiers"],
+       "yield_percent": null or number,
+       "ee_percent": null or number,
+       "dr": "ratio if reported",
+       "ttn": null or number,
+       "reaction_conditions": {{
+         "temperature": "",
+         "ph": "",
+         "buffer": "",
+         "substrate_concentration": "concentration of actual substrates/reagents, NOT reducing agents like dithionite",
+         "other_conditions": "including enzyme loading, reducing agents (e.g., dithionite), time, etc."
+       }},
+       "data_location": "specific figure/table",
+       "notes": "any special notes (e.g., 'no product detected')"
+     }}
+   ]
+ }}
+ """.strip()
+
+
+ # ---- 6.2 Helper functions -------------------------------------------------
+
+ def identify_scope_locations(
+     text: str,
+     model,
+     *,
+     max_results: int = 5,
+     debug_dir: str | Path | None = None,
+ ) -> List[dict]:
+     """Ask Gemini where substrate scope data is located."""
+     prompt = _SCOPE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
+     locs: List[dict] = []
+     try:
+         locs = generate_json_with_retry(
+             model,
+             prompt,
+             debug_dir=debug_dir,
+             tag="scope_locate",
+         )
+     except Exception as exc:  # pragma: no cover
+         log.warning("identify_scope_locations(): %s", exc)
+     return locs if isinstance(locs, list) else []
+
+ def identify_iupac_sections(
+     text: str,
+     model,
+     *,
+     pdf_paths: Optional[List[Path]] = None,
+     debug_dir: str | Path | None = None,
+ ) -> List[dict]:
+     """Identify sections containing IUPAC names from the SI table of contents."""
+     # Extract only the SI TOC pages (first 5 pages of the SI)
+     si_toc_text = ""
+     if pdf_paths and len(pdf_paths) > 1:
+         si_pdf = pdf_paths[1]  # Second PDF is the SI
+         doc = _open_doc(si_pdf)
+         try:
+             for page_num in range(min(5, doc.page_count)):
+                 page = doc.load_page(page_num)
+                 page_text = page.get_text()
+                 si_toc_text += f"\n[SI Page {page_num + 1}]\n{page_text}"
+         finally:
+             doc.close()
+
+     if not si_toc_text:
+         # Fall back to caption text
+         si_toc_text = text[:15_000]
+
+     prompt = _IUPAC_SECTION_PROMPT + "\n\nTEXT:\n" + si_toc_text
+
+     try:
+         data = generate_json_with_retry(
+             model,
+             prompt,
+             debug_dir=debug_dir,
+             tag="iupac_sections",
+         )
+
+         sections = data.get("iupac_sections", []) if isinstance(data, dict) else []
+         log.info("Identified %d sections containing IUPAC names", len(sections))
+         return sections
+
+     except Exception as exc:
+         log.warning("Failed to identify IUPAC sections: %s", exc)
+         return []
+
+ def _extract_compound_mappings_from_text(
+     extraction_text: str,
+     model,
+     compound_ids: Optional[List[str]] = None,
+     debug_dir: str | Path | None = None,
+     tag_suffix: str = "",
+ ) -> Dict[str, CompoundMapping]:
+     """Helper function to extract compound mappings from the provided text."""
+     prompt = _COMPOUND_MAPPING_PROMPT
+     if compound_ids:
+         prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
+     prompt += "\n\nTEXT:\n" + extraction_text
+
+     tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"
+
+     try:
+         data = generate_json_with_retry(
+             model,
+             prompt,
+             debug_dir=debug_dir,
+             tag=tag,
+         )
+
+         mappings = {}
+         compound_mappings_data = data.get("compound_mappings") or []
+         for item in compound_mappings_data:
+             # Handle both the old format (an "identifiers" list) and the new
+             # format (a single "identifier" string)
+             identifiers = item.get("identifiers") or []
+             if not identifiers and item.get("identifier"):
+                 identifiers = [item.get("identifier")]
+
+             mapping = CompoundMapping(
+                 identifiers=identifiers,
+                 iupac_name=item.get("iupac_name", ""),
+                 common_names=item.get("common_names") or [],
+                 compound_type=item.get("compound_type", "unknown"),
+                 source_location=item.get("source_location")
+             )
+
+             # Create lookup entries for all identifiers and common names
+             for identifier in mapping.identifiers + mapping.common_names:
+                 if identifier:
+                     mappings[identifier.lower().strip()] = mapping
+
+         return mappings
+
+     except Exception as exc:
+         log.error("Failed to extract compound mappings: %s", exc)
+         return {}
+
+ def _extract_compound_mappings_with_figures(
+     text: str,
+     model,
+     compound_ids: List[str],
+     figure_images: Dict[str, str],
+     pdf_paths: List[Path],
+     debug_dir: str | Path | None = None,
+     tag_suffix: str = "",
+ ) -> Dict[str, CompoundMapping]:
+     """Extract compound mappings using a multimodal approach with figures."""
+     # Enhanced prompt for figure-based extraction
+     prompt = """You are an expert chemist analyzing chemical figures and manuscript text to identify compound IUPAC names.
+
+ TASK: Find the IUPAC names for these specific compound identifiers: """ + ", ".join(sorted(compound_ids)) + """
+
+ APPROACH (in order of preference):
+ 1. First, look for explicitly written IUPAC names in text or captions
+ 2. If not found, look for common/trivial names that you can convert to IUPAC
+ 3. As a last resort, carefully analyze chemical structures in figures to derive IUPAC names
+
+ CRITICAL ACCURACY REQUIREMENTS:
+ When deriving IUPAC names from structures:
+ - Count ALL atoms and bonds carefully - do not miss any substituents
+ - Verify the COMPLETE structure matches your IUPAC name
+ - For cyclopropanes: use "cyclopropane-1-carboxylate" NOT "cyclopropanecarboxylate"
+ - Include stereochemistry only if clearly shown (trans-, cis-, R/S)
+ - Double-check ring sizes, substituent positions, and functional groups
+ - If a structure is unclear or ambiguous, return null rather than guess
+
+ VALIDATION CHECKLIST before providing an IUPAC name:
+ □ Have I accounted for EVERY atom in the structure?
+ □ Have I identified ALL functional groups correctly?
+ □ Is the parent chain/ring correctly identified?
+ □ Are substituent positions numbered correctly?
+ □ Is the name formatted with proper punctuation (hyphens, commas)?
+ □ Would this IUPAC name regenerate EXACTLY the structure shown?
+
+ Common mistakes to avoid:
+ - Missing substituents (e.g., forgetting a methoxy group)
+ - Wrong ring size (e.g., calling a benzene ring a cyclohexane)
+ - Incorrect substituent positions
+ - Using "benzyl" vs "phenyl" incorrectly
+ - Missing or incorrect stereochemistry
+
+ Return as JSON:
+ {
+   "compound_mappings": [
+     {
+       "identifier": "compound identifier",
+       "iupac_name": "valid IUPAC systematic name or null if uncertain",
+       "common_names": ["common names found in text"],
+       "compound_type": "substrate/product/reagent",
+       "source_location": "where found/how determined"
+     }
+   ]
+ }
+
+ TEXT FROM MANUSCRIPT:
+ """ + text
+
+     # Prepare multimodal content
+     content_parts = [prompt]
+
+     # Add figure images
+     if figure_images:
+         import PIL.Image
+         import io
+         import base64
+
+         for fig_ref, fig_base64 in figure_images.items():
+             try:
+                 img_bytes = base64.b64decode(fig_base64)
+                 image = PIL.Image.open(io.BytesIO(img_bytes))
+                 content_parts.append(f"\n[Figure: {fig_ref}]")
+                 content_parts.append(image)
+                 log.info("Added figure %s to multimodal compound mapping", fig_ref)
+             except Exception as e:
+                 log.warning("Failed to add figure %s: %s", fig_ref, e)
+
+     tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"
+
+     try:
+         # Log the multimodal call
+         log.info("=== GEMINI MULTIMODAL API CALL: COMPOUND_MAPPING_WITH_FIGURES ===")
+         log.info("Text prompt length: %d characters", len(prompt))
+         log.info("Number of images: %d", len(content_parts) - 1)
+         log.info("Compounds to find: %s", ", ".join(sorted(compound_ids)))
+
+         # Save debug info
+         if debug_dir:
+             debug_path = Path(debug_dir)
+             debug_path.mkdir(parents=True, exist_ok=True)
+             prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
+             with open(prompt_file, 'w') as f:
+                 f.write(f"=== PROMPT FOR {tag.upper()} ===\n")
+                 f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                 f.write(f"Text length: {len(prompt)} characters\n")
+                 f.write(f"Images included: {len(content_parts) - 1}\n")
+                 for fig_ref in figure_images.keys():
+                     f.write(f"  - {fig_ref}\n")
+                 f.write("="*80 + "\n\n")
+                 f.write(prompt)
+             log.info("Full prompt saved to: %s", prompt_file)
+
+         # Make the multimodal API call
+         response = model.generate_content(content_parts)
+         raw_text = _extract_text(response).strip()
+
+         # Log the response
+         log.info("Gemini multimodal response length: %d characters", len(raw_text))
+
+         if debug_dir:
+             response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+             with open(response_file, 'w') as f:
+                 f.write(f"=== RESPONSE FOR {tag.upper()} ===\n")
+                 f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                 f.write(f"Length: {len(raw_text)} characters\n")
+                 f.write("="*80 + "\n\n")
+                 f.write(raw_text)
+             log.info("Full response saved to: %s", response_file)
+
+         # Parse JSON (strip any Markdown fences first)
+         raw_json = re.sub(r"```json|```", "", raw_text, flags=re.I).strip()
+         data = json.loads(raw_json)
+
+         mappings = {}
+         compound_mappings_data = data.get("compound_mappings") or []
+         for item in compound_mappings_data:
+             identifiers = item.get("identifiers") or []
+             if not identifiers and item.get("identifier"):
+                 identifiers = [item.get("identifier")]
+
+             mapping = CompoundMapping(
+                 identifiers=identifiers,
+                 iupac_name=item.get("iupac_name", ""),
+                 common_names=item.get("common_names") or [],
+                 compound_type=item.get("compound_type", "unknown"),
+                 source_location=item.get("source_location")
+             )
+
+             for identifier in mapping.identifiers + mapping.common_names:
+                 if identifier:
+                     mappings[identifier.lower().strip()] = mapping
+
+         return mappings
+
+     except Exception as exc:
+         log.error("Failed to extract compound mappings with figures: %s", exc)
+         return {}
+
+ def _extract_text_for_compound_mapping(
+     pdf_paths: List[Path],
+     iupac_sections: List[dict],
+     text_fallback: str,
+ ) -> str:
+     """Extract text from identified IUPAC sections for compound mapping."""
+     extraction_text = ""
+
+     if iupac_sections and pdf_paths:
+         log.info("Extracting text from %d identified IUPAC sections", len(iupac_sections))
+
+         # Use page-based extraction for each section
+         for section in iupac_sections:
+             section_title = section.get('section', '')
+             page_range = section.get('page_range') or section.get('page', '')
+
+             if page_range:
+                 log.info("Extracting section '%s' from page range %s", section_title, page_range)
+
+                 # Extract multiple pages starting from the given page
+                 pages_to_extract = []
+
+                 if '-' in str(page_range):
+                     # Handle ranges like "S22-S45"
+                     parts = page_range.split('-')
+                     start_page = parts[0].strip()
+                     end_page = parts[1].strip() if len(parts) > 1 else None
+
+                     # Extract all pages in the range
+                     if start_page.startswith('S') and end_page and end_page.startswith('S'):
+                         try:
+                             start_num = int(start_page[1:])
+                             end_num = int(end_page[1:])
+                             for i in range(start_num, min(end_num + 1, start_num + 15)):  # Max 15 pages
+                                 pages_to_extract.append(f"S{i}")
+                         except ValueError:
+                             pages_to_extract.append(start_page)
+                     else:
+                         pages_to_extract.append(start_page)
+                 else:
+                     # Single page - extract it plus the next 10 pages
+                     start_page = str(page_range).strip()
+
+                     # Ensure the S prefix for SI pages
+                     if not start_page.startswith('S') and start_page.isdigit():
+                         start_page = 'S' + start_page
+
+                     pages_to_extract.append(start_page)
+
+                     # Add the next 10 pages
+                     try:
+                         if start_page.startswith('S'):
+                             base_num = int(start_page[1:])
+                             for i in range(1, 11):  # Extract 10 more pages
+                                 pages_to_extract.append(f"S{base_num + i}")
+                         else:
+                             base_num = int(start_page)
+                             for i in range(1, 11):
+                                 pages_to_extract.append(str(base_num + i))
+                     except ValueError:
+                         pass
+
+                 # Extract the pages
+                 page_text = _extract_text_from_pages(pdf_paths, pages_to_extract, max_pages=15)
+                 if page_text:
+                     extraction_text += f"\n\n=== Section: '{section_title}' starting from page {page_range} ===\n{page_text}"
+             else:
+                 # Try title-based extraction as a fallback
+                 section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=10000)
+                 if section_text:
+                     extraction_text += section_text
+
+         if not extraction_text:
+             log.warning("No text extracted from IUPAC sections, falling back to limited text")
+             extraction_text = text_fallback[:30_000]
+     else:
+         # Fall back to limited text
+         extraction_text = text_fallback[:30_000]
+
+     return extraction_text
+
+ def extract_compound_mappings(
+     text: str,
+     model,
+     *,
+     pdf_paths: Optional[List[Path]] = None,
+     iupac_sections: Optional[List[dict]] = None,
+     compound_ids: Optional[List[str]] = None,
+     debug_dir: str | Path | None = None,
+ ) -> Dict[str, CompoundMapping]:
+     """Extract compound ID to IUPAC name mappings from identified sections.
+
+     Uses an adaptive strategy:
+     1. First attempts extraction from identified IUPAC sections
+     2. Checks for missing compounds
+     3. Expands the search to additional sections if compounds are missing
+     """
+     # Step 1: Extract text from the initially identified sections
+     extraction_text = _extract_text_for_compound_mapping(pdf_paths, iupac_sections, text)
+
+     # Step 2: First extraction attempt
+     mappings = _extract_compound_mappings_from_text(
+         extraction_text, model, compound_ids, debug_dir, tag_suffix="initial"
+     )
+     log.info("Initial extraction found %d compound mappings", len(mappings))
+
+     # Step 3: Check for missing compounds
+     missing_compounds = []
+     if compound_ids:
+         for cid in compound_ids:
+             mapping = mappings.get(cid.lower().strip())
+             if not mapping or not mapping.iupac_name:
+                 missing_compounds.append(cid)
+
+     # Step 4: Adaptive expansion if compounds are missing
+     if missing_compounds and pdf_paths:
+         log.info("Found %d compounds without IUPAC names: %s",
+                  len(missing_compounds), sorted(missing_compounds))
+         log.info("Expanding search to additional sections...")
+
+         # Additional sections that might contain compound definitions
+         additional_sections = [
+             "Engineering strategy",
+             "Screening for benzyl acrylate cyclopropanation",
+             "Evolution campaign",
+             "General procedure",
+             "Experimental procedures",
+             "Materials and methods",
+             "Substrate synthesis",
+         ]
+
+         # Extract text from the additional sections
+         additional_text = _extract_sections_by_title(
+             pdf_paths, additional_sections, max_chars_per_section=5000
+         )
+
+         if additional_text:
+             log.info("Extracted %d chars from additional sections", len(additional_text))
+
+             # Second extraction attempt with the expanded text
+             expanded_mappings = _extract_compound_mappings_from_text(
+                 additional_text, model, missing_compounds, debug_dir, tag_suffix="expanded"
+             )
+
+             # Merge the new mappings
+             new_found = 0
+             for key, mapping in expanded_mappings.items():
+                 if key not in mappings or not mappings[key].iupac_name:
+                     if mapping.iupac_name:  # Only add if we found an IUPAC name
+                         mappings[key] = mapping
+                         new_found += 1
+                         log.info("Found IUPAC name for '%s': %s",
+                                  key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
+
+             log.info("Expanded search found %d additional compound mappings", new_found)
+         else:
+             log.warning("No additional text found in expanded sections")
+
+     # Step 5: Check again for still-missing compounds
+     still_missing = []
+     for cid in missing_compounds:
+         mapping = mappings.get(cid.lower().strip())
+         if not mapping or not mapping.iupac_name:
+             still_missing.append(cid)
+
+     # Step 6: Final fallback - use figures and the full manuscript if
+     # compounds are still missing.
+     # COMMENTED OUT: Figure-based IUPAC extraction is unreliable; generating
+     # IUPAC names from visual structures leads to errors. Only use text-based
+     # extraction for reliability.
+
+     # if still_missing:
+     #     log.info("Still missing IUPAC names for %d compounds: %s",
+     #              len(still_missing), sorted(still_missing))
+     #     log.info("Attempting final extraction using figures and full manuscript...")
+     #
+     #     # Extract figure images if available
+     #     figure_images = {}
+     #     if hasattr(extract_compound_mappings, '_figure_images_cache'):
+     #         figure_images = extract_compound_mappings._figure_images_cache
+     #
+     #     # Use the multimodal approach with figures and manuscript text
+     #     final_mappings = _extract_compound_mappings_with_figures(
+     #         text[:50_000], model, still_missing, figure_images,
+     #         pdf_paths, debug_dir, tag_suffix="figures"
+     #     )
+     #
+     #     # Merge the final mappings
+     #     final_found = 0
+     #     for key, mapping in final_mappings.items():
+     #         if key not in mappings or not mappings[key].iupac_name:
+     #             if mapping.iupac_name:
+     #                 mappings[key] = mapping
+     #                 final_found += 1
+     #                 log.info("Found IUPAC name for '%s' using figures: %s",
+     #                          key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
+     #
+     #     log.info("Figure-based search found %d additional compound mappings", final_found)
+
+     if still_missing:
+         log.info("Still missing IUPAC names for %d compounds: %s",
+                  len(still_missing), sorted(still_missing))
+         log.info("Note: Figure-based IUPAC extraction is disabled for reliability")
+
+     log.info("Total compound mappings extracted: %d", len(mappings))
+     return mappings
+
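+ # Illustrative call of the adaptive mapping strategy above (the paths and
+ # compound IDs are hypothetical):
+ #
+ #     mappings = extract_compound_mappings(
+ #         full_text, model,
+ #         pdf_paths=[Path("manuscript.pdf"), Path("si.pdf")],
+ #         iupac_sections=sections, compound_ids=["1", "2a", "7a"],
+ #     )
+ #     iupac = mappings["2a"].iupac_name if "2a" in mappings else None
+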
+ def extract_all_substrate_scope_data(
+     text: str,
+     model,
+     locations: List[dict],
+     *,
+     pdf_paths: Optional[List[Path]] = None,
+     figure_images: Optional[Dict[str, str]] = None,
+     debug_dir: str | Path | None = None,
+ ) -> List[dict]:
+     """Extract all substrate scope data at once from all primary sources."""
+     extraction_hints = ""
+     all_refs = []
+
+     if locations:
+         # Include ALL locations, not just the primary one
+         location_strs = []
+         for loc in locations[:3]:  # Up to 3 locations
+             loc_str = loc.get('location', '')
+             location_strs.append(loc_str)
+             all_refs.append(loc_str)
+
+         extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
+
+         # Collect all enzyme variants
+         all_variants = []
+         for loc in locations:
+             variants = loc.get('enzyme_variants_tested', [])
+             all_variants.extend(variants)
+
+         if all_variants:
+             unique_variants = list(set(all_variants))
+             extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
+
+     # Extract text from ALL identified locations
+     extraction_texts = []
+
+     for ref in all_refs:
+         if ref and pdf_paths:
+             ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
+             if ref_text:
+                 # Add figure image notation if available
+                 if figure_images and ref in figure_images:
+                     ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
+                 extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
+
+     if not extraction_texts:
+         extraction_texts = [text[:50_000]]
+
+     extraction_text = "\n\n".join(extraction_texts)
+
+     prompt = _SUBSTRATE_SCOPE_PROMPT.format(extraction_hints=extraction_hints)
+     prompt += "\n\nTEXT:\n" + extraction_text
+
+     # Prepare multimodal content with images
+     content_parts = [prompt]
+
+     # Add figure images to the prompt
+     if figure_images:
+         import PIL.Image
+         import io
+         import base64
+
+         for fig_ref, fig_base64 in figure_images.items():
+             try:
+                 # Convert base64 to a PIL Image
+                 img_bytes = base64.b64decode(fig_base64)
+                 image = PIL.Image.open(io.BytesIO(img_bytes))
+                 content_parts.append(f"\n[Figure: {fig_ref}]")
+                 content_parts.append(image)
+                 log.info("Added figure %s to multimodal prompt", fig_ref)
+             except Exception as e:
+                 log.warning("Failed to add figure %s: %s", fig_ref, e)
+
+     try:
+         # Use multimodal content if we have images
+         if len(content_parts) > 1:
+             # Log the multimodal API call
+             log.info("=== GEMINI MULTIMODAL API CALL: SUBSTRATE_SCOPE_WITH_FIGURES ===")
+             log.info("Text prompt length: %d characters", len(prompt))
+             log.info("Number of images: %d", len(content_parts) - 1)
+             log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
+
+             # Save the prompt and image info to the debug directory
+             if debug_dir:
+                 debug_path = Path(debug_dir)
+                 debug_path.mkdir(parents=True, exist_ok=True)
+                 prompt_file = debug_path / f"substrate_scope_multimodal_prompt_{int(time.time())}.txt"
+
+                 # Build prompt info including image references
+                 prompt_info = "=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
+                 prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
+                 prompt_info += f"Text length: {len(prompt)} characters\n"
+                 prompt_info += f"Images included: {len(content_parts) - 1}\n"
+                 for fig_ref in figure_images.keys():
+                     prompt_info += f"  - {fig_ref}\n"
+                 prompt_info += "="*80 + "\n\n"
+                 prompt_info += prompt
+
+                 _dump(prompt_info, prompt_file)
+                 log.info("Full prompt saved to: %s", prompt_file)
+
+             log.info("Calling Gemini Multimodal API...")
+             response = model.generate_content(content_parts)
+             raw_text = _extract_text(response).strip()
+
+             # Log and save the response
+             log.info("Gemini multimodal response length: %d characters", len(raw_text))
+             log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
+
+             if debug_dir:
+                 debug_path = Path(debug_dir)
+                 response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
+                 with open(response_file, 'w') as f:
+                     f.write("=== RESPONSE FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n")
+                     f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                     f.write(f"Length: {len(raw_text)} characters\n")
+                     f.write("="*80 + "\n\n")
+                     f.write(raw_text)
+                 log.info("Full response saved to: %s", response_file)
+
+             # Parse JSON from the response (strip any Markdown fences first)
+             raw_json = re.sub(r"```json|```", "", raw_text, flags=re.I).strip()
+             data = json.loads(raw_json)
+         else:
+             data = generate_json_with_retry(
+                 model,
+                 prompt,
+                 debug_dir=debug_dir,
+                 tag="substrate_scope",
+             )
+
+         scope_data = data.get("substrate_scope_data", [])
+         log.info("Extracted %d substrate scope entries", len(scope_data))
+         return scope_data
+
+     except Exception as exc:
+         log.error("Failed to extract substrate scope data: %s", exc)
+         return []
+
+ def _extract_single_reaction(
+     text: str,
+     model,
+     enzyme_id: str,
+     substrate_name: str,
+     data_location: str,
+     context_pairs: Optional[List[Tuple[str, str, str]]] = None,
+     *,
+     pdf_paths: Optional[List[Path]] = None,
+     debug_dir: str | Path | None = None,
+ ) -> Optional[dict]:
+     """Extract data for a single enzyme-substrate pair."""
+     # Build context
+     context_info = ""
+     if context_pairs:
+         context_info = "\nCONTEXT - NEIGHBORING ENTRIES:\n"
+         for ctx_enzyme, ctx_substrate, _ in context_pairs[:4]:
+             if ctx_enzyme == enzyme_id and ctx_substrate != substrate_name:
+                 context_info += f"- {ctx_substrate} (same enzyme, different substrate)\n"
+
+     # Extract focused text for this specific reaction
+     if data_location and pdf_paths:
+         # Extract text around the data location and reaction conditions
+         extraction_text = _extract_text_around_reference(pdf_paths, data_location, context_chars=2000)
+
+         # Also extract the reaction conditions section if available
+         conditions_sections = ["General procedure", "Reaction conditions", "Standard conditions"]
+         conditions_text = _extract_sections_by_title(pdf_paths, conditions_sections, max_chars_per_section=2000)
+
+         if conditions_text:
+             extraction_text += "\n\n=== REACTION CONDITIONS ===\n" + conditions_text
+     else:
+         extraction_text = text[:20_000]
+
+     prompt = _SINGLE_REACTION_PROMPT.format(
+         enzyme_id=enzyme_id,
+         substrate_name=substrate_name,
+         data_location=data_location,
+         context_info=context_info
+     )
+     prompt += "\n\nTEXT:\n" + extraction_text
+
+     try:
+         return generate_json_with_retry(
+             model,
+             prompt,
+             debug_dir=debug_dir,
+             tag=f"reaction_{enzyme_id[:10]}_{substrate_name[:10]}",
+         )
+     except Exception as exc:
+         log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
+         return None
+
1520
+ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
1521
+ """Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
1522
+ entries: List[ScopeEntry] = []
1523
+
1524
+ for item in data:
1525
+ try:
1526
+ # Parse substrate IDs
1527
+ substrates = []
1528
+ substrate_ids = item.get("substrate_ids") or []
1529
+ # Also handle old format
1530
+ if not substrate_ids and item.get("substrates"):
1531
+ substrates_data = item.get("substrates") or []
1532
+ for s in substrates_data:
1533
+ if isinstance(s, dict):
1534
+ substrate_ids.append(s.get("identifier") or s.get("name", ""))
1535
+ else:
1536
+ substrate_ids.append(str(s))
1537
+
1538
+ for sid in substrate_ids:
1539
+ # Look up IUPAC name
1540
+ iupac_name = None
1541
+ mapping = compound_mappings.get(str(sid).lower())
1542
+ if mapping:
1543
+ iupac_name = mapping.iupac_name
1544
+
1545
+ substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
1546
+
1547
+ # Parse product IDs
1548
+ products = []
1549
+ product_ids = item.get("product_ids") or []
1550
+ # Also handle old format
1551
+ if not product_ids and item.get("products"):
1552
+ products_data = item.get("products") or []
1553
+ for p in products_data:
1554
+ if isinstance(p, dict):
1555
+ product_ids.append(p.get("identifier") or p.get("name", ""))
1556
+ else:
1557
+ product_ids.append(str(p))
1558
+
1559
+ for pid in product_ids:
1560
+ # Look up IUPAC name
1561
+ iupac_name = None
1562
+ mapping = compound_mappings.get(str(pid).lower())
1563
+ if mapping:
1564
+ iupac_name = mapping.iupac_name
1565
+
1566
+ products.append(SubstrateProduct(name=str(pid), iupac_name=iupac_name))
1567
+
1568
+ # Parse cofactors
1569
+ cofactors = []
1570
+ cofactors_data = item.get("cofactors") or []
1571
+ for c in cofactors_data:
1572
+ if isinstance(c, dict):
1573
+ cofactors.append(Cofactor(
1574
+ name=c.get("name", ""),
1575
+ iupac_name=c.get("iupac_name"),
1576
+ role=c.get("role")
1577
+ ))
1578
+
1579
+ # Parse conditions
1580
+ cond_data = item.get("reaction_conditions") or {} # tolerate explicit null
1581
+ conditions = ReactionConditions(
1582
+ temperature=cond_data.get("temperature"),
1583
+ ph=cond_data.get("ph"),
1584
+ substrate_concentration=cond_data.get("substrate_concentration"),
1585
+ buffer=cond_data.get("buffer"),
1586
+ other_conditions=cond_data.get("other_conditions")
1587
+ )
1588
+
1589
+ # Parse numeric values
1590
+ def parse_numeric(val):
1591
+ # Only genuinely missing markers map to None, so a real 0 survives
+ if val is None or str(val).strip() in {"", "n.d.", "N/A"}:
1592
+ return None
1593
+ try:
1594
+ # Extract numeric part
1595
+ match = re.search(r'(\d+\.?\d*)', str(val))
1596
+ return float(match.group(1)) if match else None
1597
+ except (TypeError, ValueError):
1598
+ return None
1599
+
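+ # Examples (given the regex above): parse_numeric("85%") -> 85.0;
+ # parse_numeric(">99") -> 99.0; parse_numeric("n.d.") -> None;
+ # parse_numeric(0) -> 0.0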
1600
+ # Parse ee - handle both percentage and ratio formats
1601
+ ee_value = item.get("ee_percent")
1602
+ if ee_value is None and item.get("ee"):
1603
+ # Try to extract from ratio format like "80:20 er"
1604
+ ee_str = str(item.get("ee"))
1605
+ match = re.search(r'(\d+):(\d+)', ee_str)
1606
+ if match:
1607
+ major = float(match.group(1))
1608
+ minor = float(match.group(2))
1609
+ # Convert er ratio to ee%, normalized so ratios that do not sum to
+ # 100 are handled too (e.g. "80:20" -> 60, "4:1" -> 60)
1610
+ ee_value = abs(major - minor) / (major + minor) * 100
1611
+
1612
+ entry = ScopeEntry(
1613
+ enzyme_id=item.get("enzyme_id", ""),
1614
+ substrates=substrates,
1615
+ products=products,
1616
+ cofactors=cofactors,
1617
+ yield_percent=parse_numeric(item.get("yield_percent")),
1618
+ ttn=parse_numeric(item.get("ttn")),
1619
+ ee=parse_numeric(ee_value),
1620
+ conditions=conditions,
1621
+ data_location=item.get("data_location", ""),
1622
+ data_source_type={"all": "text/figure"},
1623
+ notes=item.get("notes", "")
1624
+ )
1625
+
1626
+ entries.append(entry)
1627
+
1628
+ except Exception as exc: # pragma: no cover
1629
+ log.debug("Skipping malformed scope entry %s: %s", item, exc)
1630
+
1631
+ return entries
1632
+
1633
+ # ---- 6.3 Public API -------------------------------------------------------
1634
+
1635
+ def get_substrate_scope(
1636
+ caption_text: str,
1637
+ full_text: str,
1638
+ model,
1639
+ *,
1640
+ pdf_paths: Optional[List[Path]] = None,
1641
+ debug_dir: str | Path | None = None,
1642
+ ) -> List[ScopeEntry]:
1643
+ """
1644
+ High-level wrapper used by the pipeline.
1645
+
1646
+ 1. Use captions to identify substrate scope locations
1647
+ 2. Identify sections containing IUPAC names
1648
+ 3. Extract compound mappings from identified sections
1649
+ 4. Identify enzyme-substrate pairs
1650
+ 5. Extract individual reactions with context
1651
+ """
1652
+ # Step 1: Find locations using captions
1653
+ locations = identify_scope_locations(caption_text, model, debug_dir=debug_dir)
1654
+ if locations:
1655
+ location_summary = []
1656
+ for loc in locations[:3]:
1657
+ location_summary.append(
1658
+ f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
1659
+ f"confidence: {loc.get('confidence', 0)})"
1660
+ )
1661
+ log.info("Identified %d substrate scope locations: %s",
1662
+ len(locations), ", ".join(location_summary))
1663
+ else:
1664
+ log.warning("No substrate scope locations identified")
1665
+ return []
1666
+
1667
+ # Step 2: Extract all substrate scope data first
1668
+ # (This gets us the compound IDs we need to map)
1669
+ time.sleep(2) # Rate limiting
1670
+ log.info("Extracting all substrate scope data from all identified sources...")
1671
+
1672
+ # Extract images for all figure locations
1673
+ figure_images = {}
1674
+ for loc in locations:
1675
+ location_str = loc.get('location', '')
1676
+ # Extract if it's marked as figure type OR if the location mentions a
+ # figure ("fig" also matches "figure", so one substring test suffices)
1677
+ if pdf_paths and ('fig' in location_str.lower() or loc.get('type') == 'figure'):
1678
+ figure_ref = location_str
1679
+ confidence = loc.get('confidence', 0)
1680
+ log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, loc.get('type', 'unknown'))
1681
+ figure_image = extract_figure_image(pdf_paths, figure_ref)
1682
+ if figure_image:
1683
+ log.info("Successfully extracted figure image for %s (%d bytes)",
1684
+ figure_ref, len(figure_image))
1685
+ figure_images[figure_ref] = figure_image
1686
+
1687
+ # Save figure image if debug_dir is enabled
1688
+ if debug_dir:
1689
+ import base64
1690
+ debug_path = Path(debug_dir)
1691
+ image_path = debug_path / f"figure_image_{figure_ref.replace(' ', '_')}.png"
1692
+ with open(image_path, 'wb') as f:
1693
+ f.write(base64.b64decode(figure_image))
1694
+ log.info("Saved figure image to %s", image_path)
1695
+ else:
1696
+ log.warning("Failed to extract figure image for %s", figure_ref)
1697
+
1698
+ # Extract all substrate scope data in one call
1699
+ raw_entries = extract_all_substrate_scope_data(
1700
+ full_text, model, locations,
1701
+ pdf_paths=pdf_paths,
1702
+ figure_images=figure_images,
1703
+ debug_dir=debug_dir
1704
+ )
1705
+
1706
+ if not raw_entries:
1707
+ log.warning("No substrate scope data found")
1708
+ return []
1709
+
1710
+ # Step 3: Now identify IUPAC sections using SI TOC pages
1711
+ log.info("Identifying sections containing IUPAC names from SI table of contents...")
1712
+ iupac_sections = identify_iupac_sections(caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
1713
+
1714
+ # Step 4: Extract compound mappings from identified sections
1715
+ # Now we know which compound IDs to look for from the substrate scope data
1716
+ log.info("Extracting compound ID to IUPAC mappings...")
1717
+
1718
+ # Collect all compound IDs from substrate scope data
1719
+ all_compound_ids = set()
1720
+ for entry in raw_entries:
1721
+ substrate_ids = entry.get('substrate_ids') or []
1722
+ for sid in substrate_ids:
1723
+ all_compound_ids.add(str(sid))
1724
+ product_ids = entry.get('product_ids') or []
1725
+ for pid in product_ids:
1726
+ all_compound_ids.add(str(pid))
1727
+
1728
+ log.info("Found %d unique compound IDs to map: %s", len(all_compound_ids), sorted(all_compound_ids))
1729
+
1730
+ # Store figure images in the function for later use
1731
+ extract_compound_mappings._figure_images_cache = figure_images
1732
+
1733
+ compound_mappings = extract_compound_mappings(full_text, model,
1734
+ pdf_paths=pdf_paths,
1735
+ iupac_sections=iupac_sections,
1736
+ compound_ids=list(all_compound_ids),
1737
+ debug_dir=debug_dir)
1738
+
1739
+ # Step 5: Parse all entries with compound mappings
1740
+ entries = _parse_scope_entries(raw_entries, compound_mappings)
1741
+ log.info("Successfully parsed %d substrate scope entries", len(entries))
1742
+
1743
+ return entries
1744
+
1745
+ # === 7. VALIDATION & MERGE ===
1746
+ """Validation, duplicate detection, and merging with lineage data."""
1747
+
1748
+ def validate_scope_entries(entries: List[ScopeEntry]) -> List[str]:
1749
+ """Validate for suspicious patterns like duplicate values."""
1750
+ warnings = []
1751
+
1752
+ # Track values
1753
+ ttn_values: Dict[float, List[str]] = {}
1754
+ yield_values: Dict[float, List[str]] = {}
1755
+ ee_values: Dict[float, List[str]] = {}
1756
+
1757
+ for entry in entries:
1758
+ substrate_name = entry.substrates[0].name if entry.substrates else "Unknown"
1759
+ key = f"{entry.enzyme_id}-{substrate_name}"
1760
+
1761
+ if entry.ttn is not None:
1762
+ ttn_values.setdefault(entry.ttn, []).append(key)
1763
+
1764
+ if entry.yield_percent is not None:
1765
+ yield_values.setdefault(entry.yield_percent, []).append(key)
1766
+
1767
+ if entry.ee is not None:
1768
+ ee_values.setdefault(entry.ee, []).append(key)
1775
+
1776
+ # Check for suspicious duplicates
1777
+ for value, items in ttn_values.items():
1778
+ if len(items) > 1:
1779
+ warnings.append(f"Multiple entries have TTN={value}: {', '.join(items[:3])}")
1780
+
1781
+ for value, items in yield_values.items():
1782
+ if len(items) > 1:
1783
+ warnings.append(f"Multiple entries have yield={value}%: {', '.join(items[:3])}")
1784
+
1785
+ for value, items in ee_values.items():
1786
+ if len(items) > 1:
1787
+ warnings.append(f"Multiple entries have ee={value}%: {', '.join(items[:3])}")
1788
+
1789
+ if warnings:
1790
+ log.warning("Validation warnings found - possible extraction errors")
1791
+ for warning in warnings:
1792
+ log.warning(" %s", warning)
1793
+
1794
+ return warnings
1795
+
1796
+ def merge_with_lineage(
1797
+ entries: List[ScopeEntry],
1798
+ lineage_csv: Optional[Path]
1799
+ ) -> List[ScopeEntry]:
1800
+ """Merge substrate scope entries with enzyme lineage data."""
1801
+ if not lineage_csv or not lineage_csv.exists():
1802
+ return entries
1803
+
1804
+ try:
1805
+ import pandas as pd
1806
+ lineage_df = pd.read_csv(lineage_csv)
1807
+ log.info("Loading lineage data from %s (%d enzymes)", lineage_csv, len(lineage_df))
1808
+
1809
+ # Create lookup map (case-insensitive)
1810
+ lineage_map = {}
1811
+ for _, row in lineage_df.iterrows():
1812
+ enzyme_id = str(row.get('enzyme_id', ''))
1813
+ lineage_map[enzyme_id.lower()] = {
1814
+ 'parent_id': row.get('parent_id'),
1815
+ 'mutations': row.get('mutations'),
1816
+ 'generation': row.get('generation'),
1817
+ 'aa_seq': row.get('aa_seq'),
1818
+ 'dna_seq': row.get('dna_seq'),
1819
+ 'confidence': row.get('confidence')
1820
+ }
1821
+
1822
+ # Merge
1823
+ merged_count = 0
1824
+ for entry in entries:
1825
+ key = entry.enzyme_id.lower()
1826
+ if key in lineage_map:
1827
+ data = lineage_map[key]
1828
+ entry.parent_id = data['parent_id']
1829
+ entry.mutations = data['mutations']
1830
+ entry.generation = data['generation']
1831
+ entry.aa_seq = data['aa_seq']
1832
+ entry.dna_seq = data['dna_seq']
1833
+ entry.confidence = data['confidence']
1834
+ merged_count += 1
1835
+
1836
+ log.info("Merged lineage data for %d/%d entries", merged_count, len(entries))
1837
+
1838
+ except Exception as exc:
1839
+ log.error("Failed to merge with lineage: %s", exc)
1840
+
1841
+ return entries
1842
+
1843
+ # === 8. PIPELINE ORCHESTRATOR ===
1844
+ """High-level function that ties everything together."""
1845
+
1846
+ import pandas as pd
1847
+
1848
+ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
1849
+ """Convert ScopeEntry objects to tidy DataFrame."""
1850
+ rows = []
1851
+
1852
+ for entry in entries:
1853
+ row = {
1854
+ 'enzyme_id': entry.enzyme_id,
1855
+ 'parent_enzyme_id': entry.parent_id or '',
1856
+ 'mutations': entry.mutations or '',
1857
+ 'generation': entry.generation if entry.generation is not None else '',
1858
+ 'protein_sequence': entry.aa_seq or '',
1859
+ 'nucleotide_sequence': entry.dna_seq or '',
1860
+ 'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
1861
+ 'flag': '',
1862
+
1863
+ 'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
1864
+ 'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
1865
+ 'product_list': '; '.join(p.name for p in entry.products if p.name),
1866
+ 'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
1867
+
1868
+ 'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
1869
+ 'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),
1870
+ 'cofactor_roles': '; '.join(c.role or '' for c in entry.cofactors),
1871
+
1872
+ 'yield': str(entry.yield_percent) if entry.yield_percent is not None else '',
1873
+ 'ttn': str(entry.ttn) if entry.ttn is not None else '',
1874
+ 'ee': str(entry.ee) if entry.ee is not None else '',
1875
+
1876
+ 'reaction_temperature': entry.conditions.temperature or '',
1877
+ 'reaction_ph': entry.conditions.ph or '',
1878
+ 'reaction_substrate_concentration': entry.conditions.substrate_concentration or '',
1879
+ 'reaction_buffer': entry.conditions.buffer or '',
1880
+ 'reaction_other_conditions': entry.conditions.other_conditions or '',
1881
+
1882
+ 'data_location': entry.data_location or ''
1883
+ }
1884
+ rows.append(row)
1885
+
1886
+ df = pd.DataFrame(rows)
1887
+
1888
+ # Define column order
1889
+ column_order = [
1890
+ 'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
1891
+ 'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
1892
+ 'substrate_list', 'substrate_iupac_list',
1893
+ 'product_list', 'product_iupac_list',
1894
+ 'cofactor_list', 'cofactor_iupac_list', 'cofactor_roles',
1895
+ 'yield', 'ttn', 'ee',
1896
+ 'reaction_temperature', 'reaction_ph', 'reaction_substrate_concentration',
1897
+ 'reaction_buffer', 'reaction_other_conditions',
1898
+ 'data_location'
1899
+ ]
1900
+
1901
+ # Ensure all columns exist
1902
+ for col in column_order:
1903
+ if col not in df.columns:
1904
+ df[col] = ''
1905
+
1906
+ # Reorder
1907
+ df = df[column_order]
1908
+
1909
+ return df
1910
+
1911
+ def run_pipeline(
1912
+ manuscript: Union[str, Path],
1913
+ si: Optional[Union[str, Path]] = None,
1914
+ output_csv: Optional[Union[str, Path]] = None,
1915
+ *,
1916
+ lineage_csv: Optional[Union[str, Path]] = None,
1917
+ debug_dir: str | Path | None = None,
1918
+ ) -> pd.DataFrame:
1919
+ """Execute the end-to-end substrate scope extraction pipeline.
1920
+
1921
+ Parameters
1922
+ ----------
1923
+ manuscript : str | Path
1924
+ Path to the main PDF file.
1925
+ si : str | Path | None, optional
1926
+ Path to the Supplementary Information PDF, if available.
1927
+ output_csv : str | Path | None, optional
1928
+ If provided, the substrate scope table will be written here.
1929
+ lineage_csv : str | Path | None, optional
1930
+ Path to enzyme lineage CSV for sequence merging.
1931
+
1932
+ Returns
1933
+ -------
1934
+ pandas.DataFrame
1935
+ One row per substrate-enzyme combination with all data.
1936
+ """
1937
+ t0 = time.perf_counter()
1938
+ manuscript = Path(manuscript)
1939
+ si_path = Path(si) if si else None
1940
+
1941
+ # 1. Prepare raw text ------------------------------------------------------
1942
+ pdf_paths = [p for p in (manuscript, si_path) if p]
1943
+ caption_text = limited_caption_concat(*pdf_paths)
1944
+ full_text = limited_concat(*pdf_paths)
1945
+
1946
+ log.info("Loaded %d chars of captions and %d chars of full text",
1947
+ len(caption_text), len(full_text))
1948
+
1949
+ # 2. Connect to Gemini -----------------------------------------------------
1950
+ model = get_model()
1951
+
1952
+ # 3. Extract substrate scope -----------------------------------------------
1953
+ entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
1954
+
1955
+ if not entries:
1956
+ raise RuntimeError("Pipeline aborted: failed to extract any substrate scope data")
1957
+
1958
+ # 4. Merge with lineage if available ---------------------------------------
1959
+ if lineage_csv:
1960
+ entries = merge_with_lineage(entries, Path(lineage_csv))
1961
+
1962
+ # 5. Validate entries ------------------------------------------------------
1963
+ warnings = validate_scope_entries(entries)
1964
+ if warnings:
1965
+ log.warning("Found %d validation warnings", len(warnings))
1966
+
1967
+ # 6. Convert to DataFrame --------------------------------------------------
1968
+ df_final = _entries_to_dataframe(entries)
1969
+
1970
+ # 7. Write CSV if requested ------------------------------------------------
1971
+ if output_csv:
1972
+ output_path = Path(output_csv)
1973
+ df_final.to_csv(output_path, index=False)
1974
+ log.info(
1975
+ "Saved substrate scope CSV -> %s (%.1f kB)",
1976
+ output_path,
1977
+ output_path.stat().st_size / 1024,
1978
+ )
1979
+
1980
+ log.info(
1981
+ "Pipeline finished in %.2f s (entries: %d)",
1982
+ time.perf_counter() - t0,
1983
+ len(df_final),
1984
+ )
1985
+ return df_final
1986
+
1987
+ # === 9. CLI ENTRYPOINT ===
1988
+ """Simple argparse wrapper matching enzyme_lineage_extractor.py style."""
1989
+
1990
+ import argparse
1991
+
1992
+ # -- 9.1 Argument parser ----------------------------------------------------
1993
+
1994
+ def _build_arg_parser() -> argparse.ArgumentParser:
1995
+ p = argparse.ArgumentParser(
1996
+ prog="substrate_scope_extractor",
1997
+ description="Extract substrate scope data from PDFs using Google Gemini",
1998
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
1999
+ )
2000
+ p.add_argument("--manuscript", required=True, help="Path to main manuscript PDF")
2001
+ p.add_argument("--si", help="Path to Supplementary Information PDF")
2002
+ p.add_argument("-o", "--output", help="CSV file for extracted data")
2003
+ p.add_argument("--lineage-csv", help="Path to enzyme lineage CSV for merging")
2004
+ p.add_argument(
2005
+ "-v",
2006
+ "--verbose",
2007
+ action="count",
2008
+ default=0,
2009
+ help="Increase verbosity; repeat (-vv) for DEBUG logging",
2010
+ )
2011
+ p.add_argument(
2012
+ "--debug-dir",
2013
+ metavar="DIR",
2014
+ help="Write ALL intermediate artefacts (prompts, raw Gemini replies) to DIR",
2015
+ )
2016
+ return p
2017
+
2018
+ # -- 9.2 main() -------------------------------------------------------------
2019
+
2020
+ def main(argv: Optional[List[str]] = None) -> None:
2021
+ parser = _build_arg_parser()
2022
+ args = parser.parse_args(argv)
2023
+
2024
+ # Configure logging early so everything respects the chosen level.
2025
+ level = logging.DEBUG if args.verbose >= 2 else logging.INFO if args.verbose else logging.WARNING
2026
+ logging.basicConfig(level=level, format="%(levelname)s: %(message)s")
2027
+
2028
+ run_pipeline(
2029
+ manuscript=args.manuscript,
2030
+ si=args.si,
2031
+ output_csv=args.output,
2032
+ lineage_csv=args.lineage_csv,
2033
+ debug_dir=args.debug_dir,
2034
+ )
2035
+
2036
+ if __name__ == "__main__":
2037
+ main()
2038
+
2039
+ # -------------------------------------------------------------------- end 9 ---