debase 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
"""reaction_info_extractor_clean.py

Single-file, maintainable CLI tool that pulls **enzyme-reaction performance data**
from chemistry PDFs using Google Gemini (text-only *and* vision) - now with
**true figure-image extraction** mirroring the enzyme-lineage workflow.

Key June 2025 additions
=======================
1. **Figure image helper** - locates the figure caption, then exports the first
   image **above** that caption using PyMuPDF (fitz). This PNG is sent to
   Gemini Vision for metric extraction.
2. **GeminiClient.generate()** now accepts an optional `image_b64` arg and
   automatically switches to a *vision* invocation when provided.
3. **extract_metrics_for_enzyme()** chooses between three tiers:

   * *Table*  -> caption + following rows (text-only)
   * *Figure* -> image bytes (vision) *or* caption fallback
   * *Other*  -> page-level text

   If the vision route fails (no JSON), it gracefully falls back to caption
   text so the pipeline never crashes.
"""
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import logging
29
+ import os
30
+ import re
31
+ import sys
32
+ import time
33
+ from base64 import b64encode, b64decode
34
+ from dataclasses import dataclass, field
35
+ from pathlib import Path
36
+ from textwrap import dedent
37
+ from typing import Any, Dict, List, Optional, Tuple
38
+
39
+ import fitz # PyMuPDF - for image extraction
40
+ import google.generativeai as genai # type: ignore
41
+ import pandas as pd
42
+ from PyPDF2 import PdfReader
43
+ import PIL.Image
44
+ import io
45
+
###############################################################################
# 1 - CONFIG & CONSTANTS
###############################################################################

@dataclass
class Config:
    """Centralised tunables so tests can override them easily."""

    model_name: str = "gemini-1.5-pro-latest"
    location_temperature: float = 0.2
    extract_temperature: float = 0.0
    model_reaction_temperature: float = 0.0
    top_p: float = 1.0
    max_tokens: int = 4096
    pdf_cache_size: int = 8
    retries: int = 2

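# A minimal sketch of how tests might override the defaults above (values
# illustrative):
#     cfg = Config(model_name="gemini-1.5-flash-latest", retries=1)
#     extractor = ReactionExtractor(Path("paper.pdf"), None, cfg)
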
@dataclass
class CompoundMapping:
    """Mapping between compound identifiers and IUPAC names."""
    identifiers: List[str]
    iupac_name: str
    common_names: List[str] = field(default_factory=list)
    compound_type: str = "unknown"
    source_location: Optional[str] = None

###############################################################################
# 2 - LOGGING
###############################################################################

LOGGER = logging.getLogger("reaction_info_extractor")
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(levelname)s [%(name)s] %(message)s"))
LOGGER.addHandler(handler)
LOGGER.setLevel(logging.INFO)

# --- Debug dump helper ----------------------------------------------------
def _dump(text: str | bytes, path: Path | str) -> None:
    """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    mode = "wb" if isinstance(text, (bytes, bytearray)) else "w"
    with p.open(mode) as fh:
        fh.write(text)

###############################################################################
# 3 - PDF UTILITIES
###############################################################################

def extract_text_by_page(path: Optional[Path]) -> List[str]:
    if path is None:
        return []
    reader = PdfReader(str(path))
    pages: List[str] = []
    for page in reader.pages:
        try:
            pages.append(page.extract_text() or "")
        except Exception as exc:  # pragma: no cover
            LOGGER.warning("PyPDF2 failed on a page: %s", exc)
            pages.append("")
    return pages

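# Usage sketch (paths illustrative): extract_text_by_page(Path("paper.pdf"))
# returns one string per page; a missing SI is simply
# extract_text_by_page(None) -> [].
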
###############################################################################
# 4 - GEMINI WRAPPER (text & vision)
###############################################################################

def get_model(cfg: Config):
    """Configure API key and return a `GenerativeModel` instance."""
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(cfg.model_name)

def generate_json_with_retry(
    model,
    prompt: str,
    schema_hint: str | None = None,
    *,
    max_retries: int = 2,
    temperature: float = 0.0,
    debug_dir: str | Path | None = None,
    tag: str = 'gemini',
    image_b64: Optional[str] = None,
):
    """Call Gemini with retries & exponential back-off, returning parsed JSON."""
    # Log prompt details
    LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
    LOGGER.info("Prompt length: %d characters", len(prompt))
    LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])

    # Save full prompt to debug directory if provided
    debug_path = None
    if debug_dir:
        debug_path = Path(debug_dir)
        debug_path.mkdir(parents=True, exist_ok=True)
        prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
        _dump(
            f"=== PROMPT FOR {tag.upper()} ===\n"
            f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
            f"Length: {len(prompt)} characters\n{'=' * 80}\n\n{prompt}",
            prompt_file,
        )
        LOGGER.info("Full prompt saved to: %s", prompt_file)

    fence_re = re.compile(r"```json|```", re.I)
    for attempt in range(1, max_retries + 1):
        try:
            LOGGER.info("Calling Gemini API (attempt %d/%d)...", attempt, max_retries)

            # Handle image if provided
            if image_b64:
                parts = [prompt, {"mime_type": "image/png", "data": image_b64}]
            else:
                parts = [prompt]

            resp = model.generate_content(
                parts,
                generation_config={
                    "temperature": temperature,
                    "max_output_tokens": 4096,
                },
            )
            raw = resp.text.strip()

            # Log response
            LOGGER.info("Gemini response length: %d characters", len(raw))
            LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])

            # Save full response to debug directory
            if debug_path:
                response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
                _dump(
                    f"=== RESPONSE FOR {tag.upper()} ===\n"
                    f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
                    f"Length: {len(raw)} characters\n{'=' * 80}\n\n{raw}",
                    response_file,
                )
                LOGGER.info("Full response saved to: %s", response_file)

            # Remove common Markdown fences
            if raw.startswith("```"):
                raw = fence_re.sub("", raw).strip()

            # Try to parse as-is first; if that fails, scan for the first
            # complete JSON array or object in the response (tracking strings
            # and escapes so brackets inside quoted text are ignored).
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                json_start = -1
                json_end = -1
                bracket_stack = []
                in_string = False
                escape_next = False

                for i, char in enumerate(raw):
                    if escape_next:
                        escape_next = False
                        continue
                    if char == '\\':
                        escape_next = True
                        continue
                    if char == '"':
                        in_string = not in_string
                        continue
                    if in_string:
                        continue

                    if char in '[{':
                        if json_start == -1:
                            json_start = i
                        bracket_stack.append(char)
                    elif char in ']}':
                        if bracket_stack:
                            opening = bracket_stack.pop()
                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
                                if not bracket_stack:  # Found complete JSON
                                    json_end = i + 1
                                    break

                if json_start >= 0 and json_end > json_start:
                    # Extract the JSON portion
                    parsed = json.loads(raw[json_start:json_end])
                elif '[]' in raw:
                    # The model indicated an empty result
                    parsed = []
                else:
                    # No JSON structure found, re-raise the original error
                    raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
            LOGGER.info("Successfully parsed JSON response")
            return parsed
        except Exception as exc:
            LOGGER.warning(
                "Gemini call failed (attempt %d/%d): %s",
                attempt, max_retries, exc,
            )
            if attempt == max_retries:
                raise
            time.sleep(2 ** attempt)

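# A minimal usage sketch (mirrors the calls ReactionExtractor makes below;
# the `captions` variable is illustrative):
#     model = get_model(Config())
#     locations = generate_json_with_retry(
#         model,
#         PROMPT_FIND_LOCATIONS + "\n\n" + captions,
#         temperature=0.2,
#         tag="find_locations",
#     )
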
###############################################################################
# 5 - PROMPTS (unchanged except for brevity)
###############################################################################

PROMPT_FIND_LOCATIONS = dedent("""
You are an expert reader of protein engineering manuscripts.
Given the following article captions and section titles, identify ALL locations
(tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
activity, etc.) for enzyme variants.

IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
performance data locations. Pay careful attention to:
- The caption text to identify which campaign/lineage the data is for
- Enzyme name prefixes (e.g., PYS vs INS) that indicate different campaigns
- Different substrate/product types mentioned in captions

Respond with a JSON array where each element contains:
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
- "type": one of "table", "figure"
- "confidence": your confidence score (0-100)
- "caption": the exact caption text for this location
- "reason": brief explanation (including if this is for a specific lineage/campaign)
- "lineage_hint": any indication of which enzyme group this data is for (or null)
- "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)

Tables are preferred over figures when both contain the same data.

Respond ONLY with **minified JSON**. NO markdown fences.

Example:
[{"location": "Table S1", "type": "table", "confidence": 95, "caption": "Table S1. Detailed information...", "reason": "Complete performance metrics", "lineage_hint": "first enzyme family", "campaign_clues": "PYS lineage, pyrrolidine synthesis"}]
""")

PROMPT_EXTRACT_METRICS = dedent("""
You are given either (a) the PNG image of a figure panel, or (b) the caption /
text excerpt that contains numeric reaction performance data for an enzyme.

Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
Return a JSON object with the following keys (use **null** if not found):
* "yield" - yield as percentage with ONE decimal place precision
* "ttn" - turnover number (total turnovers)
* "ton" - turnover number if TTN not available
* "selectivity" - ee or er value with unit (e.g., "98% ee", ">99:1 er")
* "conversion" - conversion percentage if different from yield
* "tof" - turnover frequency (turnovers per time unit) if provided
* "activity" - specific activity if provided (with unit)
* "other_metrics" - dictionary of any other performance metrics with their units
* "notes" - any performance-related notes

IMPORTANT:
- Extract ALL performance metrics provided, even if they use different units.
- Do NOT extract substrate/product names - these will come from the SI
- Do NOT extract reaction conditions (temperature, pH, time, solvent)
- If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
You are analyzing a figure showing enzyme reaction performance data for multiple variants.

Extract performance metrics for ALL the following enzyme variants:
{enzyme_names}

Steps:
1. CHECK THE Y-AXIS SCALE: What is the maximum value? (e.g., 10%, 30%, 50%, 100%)
2. For each enzyme variant listed above:
   - Find its position on the X-axis
   - Read the bar height or data point value
   - Calculate the actual value based on the Y-axis scale
3. Compare all bars to understand relative performance

Return a JSON object with enzyme names as keys, each containing:
* "yield" - yield with ONE decimal place precision
* "ttn" - turnover number if shown
* "ton" - turnover number if TTN not available
* "selectivity" - ee or er value with unit
* "conversion" - conversion percentage if different from yield
* "tof" - turnover frequency if provided
* "activity" - specific activity if provided
* "other_metrics" - dictionary of any other metrics
* "notes" - any relevant notes (including reaction type if different reactions are shown)

CRITICAL:
- Read ALL pages provided in the image
- If different enzymes are tested for different reactions (e.g., pyrrolidine vs indoline synthesis), note this in "notes"
- For tables, check if data continues beyond what's shown
- Read the Y-axis scale carefully for figures

Example format:
{{"ApePgb LVQ": {{"yield": 0.0, "ttn": null, "notes": "pyrrolidine synthesis", ...}}, ...}}

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

# Removed substrate scope IUPAC extraction - now handled in model reaction only

PROMPT_FIND_MODEL_REACTION_LOCATION = dedent("""
You are an expert reader of chemistry manuscripts.
Given the following text sections, identify where the MODEL REACTION information is located.

The model reaction is the STANDARD reaction used to evaluate all enzyme variants
(not the substrate scope). Look for:

- Sections titled "Model Reaction", "Standard Reaction", "General Procedure"
- Text describing the reaction conditions used for enzyme evolution/screening
- Sections describing which substrates were used as the benchmark
- Compound numbers (e.g., "6a", "7a") used in the model reaction

Also identify where the IUPAC names for these specific compounds are listed.

Respond with a JSON object containing:
{
  "model_reaction_location": {
    "location": "section name or description",
    "confidence": 0-100,
    "reason": "why this contains the model reaction",
    "compound_ids": ["list", "of", "compound", "IDs", "if", "found"]
  },
  "conditions_location": {
    "location": "where reaction conditions are described",
    "confidence": 0-100
  },
  "iupac_location": {
    "location": "where IUPAC names are listed (usually SI compound characterization)",
    "confidence": 0-100,
    "compound_section_hint": "specific section to look for compound IDs"
  }
}

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

PROMPT_MODEL_REACTION = dedent("""
Extract the model/standard reaction used to evaluate enzyme variants in this paper.

This is the reaction used for directed evolution screening, NOT the substrate scope.
Look for terms like "model reaction", "standard substrate", "benchmark reaction",
or the specific reaction mentioned in enzyme screening/evolution sections.

CRITICAL STEPS FOR IUPAC NAMES:
1. First identify the compound IDs used in the model reaction (e.g., "6a", "7a")
2. Then search the provided context for these compound IDs to find their IUPAC names
3. Look for sections with "Compound 6a", "Product 7a", or similar patterns
4. The IUPAC names are usually given after the compound ID in parentheses or after a colon

CRITICAL FOR SUBSTRATE CONCENTRATION:
- Look carefully in FIGURES and figure captions for substrate concentration information
- Figures often show detailed reaction conditions that may not be in the main text
- Identify the ACTUAL SUBSTRATES being transformed (not reducing agents or cofactors)
- Common pattern: "[X] mM [substrate name]" or "[substrate]: [X] mM"
- DO NOT confuse reducing agents (dithionite, NADH, etc.) with actual substrates
- The substrate is the molecule being chemically transformed by the enzyme

Return a JSON object with:
* "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["5", "6a"])
* "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents
* "product_list" - Array of product identifiers as used in the paper (e.g., ["7a"])
* "product_iupac_list" - Array of IUPAC names for ALL products formed
* "reaction_substrate_concentration" - Concentration of actual substrate(s) being transformed, NOT reducing agents like dithionite
* "cofactor" - Any cofactors used (e.g., "NADH", "NADPH", "FAD", "heme", etc.) or null if none
* "reaction_temperature" - reaction temperature (e.g., "25°C", "room temperature")
* "reaction_ph" - reaction pH
* "reaction_buffer" - buffer system (e.g., "50 mM potassium phosphate")
* "reaction_other_conditions" - other important conditions (enzyme loading, reducing agents like dithionite, time, anaerobic, etc.)

IMPORTANT:
- Extract the reaction used for ENZYME EVOLUTION/SCREENING (not substrate scope)
- Substrate concentration = concentration of chemicals being transformed, NOT reducing agents (dithionite, NADH, etc.)
- Maintain correspondence: substrate_list[i] should map to substrate_iupac_list[i], same for products
- If a compound ID has no IUPAC name found, still include it in the list with null in the IUPAC list
- For IUPAC names, look for the SYSTEMATIC chemical names, NOT common/trivial names
- Search the provided context for systematic names - they typically:
  * Use numerical locants (e.g., "prop-2-enoate" not "acrylate")
  * Follow IUPAC nomenclature rules
  * May be found in compound characterization sections
- If you find a common name in the reaction description, search the context for its systematic equivalent
- Look for the exact systematic names as written in the compound characterization
- Do NOT include stereochemistry prefixes like (1R,2S) unless they are part of the compound name in the SI

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

PROMPT_ANALYZE_LINEAGE_GROUPS = dedent("""
You are analyzing enzyme performance data from a protein engineering manuscript.
Based on the performance data locations and enzyme names, determine if there are
distinct enzyme lineage groups that were evolved for different purposes.

Look for patterns such as:
- Different tables/figures for different enzyme groups
- Enzyme naming patterns that suggest different lineages
- Different reaction types mentioned in notes or captions
- Clear separations in how variants are organized

Return a JSON object with:
{
  "has_multiple_lineages": true/false,
  "lineage_groups": [
    {
      "group_id": "unique identifier you assign",
      "data_location": "where this group's data is found",
      "enzyme_pattern": "naming pattern or list of enzymes",
      "reaction_type": "what reaction this group catalyzes",
      "evidence": "why you grouped these together"
    }
  ],
  "confidence": 0-100
}

If only one lineage exists, return has_multiple_lineages: false with a single group.

Respond ONLY with **minified JSON**.
""")

PROMPT_FIND_LINEAGE_MODEL_REACTION = dedent("""
For the enzyme group with performance data in {location}, identify the specific
model reaction used to screen/evaluate these variants.

Context about this group:
{group_context}

Look for:
- References to the specific substrate/product used for this enzyme group
- Text near the performance data location describing the reaction
- Connections between the enzyme names and specific substrates
- Any mention of "screened with", "tested against", "substrate X was used"

Return:
{{
  "substrate_ids": ["list of substrate IDs for this group"],
  "product_ids": ["list of product IDs for this group"],
  "confidence": 0-100,
  "evidence": "text supporting this substrate/product assignment"
}}

Respond ONLY with **minified JSON**.
""")

PROMPT_COMPOUND_MAPPING = dedent("""
Extract compound identifiers and their IUPAC names from the provided sections.

Look for ALL compounds mentioned, including:
1. Compounds with explicit IUPAC names in the text
2. Common reagents where you can provide standard IUPAC names
3. Products that may not be explicitly characterized

CRITICAL - NO HALLUCINATION:
- Extract IUPAC names EXACTLY as written in the source
- DO NOT modify, correct, or "improve" any chemical names
- If a name is written as "benzyl-2-phenylcyclopropane-1-carboxylate", keep it exactly
- Only provide standard IUPAC names for common reagents if not found in text
- If no IUPAC name is found for a compound, return null for iupac_name
- Include ALL compounds found or referenced

Return as JSON:
{
  "compound_mappings": [
    {
      "identifier": "compound identifier",
      "iupac_name": "complete IUPAC name",
      "common_names": ["any alternative names"],
      "compound_type": "substrate/product/reagent/other",
      "source_location": "where found or inferred"
    }
  ]
}
""")

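# Note: PROMPT_EXTRACT_FIGURE_METRICS_BATCH and PROMPT_FIND_LINEAGE_MODEL_REACTION
# are filled in via str.format(), which is why the literal braces in their JSON
# examples are doubled ({{ }}); the remaining prompts are assembled by plain
# string concatenation, so their JSON templates keep single braces.
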
###############################################################################
# 6 - EXTRACTION ENGINE
###############################################################################

class ReactionExtractor:
    _FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
    _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)

    def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
                 campaign_filter: Optional[str] = None):
        self.manuscript = manuscript
        self.si = si
        self.cfg = cfg
        self.model = get_model(cfg)
        self.debug_dir = debug_dir
        self.campaign_filter = campaign_filter  # Filter for specific campaign

        # Create debug directory if specified
        if self.debug_dir:
            self.debug_dir = Path(self.debug_dir)
            self.debug_dir.mkdir(parents=True, exist_ok=True)
            LOGGER.info("Debug output will be saved to: %s", self.debug_dir)

        if self.campaign_filter:
            LOGGER.info("Filtering extraction for campaign: %s", self.campaign_filter)

        # Preload text pages
        LOGGER.info("Reading PDFs…")
        self.ms_pages = extract_text_by_page(manuscript)
        self.si_pages = extract_text_by_page(si)
        self.all_pages = self.ms_pages + self.si_pages

        # Keep open fitz Docs for image extraction
        self.ms_doc = fitz.open(str(manuscript))
        self.si_doc = fitz.open(str(si)) if si else None

    # ------------------------------------------------------------------
    # 6.1 Find locations (unchanged)
    # ------------------------------------------------------------------

    def _collect_captions_and_titles(self) -> str:
        # Simple pattern: match any line starting with Table or Figure.
        # This catches all variations like "Table S 2", "Table.", "Figure S1", etc.
        cap_pattern = re.compile(r"^(Table|Figure).*", re.I | re.M)
        captions: List[str] = []

        # Collect from all pages
        all_text = "\n".join(self.all_pages)

        # Find all figure/table captions
        for match in cap_pattern.finditer(all_text):
            caption_start = match.start()
            # Get up to 1200 chars or until the next double newline
            caption_end = all_text.find("\n\n", caption_start)
            if caption_end == -1 or caption_end - caption_start > 1200:
                caption_end = caption_start + 1200
            caption = all_text[caption_start:caption_end].strip()
            captions.append(caption)

        # Also look for SI section titles
        si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)

        result = "\n".join(captions + si_titles)
        LOGGER.debug("Collected %d captions/titles, total length: %d chars",
                     len(captions) + len(si_titles), len(result))

        # Log first few captions for debugging
        if captions:
            LOGGER.debug("First few captions: %s", captions[:3])

        return result

    def find_reaction_locations(self) -> List[Dict[str, Any]]:
        """Find all locations containing reaction performance data."""
        # Add campaign context if available
        campaign_context = ""
        if self.campaign_filter:
            campaign_context = f"""
IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
Only return locations that contain data for this specific campaign.
Ignore locations that contain data for other campaigns.

"""

        prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.location_temperature,
                debug_dir=self.debug_dir,
                tag="find_locations",
            )
            # Handle both single dict (backwards compatibility) and list
            if isinstance(data, dict):
                return [data]
            elif isinstance(data, list):
                return data
            else:
                LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
                return []
        except Exception as e:
            LOGGER.error("Failed to find reaction locations: %s", e)
            return []

    def _get_base_location(self, location: str) -> str:
        """Extract the base location identifier (e.g. 'Table S1', 'Figure 3',
        or a supplementary page range like 'S41-S47').

        This helps group related locations that likely share the same model reaction.
        """
        # Common patterns for locations
        patterns = [
            (r'Table\s+S\d+', 'table'),
            (r'Figure\s+S\d+', 'figure'),
            (r'Table\s+\d+', 'table'),
            (r'Figure\s+\d+', 'figure'),
            (r'S\d+(?:-S\d+)?', 'supp'),  # Supplementary pages like S41-S47
        ]

        for pattern, loc_type in patterns:
            match = re.search(pattern, location, re.I)
            if match:
                return match.group(0)

        # Default: use the location as-is
        return location

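    # For instance (following the patterns above):
    #     _get_base_location("Table S1. Detailed information")  -> "Table S1"
    #     _get_base_location("pages S41-S47")                   -> "S41-S47"
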
    def analyze_lineage_groups(self, locations: List[Dict[str, Any]], enzyme_df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze if there are distinct lineage groups based on different locations.

        Key principle: different locations (tables/figures) indicate different model reactions.
        """
        # Group locations by their base identifier
        location_groups = {}

        for loc in locations:
            location_id = loc['location']
            base_location = self._get_base_location(location_id)

            if base_location not in location_groups:
                location_groups[base_location] = []
            location_groups[base_location].append(loc)

        # Each unique base location represents a potential lineage group
        lineage_groups = []

        for base_loc, locs in location_groups.items():
            # Use the location with highest confidence as primary
            primary_loc = max(locs, key=lambda x: x.get('confidence', 0))

            # Create a group for this location
            group = {
                'group_id': base_loc,
                'data_location': primary_loc['location'],
                'all_locations': [l['location'] for l in locs],
                'lineage_hint': primary_loc.get('lineage_hint', ''),
                'caption': primary_loc.get('caption', ''),
                'confidence': primary_loc.get('confidence', 0),
            }
            lineage_groups.append(group)

        # Multiple distinct base locations = multiple model reactions
        has_multiple = len(location_groups) > 1

        LOGGER.info("Location-based lineage analysis: %d distinct base locations found",
                    len(location_groups))
        for group in lineage_groups:
            LOGGER.info("  - %s: %s", group['group_id'], group['data_location'])

        return {
            'has_multiple_lineages': has_multiple,
            'lineage_groups': lineage_groups,
            'confidence': 95,
        }

    def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
        """Find the model reaction for a specific lineage group."""
        # Gather relevant text near this location
        page_text = self._page_with_reference(location) or ""

        # Also check manuscript introduction for model reaction info
        intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""

        prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
            location=location,
            group_context=group_context,
        )
        prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
        prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.model_reaction_temperature,
                debug_dir=self.debug_dir,
                tag=f"lineage_model_reaction_{location.replace(' ', '_')}",
            )
            return data if isinstance(data, dict) else {}
        except Exception as e:
            LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
            return {}

    # ------------------------------------------------------------------
    # 6.2 Figure / Table context helpers
    # ------------------------------------------------------------------

    def _page_with_reference(self, ref_id: str) -> Optional[str]:
        for page in self.all_pages:
            if ref_id.lower() in page.lower():
                return page
        return None

    # ---- Table text helper - now returns full page ----
    def _extract_table_context(self, ref: str) -> str:
        page = self._page_with_reference(ref)
        if not page:
            return ""
        # Return the entire page content for better table extraction
        return page

    # ---- Figure caption helper (text fallback) ----
    def _extract_figure_caption(self, ref: str) -> str:
        page = self._page_with_reference(ref)
        if not page:
            return ""
        m = re.search(rf"({re.escape(ref)}[\s\S]{{0,800}}?\.)", page, re.I)
        if m:
            return m.group(1)
        for line in page.split("\n"):
            if ref.lower() in line.lower():
                return line
        return page[:800]

    # ---- NEW: Page image helper for both figures and tables ----
    def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
        """Export the page containing the reference as PNG.

        If extract_figure_only=True, extracts just the figure above the caption.
        If False, extracts the entire page(s) (useful for tables).
        Returns a base64-encoded PNG or None.
        """
        # For table extraction, use the multi-page approach (whole-page export
        # is handled entirely by this early return)
        if not extract_figure_only:
            pages_with_ref = self._find_pages_with_reference(ref)
            if pages_with_ref:
                LOGGER.debug("Found %d pages containing %s", len(pages_with_ref), ref)
                return self._extract_multiple_pages_png(pages_with_ref)
            return None

        # For figure extraction, search both documents
        for doc in filter(None, [self.ms_doc, self.si_doc]):
            for page_number in range(doc.page_count):
                page = doc.load_page(page_number)
                if ref.lower() not in page.get_text().lower():
                    continue
                # Get caption bbox
                text_instances = page.search_for(ref, quads=False)
                if not text_instances:
                    continue
                cap_rect = text_instances[0]  # first match

                # Sort images by the top (y0) of their placement rectangle.
                # Note: get_images() itself carries no coordinates, so the
                # rectangles are looked up per xref.
                def _img_top(img):
                    rects = page.get_image_rects(img[0])
                    return rects[0].y0 if rects else float("inf")

                images = sorted(page.get_images(full=True), key=_img_top)
                # Find the first image whose bottom y is **above** the caption top y
                for img in images:
                    xref = img[0]
                    # Get image rectangles to find position
                    img_rects = page.get_image_rects(xref)
                    if img_rects and img_rects[0].y1 < cap_rect.y0:  # fully above caption
                        # Extract image bytes
                        pix = fitz.Pixmap(doc, xref)
                        if pix.alpha:  # RGBA -> RGB
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        return b64encode(pix.tobytes("png")).decode()
        return None

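    # Usage sketch: returns a base64-encoded PNG ready for Gemini Vision, e.g.
    #     png_b64 = self._extract_page_png("Figure 3", extract_figure_only=True)
    # and None when the caption or a suitable image cannot be located.
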
    def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
        """Find all pages containing the reference across documents.

        Returns a list of (document, page_number) tuples.
        """
        pages_found = []

        for doc in filter(None, [self.ms_doc, self.si_doc]):
            for page_number in range(doc.page_count):
                page = doc.load_page(page_number)
                if ref.lower() in page.get_text().lower():
                    pages_found.append((doc, page_number))

        return pages_found

    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
        """Extract multiple pages as a combined PNG image."""
        if not pages:
            return None

        # Sort pages by document and page number
        pages.sort(key=lambda x: (id(x[0]), x[1]))

        # Render the pages, including one page after the last reference hit
        all_images = []
        mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
        for i, (doc, page_num) in enumerate(pages):
            # Add the current page
            all_images.append(doc.load_page(page_num).get_pixmap(matrix=mat))

            # If this is the last page with the reference, also add the next page
            if i == len(pages) - 1 and page_num + 1 < doc.page_count:
                all_images.append(doc.load_page(page_num + 1).get_pixmap(matrix=mat))
                LOGGER.info("Added next page: page %d", page_num + 2)  # +2: user-facing pages are 1-based

        if not all_images:
            return None

        # If only one page, return it directly
        if len(all_images) == 1:
            return b64encode(all_images[0].tobytes("png")).decode()

        # Combine multiple pages vertically: compute the combined dimensions
        total_height = sum(pix.height for pix in all_images)
        max_width = max(pix.width for pix in all_images)

        LOGGER.info("Combining %d pages into single image (%dx%d)",
                    len(all_images), max_width, total_height)

        # Create a new document with a single page that can hold all images
        output_doc = fitz.open()

        # PDF pages have a maximum size, so scale down if needed
        max_pdf_dimension = 14400  # PDF max is ~200 inches at 72 DPI
        scale = 1.0
        if total_height > max_pdf_dimension or max_width > max_pdf_dimension:
            scale = min(max_pdf_dimension / total_height, max_pdf_dimension / max_width)
            total_height = int(total_height * scale)
            max_width = int(max_width * scale)
            LOGGER.warning("Scaling down by %.2f to fit PDF limits", scale)

        page = output_doc.new_page(width=max_width, height=total_height)

        # Insert each image into the page
        y_offset = 0
        for pix in all_images:
            # Center each image horizontally
            x_offset = (max_width - pix.width * scale) / 2

            # Create rect for image placement
            rect = fitz.Rect(x_offset, y_offset,
                             x_offset + pix.width * scale,
                             y_offset + pix.height * scale)

            # Insert the image
            page.insert_image(rect, pixmap=pix)
            y_offset += pix.height * scale

        # Render the combined page and return it as base64 PNG
        combined_pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))  # 2x zoom for quality
        img_bytes = combined_pix.tobytes("png")
        output_doc.close()

        return b64encode(img_bytes).decode()

    # ------------------------------------------------------------------
    # 6.3 Extract metrics in batch
    # ------------------------------------------------------------------

    def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
        """Extract performance metrics for multiple enzymes from the identified location in batch."""
        ref_lc = ref.lower()
        image_b64: Optional[str] = None
        snippet = ""

        # Add campaign context if available
        campaign_context = ""
        if self.campaign_filter:
            campaign_context = (
                f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\n"
                "Only extract data that is relevant to this specific campaign.\n"
            )

        if self._TAB_RE.search(ref_lc):
            # For tables, try to extract the page as an image first
            image_b64 = self._extract_page_png(ref, extract_figure_only=False)
            if not image_b64:
                LOGGER.debug("No page image found for %s - using full page text", ref)
                snippet = self._extract_table_context(ref)
        elif self._FIG_RE.search(ref_lc):
            # For figures, extract just the figure image
            image_b64 = self._extract_page_png(ref, extract_figure_only=True)
            if not image_b64:
                LOGGER.debug("No figure image found for %s - using caption text", ref)
                snippet = self._extract_figure_caption(ref)
        else:
            snippet = self._page_with_reference(ref) or ""

        enzyme_names = "\n".join(f"- {enzyme}" for enzyme in enzyme_list)

        if image_b64:
            # Use the batch extraction prompt for image analysis
            prompt = campaign_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
            LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
            tag = "extract_metrics_batch_vision"
        else:
            # Add enzyme names to the prompt for batch extraction
            prompt = (campaign_context + PROMPT_EXTRACT_METRICS
                      + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}"
                      + "\n\n=== CONTEXT ===\n" + snippet[:4000])
            LOGGER.info("Gemini: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
            tag = "extract_metrics_batch"

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.extract_temperature,
                debug_dir=self.debug_dir,
                tag=tag,
                image_b64=image_b64,
            )

            # Handle the response format - expecting a dict with enzyme names as keys
            results = []
            if isinstance(data, dict):
                for enzyme in enzyme_list:
                    enzyme_data = data.get(enzyme, {})
                    if not isinstance(enzyme_data, dict):
                        enzyme_data = {"error": "No data found"}

                    # Normalize keys: "yield" is used directly; only TTN needs renaming
                    if "TTN" in enzyme_data and "ttn" not in enzyme_data:
                        enzyme_data["ttn"] = enzyme_data.pop("TTN")

                    # Add metadata
                    enzyme_data["enzyme"] = enzyme
                    enzyme_data["location_ref"] = ref
                    enzyme_data["used_image"] = bool(image_b64)
                    results.append(enzyme_data)
            else:
                # Fallback if response format is unexpected
                LOGGER.warning("Unexpected response format from batch extraction")
                for enzyme in enzyme_list:
                    results.append({
                        "enzyme": enzyme,
                        "location_ref": ref,
                        "used_image": bool(image_b64),
                        "error": "Invalid response format",
                    })

        except Exception as e:
            LOGGER.warning("Failed to extract metrics batch: %s", e)
            results = [
                {
                    "enzyme": enzyme,
                    "location_ref": ref,
                    "used_image": bool(image_b64),
                    "error": str(e),
                }
                for enzyme in enzyme_list
            ]

        return results

    # Removed extract_iupac_names - substrate scope IUPAC extraction no longer needed

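    # Each record returned by extract_metrics_batch() looks roughly like this
    # (values illustrative; the metric keys follow the extraction prompts above):
    #     {"enzyme": "ApePgb LVQ", "yield": 12.5, "ttn": None,
    #      "location_ref": "Table S1", "used_image": True}
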
    # ------------------------------------------------------------------
    # 6.4 Model reaction with location finding
    # ------------------------------------------------------------------

    def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
        """Find locations for model reaction scheme, conditions, and IUPAC names."""
        # Collect all text including section titles, captions, and schemes
        all_text = self._collect_captions_and_titles()

        # Also add first few pages of main text and SI
        ms_preview = "\n".join(self.ms_pages[:5])[:5000]
        si_preview = "\n".join(self.si_pages[:10])[:5000] if self.si_pages else ""

        # Add enzyme context if provided
        enzyme_context = ""
        if enzyme_variants and self.campaign_filter:
            enzyme_context = f"""
IMPORTANT CONTEXT:
You are looking for the model reaction used specifically for these enzyme variants:
{', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}

These variants belong to campaign: {self.campaign_filter}

Focus on finding the model reaction that was used to evaluate THESE specific variants.
Different campaigns may use different model reactions.
"""

        prompt = (enzyme_context + PROMPT_FIND_MODEL_REACTION_LOCATION
                  + "\n\n=== CAPTIONS AND SECTIONS ===\n" + all_text
                  + "\n\n=== MANUSCRIPT TEXT PREVIEW ===\n" + ms_preview
                  + "\n\n=== SI TEXT PREVIEW ===\n" + si_preview)

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.location_temperature,
                debug_dir=self.debug_dir,
                tag="find_model_reaction_locations",
            )
            if not isinstance(data, dict):
                LOGGER.error("Expected dict from Gemini, got: %s", type(data))
                return None
            return data
        except Exception as e:
            LOGGER.error("Failed to find model reaction locations: %s", e)
            return None

    def _get_text_around_location(self, location: str) -> Optional[str]:
        """Extract text around a given location identifier."""
        location_lower = location.lower()

        # Search in all pages
        for page_text in self.all_pages:
            if location_lower in page_text.lower():
                # Find the location and extract context around it
                idx = page_text.lower().index(location_lower)
                start = max(0, idx - 500)
                end = min(len(page_text), idx + 3000)
                return page_text[start:end]

        # If not found in exact form, try pattern matching
        # for scheme/figure/table references
        if re.search(r"(scheme|figure|table)\s*\d+", location_lower):
            pattern = re.compile(location.replace(" ", r"\s*"), re.I)
            for page_text in self.all_pages:
                match = pattern.search(page_text)
                if match:
                    start = max(0, match.start() - 500)
                    end = min(len(page_text), match.end() + 3000)
                    return page_text[start:end]

        return None

    def _get_extended_text_around_location(self, location: str, before: int = 2000, after: int = 10000) -> Optional[str]:
        """Extract extended text around a given location identifier."""
        location_lower = location.lower()

        # Search in all pages
        for i, page_text in enumerate(self.all_pages):
            if location_lower in page_text.lower():
                # Find the location
                idx = page_text.lower().index(location_lower)

                # Collect text from multiple pages if needed
                result = []

                # Start from the current page
                start = max(0, idx - before)
                result.append(page_text[start:])

                # Add subsequent pages up to 'after' characters
                chars_collected = len(page_text) - start
                page_idx = i + 1

                while chars_collected < after + before and page_idx < len(self.all_pages):
                    next_page = self.all_pages[page_idx]
                    chars_to_take = min(len(next_page), after + before - chars_collected)
                    result.append(next_page[:chars_to_take])
                    chars_collected += chars_to_take
                    page_idx += 1

                return "\n".join(result)

        return None

    def _extract_sections_by_title(self, sections: List[str], max_chars_per_section: int = 5000) -> str:
        """Extract text from sections with specific titles."""
        extracted_text = []

        for section_title in sections:
            pattern = re.compile(rf"{re.escape(section_title)}.*?(?=\n\n[A-Z]|\Z)", re.I | re.S)

            # Search in all pages; keep the first hit per section title
            for page in self.all_pages:
                match = pattern.search(page)
                if match:
                    section_text = match.group(0)[:max_chars_per_section]
                    extracted_text.append(f"=== {section_title} ===\n{section_text}")
                    break

        return "\n\n".join(extracted_text)

    def _extract_compound_mappings_from_text(
        self,
        extraction_text: str,
        compound_ids: Optional[List[str]] = None,
        tag_suffix: str = "",
    ) -> Dict[str, CompoundMapping]:
        """Helper function to extract compound mappings from provided text."""
        prompt = PROMPT_COMPOUND_MAPPING
        if compound_ids:
            prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
        prompt += "\n\nTEXT:\n" + extraction_text

        tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.model_reaction_temperature,
                debug_dir=self.debug_dir,
                tag=tag,
            )

            mappings = {}
            for item in data.get("compound_mappings", []):
                # Handle both the old format (identifiers list) and the new format (identifier string)
                identifiers = item.get("identifiers", [])
                if not identifiers and item.get("identifier"):
                    identifiers = [item.get("identifier")]

                mapping = CompoundMapping(
                    identifiers=identifiers,
                    iupac_name=item.get("iupac_name", ""),
                    common_names=item.get("common_names", []),
                    compound_type=item.get("compound_type", "unknown"),
                    source_location=item.get("source_location"),
                )

                # Create lookup entries for all identifiers and common names
                for identifier in mapping.identifiers + mapping.common_names:
                    if identifier:
                        mappings[identifier.lower().strip()] = mapping

            return mappings

        except Exception as exc:
            LOGGER.error("Failed to extract compound mappings: %s", exc)
            return {}

    def _extract_compound_mappings_with_figures(
        self,
        text: str,
        compound_ids: List[str],
        figure_images: Dict[str, str],
        tag_suffix: str = "",
    ) -> Dict[str, CompoundMapping]:
        """Extract compound mappings using a multimodal approach with figures."""
        # Enhanced prompt for figure-based extraction
        prompt = """You are analyzing chemical figures and manuscript text to identify compound IUPAC names.

TASK: Find the IUPAC names for these specific compound identifiers: """ + ", ".join(sorted(compound_ids)) + """

Use your best knowledge. Look carefully at:
1. The chemical structures shown in figures - infer IUPAC names from drawn structures
2. Figure captions that may define compounds
3. Text that refers to these compound numbers
4. Reaction schemes showing transformations

IMPORTANT:
- Only provide IUPAC names you can determine from the figures or text
- If a structure is clearly shown in a figure, derive the IUPAC name from it

Return as JSON:
{
  "compound_mappings": [
    {
      "identifier": "compound identifier",
      "iupac_name": "IUPAC name",
      "common_names": ["common names if any"],
      "compound_type": "substrate/product/reagent",
      "source_location": "where found (e.g., Figure 3, manuscript text)"
    }
  ]
}

TEXT FROM MANUSCRIPT:
""" + text

        # Prepare multimodal content
        content_parts = [prompt]

        # Add figure images
        if figure_images:
            for fig_ref, fig_base64 in figure_images.items():
                try:
                    img_bytes = b64decode(fig_base64)
                    image = PIL.Image.open(io.BytesIO(img_bytes))
                    content_parts.append(f"\n[Figure: {fig_ref}]")
                    content_parts.append(image)
                    LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
                except Exception as e:
                    LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)

        tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"

        try:
            # Log multimodal call
            LOGGER.info("=== GEMINI MULTIMODAL API CALL: COMPOUND_MAPPING_WITH_FIGURES ===")
            LOGGER.info("Text prompt length: %d characters", len(prompt))
            LOGGER.info("Number of images: %d", len(content_parts) - 1)
            LOGGER.info("Compounds to find: %s", ", ".join(sorted(compound_ids)))

            # Save debug info
            if self.debug_dir:
                prompt_file = self.debug_dir / f"{tag}_prompt_{int(time.time())}.txt"
                with open(prompt_file, 'w') as f:
                    f.write(f"=== PROMPT FOR {tag.upper()} ===\n")
                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write(f"Text length: {len(prompt)} characters\n")
                    f.write(f"Images included: {len(content_parts) - 1}\n")
                    for fig_ref in figure_images.keys():
                        f.write(f"  - {fig_ref}\n")
                    f.write("=" * 80 + "\n\n")
                    f.write(prompt)
                LOGGER.info("Full prompt saved to: %s", prompt_file)

            # Make multimodal API call
            response = self.model.generate_content(content_parts)
            raw_text = response.text.strip()

            # Log response
            LOGGER.info("Gemini multimodal response length: %d characters", len(raw_text))

            if self.debug_dir:
                response_file = self.debug_dir / f"{tag}_response_{int(time.time())}.txt"
                with open(response_file, 'w') as f:
                    f.write(f"=== RESPONSE FOR {tag.upper()} ===\n")
                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                    f.write(f"Length: {len(raw_text)} characters\n")
                    f.write("=" * 80 + "\n\n")
                    f.write(raw_text)
                LOGGER.info("Full response saved to: %s", response_file)

            # Parse JSON. Remove Markdown fences with a regex rather than
            # str.strip('```json'), which strips a *character set* (not a
            # prefix) and can silently eat legitimate characters.
            raw_text = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw_text.strip())
            data = json.loads(raw_text)

            mappings = {}
            for item in data.get("compound_mappings", []):
                identifiers = item.get("identifiers", [])
                if not identifiers and item.get("identifier"):
                    identifiers = [item.get("identifier")]

                mapping = CompoundMapping(
                    identifiers=identifiers,
                    iupac_name=item.get("iupac_name", ""),
                    common_names=item.get("common_names", []),
                    compound_type=item.get("compound_type", "unknown"),
                    source_location=item.get("source_location"),
                )

                for identifier in mapping.identifiers + mapping.common_names:
                    if identifier:
                        mappings[identifier.lower().strip()] = mapping

            return mappings

        except Exception as exc:
            LOGGER.error("Failed to extract compound mappings with figures: %s", exc)
            return {}

1283
+ def _extract_compound_mappings_adaptive(
1284
+ self,
1285
+ compound_ids: List[str],
1286
+ initial_sections: List[str] = None,
1287
+ ) -> Dict[str, CompoundMapping]:
1288
+ """Extract compound ID to IUPAC name mappings using adaptive 3-tier strategy.
1289
+
1290
+ 1. First attempts extraction from standard sections
1291
+ 2. Expands search to additional sections if compounds are missing
1292
+ 3. Uses multimodal figure analysis as final fallback
1293
+ """
1294
+ if not compound_ids:
1295
+ return {}
1296
+
1297
+ LOGGER.info("Starting adaptive compound mapping for %d compounds: %s",
1298
+ len(compound_ids), sorted(compound_ids))
1299
+
1300
+ # Tier 1: Standard sections (manuscript + initial SI sections)
1301
+ initial_sections = initial_sections or [
1302
+ "General procedure", "Compound characterization",
1303
+ "Synthesis", "Experimental", "Materials and methods"
1304
+ ]
1305
+
1306
+ # Include manuscript pages (first 10) for model reaction context
1307
+ manuscript_text = "\n\n".join(self.ms_pages[:10])
1308
+
1309
+ # Extract from initial sections
1310
+ extraction_text = self._extract_sections_by_title(initial_sections)
1311
+ if extraction_text:
1312
+ extraction_text = manuscript_text + "\n\n" + extraction_text
1313
+ else:
1314
+ extraction_text = manuscript_text
1315
+
1316
+ # First extraction attempt
1317
+ mappings = self._extract_compound_mappings_from_text(
1318
+ extraction_text[:50000], compound_ids, tag_suffix="initial"
1319
+ )
1320
+ LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))
1321
+
1322
+ # Check for missing compounds
1323
+ missing_compounds = []
1324
+ for cid in compound_ids:
1325
+ mapping = mappings.get(cid.lower().strip())
1326
+ if not mapping or not mapping.iupac_name:
1327
+ missing_compounds.append(cid)
1328
+
1329
+ # Tier 2: Expanded search + multimodal with figures
1330
+ if missing_compounds:
1331
+ LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
1332
+ len(missing_compounds), sorted(missing_compounds))
1333
+
1334
+ # Additional sections to search
1335
+ additional_sections = [
1336
+ "Engineering strategy", "Evolution campaign",
1337
+ "Screening", "Optimization", "Substrate synthesis",
1338
+ "Supporting Information", "Supplementary Methods"
1339
+ ]
1340
+
1341
+ # Extract from additional sections
1342
+ additional_text = self._extract_sections_by_title(additional_sections)
1343
+
1344
+ # Also extract any figures that might contain compound structures
1345
+ figure_images = {}
1346
+ figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Scheme 1", "Scheme 2"]
1347
+ for ref in figure_refs:
1348
+ img_b64 = self._extract_page_png(ref, extract_figure_only=True)
1349
+ if img_b64:
1350
+ figure_images[ref] = img_b64
1351
+ LOGGER.info("Extracted %s for compound mapping", ref)
1352
+
1353
+ # Try multimodal approach with figures and expanded text
1354
+ if figure_images or additional_text:
1355
+ combined_text = additional_text[:30000] if additional_text else ""
1356
+ expanded_mappings = self._extract_compound_mappings_with_figures(
1357
+ combined_text, missing_compounds, figure_images, tag_suffix="tier2"
1358
+ )
1359
+
1360
+ # Merge new mappings
1361
+ new_found = 0
1362
+ for key, mapping in expanded_mappings.items():
1363
+ if key not in mappings or not mappings[key].iupac_name:
1364
+ if mapping.iupac_name:
1365
+ mappings[key] = mapping
1366
+ new_found += 1
1367
+ LOGGER.info("Found IUPAC name for '%s': %s",
1368
+ key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
1369
+
1370
+ LOGGER.info("Tier 2: Found %d additional compound mappings", new_found)
1371
+
1372
+ # Check again for still missing compounds
1373
+ still_missing = []
1374
+ for cid in missing_compounds:
1375
+ mapping = mappings.get(cid.lower().strip())
1376
+ if not mapping or not mapping.iupac_name:
1377
+ still_missing.append(cid)
1378
+
1379
+ # Tier 3: Full manuscript search with all available figures
1380
+ if still_missing:
1381
+ LOGGER.info("Tier 3: %d compounds still missing, trying full manuscript search",
1382
+ len(still_missing))
1383
+
1384
+ # Get all SI figures
1385
+ si_figure_refs = []
1386
+ for page in self.si_pages[:5]: # Check first 5 SI pages
1387
+ matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
1388
+ si_figure_refs.extend(matches[:5]) # At most 5 figure refs per page
1389
+
1390
+ # Extract SI figures
1391
+ for ref in set(si_figure_refs):
1392
+ if ref not in figure_images:
1393
+ img_b64 = self._extract_page_png(ref, extract_figure_only=True)
1394
+ if img_b64:
1395
+ figure_images[ref] = img_b64
1396
+ LOGGER.info("Extracted %s for final compound mapping", ref)
1397
+
1398
+ # Full text search including all pages
1399
+ full_text = "\n\n".join(self.all_pages[:30]) # First 30 pages
1400
+
1401
+ final_mappings = self._extract_compound_mappings_with_figures(
1402
+ full_text[:50000], still_missing, figure_images, tag_suffix="tier3"
1403
+ )
1404
+
1405
+ # Merge final mappings
1406
+ final_found = 0
1407
+ for key, mapping in final_mappings.items():
1408
+ if key not in mappings or not mappings[key].iupac_name:
1409
+ if mapping.iupac_name:
1410
+ mappings[key] = mapping
1411
+ final_found += 1
1412
+ LOGGER.info("Found IUPAC name for '%s' in final search: %s",
1413
+ key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
1414
+
1415
+ LOGGER.info("Tier 3: Found %d additional compound mappings", final_found)
1416
+
1417
+ LOGGER.info("Adaptive compound mapping complete: %d total mappings", len(mappings))
1418
+ return mappings
1419
+
1420
+ def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
1421
+ """Extract model reaction information using identified locations and 3-tier compound mapping."""
1422
+ # First find the best locations
1423
+ locations = self.find_model_reaction_locations(enzyme_variants)
1424
+ if not locations:
1425
+ LOGGER.warning("Could not find model reaction locations, using fallback approach")
1426
+ # Fallback to old approach but include more manuscript text
1427
+ pattern = re.compile(r"(model reaction|general procedure|typical .*run|standard conditions|scheme 1|figure 1)", re.I)
1428
+ snippets: List[str] = []
1429
+ # Search both manuscript and SI
1430
+ for page in self.all_pages:
1431
+ match = pattern.search(page)
1432
+ if match:
1433
+ # Take ~3000 chars starting just before the matched phrase (not the page head)
1434
+ snippets.append(page[max(0, match.start() - 200):match.start() + 3000])
1435
+ if len(snippets) >= 5:
1436
+ break
1437
+ text_context = "\n---\n".join(snippets)[:10000]
1438
+ else:
1439
+ # Gather text from identified locations
1440
+ text_snippets = []
1441
+
1442
+ # Always include manuscript abstract and introduction for context
1443
+ if self.ms_pages:
1444
+ # First 3 pages typically contain abstract, introduction, and model reaction info
1445
+ manuscript_intro = "\n\n".join(self.ms_pages[:3])
1446
+ text_snippets.append(f"=== MANUSCRIPT INTRODUCTION ===\n{manuscript_intro}")
1447
+
1448
+ # Get model reaction context
1449
+ if locations.get("model_reaction_location", {}).get("location"):
1450
+ model_loc = locations["model_reaction_location"]["location"]
1451
+ LOGGER.info("Looking for model reaction at: %s", model_loc)
1452
+ model_text = self._get_text_around_location(model_loc)
1453
+ if model_text:
1454
+ text_snippets.append(f"=== {model_loc} ===\n{model_text}")
1455
+
1456
+ # Get conditions context
1457
+ if locations.get("conditions_location", {}).get("location"):
1458
+ cond_loc = locations["conditions_location"]["location"]
1459
+ LOGGER.info("Looking for reaction conditions at: %s", cond_loc)
1460
+ cond_text = self._get_text_around_location(cond_loc)
1461
+ if cond_text:
1462
+ text_snippets.append(f"=== {cond_loc} ===\n{cond_text}")
1463
+
1464
+ # Get IUPAC names context from the specific location identified
1465
+ if locations.get("iupac_location", {}).get("location"):
1466
+ iupac_loc = locations["iupac_location"]["location"]
1467
+ LOGGER.info("Looking for IUPAC names at: %s", iupac_loc)
1468
+
1469
+ # If we have compound IDs from the model reaction location, search for them specifically
1470
+ compound_ids = locations.get("model_reaction_location", {}).get("compound_ids", [])
1471
+ if compound_ids:
1472
+ LOGGER.info("Looking for specific compound IDs: %s", compound_ids)
1473
+ # Search for each compound ID in the SI
1474
+ for compound_id in compound_ids:
1475
+ # Search patterns for compound characterization
1476
+ patterns = [
1477
+ rf"(?:compound\s+)?{re.escape(compound_id)}[:\s]*\([^)]+\)", # 6a: (IUPAC name)
1478
+ rf"(?:compound\s+)?{re.escape(compound_id)}[.\s]+[A-Z][^.]+", # 6a. IUPAC name
1479
+ rf"{re.escape(compound_id)}[^:]*:\s*[^.]+", # Any format with colon
1480
+ ]
1481
+
1482
+ for page in self.si_pages:
1483
+ for pattern in patterns:
1484
+ match = re.search(pattern, page, re.I)
1485
+ if match:
1486
+ # Get extended context around the match
1487
+ start = max(0, match.start() - 200)
1488
+ end = min(len(page), match.end() + 500)
1489
+ text_snippets.append(f"=== Compound {compound_id} characterization ===\n{page[start:end]}")
1490
+ break
1491
+
1492
+ # Also search for substrate names mentioned in the reaction to find their IUPAC equivalents
1493
+ # Look for common substrate patterns in compound listings
1494
+ substrate_patterns = [
1495
+ r"(?:substrate|reactant|reagent)s?\s*:?\s*([^.]+)",
1496
+ r"(?:starting\s+material)s?\s*:?\s*([^.]+)",
1497
+ r"\d+\.\s*([A-Za-z\s\-]+)(?:\s*\([^)]+\))?", # numbered compound lists
1498
+ ]
1499
+
1500
+ for pattern in substrate_patterns:
1501
+ for page in self.si_pages[:5]: # Check first few SI pages
1502
+ matches = re.finditer(pattern, page, re.I)
1503
+ for match in matches:
1504
+ text = match.group(0)
1505
+ if len(text) < 200: # Reasonable length check
1506
+ start = max(0, match.start() - 100)
1507
+ end = min(len(page), match.end() + 300)
1508
+ snippet = page[start:end]
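+ # NOTE: these substring checks are chemistry-specific (acrylate/diazirine
+ # substrates); generalize them for other reaction classes.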
1509
+ if "prop-2-enoate" in snippet or "diazirin" in snippet:
1510
+ text_snippets.append(f"=== Substrate characterization ===\n{snippet}")
1511
+ break
1512
+
1513
+ # Also get general IUPAC context
1514
+ iupac_text = self._get_text_around_location(iupac_loc)
1515
+ if iupac_text:
1516
+ # Get more context around the identified location
1517
+ extended_iupac_text = self._get_extended_text_around_location(iupac_loc, before=2000, after=10000)
1518
+ if extended_iupac_text:
1519
+ text_snippets.append(f"=== {iupac_loc} ===\n{extended_iupac_text}")
1520
+ else:
1521
+ text_snippets.append(f"=== {iupac_loc} ===\n{iupac_text}")
1522
+
1523
+ text_context = "\n\n".join(text_snippets)[:35000] # Cap combined context at ~35k chars
1524
+
1525
+ # Extract figure images for model reaction if identified
1526
+ figure_images = {}
1527
+ if locations:
1528
+ # Extract images from model reaction and conditions locations
1529
+ for loc_key in ["model_reaction_location", "conditions_location"]:
1530
+ loc_info = locations.get(loc_key, {})
1531
+ location = loc_info.get("location", "")
1532
+ if location and ("figure" in location.lower() or "fig" in location.lower()):
1533
+ # Extract just the figure reference (e.g., "Figure 2" from "Figure 2. Caption...")
1534
+ fig_match = re.search(r"(Figure\s+\d+|Fig\s+\d+|Scheme\s+\d+)", location, re.I)
1535
+ if fig_match:
1536
+ fig_ref = fig_match.group(1)
1537
+ LOGGER.info("Extracting image for %s from %s", fig_ref, loc_key)
1538
+ img_b64 = self._extract_page_png(fig_ref, extract_figure_only=True)
1539
+ if img_b64:
1540
+ figure_images[fig_ref] = img_b64
1541
+ LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)
1542
+
1543
+ # Extract compound IDs from locations
1544
+ compound_ids = []
1545
+ if locations and locations.get("model_reaction_location", {}).get("compound_ids"):
1546
+ compound_ids = locations["model_reaction_location"]["compound_ids"]
1547
+ LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)
1548
+
1549
+ # Use the 3-tier compound mapping approach if we have compound IDs
1550
+ compound_mappings = {}
1551
+ if compound_ids:
1552
+ LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
1553
+ compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)
1554
+
1555
+ # Add the mapped IUPAC names to the context for better extraction
1556
+ if compound_mappings:
1557
+ mapping_text = "\n\n=== COMPOUND MAPPINGS ===\n"
1558
+ for cid in compound_ids:
1559
+ mapping = compound_mappings.get(cid.lower().strip())
1560
+ if mapping and mapping.iupac_name:
1561
+ mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
1562
+ text_context += mapping_text
1563
+
1564
+ # Include both manuscript and SI text for better coverage
1565
+ prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
1566
+
1567
+ try:
1568
+ # Use multimodal extraction if we have figure images
1569
+ if figure_images:
1570
+ LOGGER.info("Using multimodal extraction with %d figure images", len(figure_images))
1571
+ # Prepare multimodal content
1572
+ content_parts = [prompt]
1573
+
1574
+ # Add figure images
1575
+ for fig_ref, fig_base64 in figure_images.items():
1576
+ try:
1577
+ img_bytes = b64decode(fig_base64)
1578
+ image = PIL.Image.open(io.BytesIO(img_bytes))
1579
+ content_parts.append(f"\n[Figure: {fig_ref}]")
1580
+ content_parts.append(image)
1581
+ except Exception as e:
1582
+ LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)
1583
+
1584
+ # Use multimodal model if we have valid images
1585
+ if len(content_parts) > 1:
1586
+ # Create multimodal request
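+ # Assumes cfg.model_name is a vision-capable Gemini model (the default
+ # gemini-1.5-pro accepts interleaved text and images).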
1587
+ model = genai.GenerativeModel(
1588
+ model_name=self.cfg.model_name,
1589
+ generation_config={
1590
+ "temperature": self.cfg.model_reaction_temperature,
1591
+ "top_p": self.cfg.top_p,
1592
+ "top_k": 1,
1593
+ "max_output_tokens": self.cfg.max_tokens,
1594
+ }
1595
+ )
1596
+
1597
+ response = model.generate_content(content_parts)
1598
+
1599
+ # Parse JSON from response
1600
+ if response and response.text:
1601
+ # Save debug output
1602
+ if self.debug_dir:
1603
+ timestamp = int(time.time())
1604
+ _dump(prompt, self.debug_dir / f"model_reaction_multimodal_prompt_{timestamp}.txt")
1605
+ _dump(response.text, self.debug_dir / f"model_reaction_multimodal_response_{timestamp}.txt")
1606
+
1607
+ # Extract JSON from response
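+ # Gemini frequently wraps JSON in ```json ... ``` fences; strip them before parsing.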
1608
+ text = response.text.strip()
1609
+ if text.startswith("```json"):
1610
+ text = text[7:]
1611
+ if text.endswith("```"):
1612
+ text = text[:-3]
1613
+ data = json.loads(text.strip())
1614
+ else:
1615
+ raise ValueError("Empty response from multimodal model")
1616
+ else:
1617
+ # Fall back to text-only extraction
1618
+ data = generate_json_with_retry(
1619
+ self.model,
1620
+ prompt,
1621
+ temperature=self.cfg.model_reaction_temperature,
1622
+ debug_dir=self.debug_dir,
1623
+ tag="model_reaction"
1624
+ )
1625
+ else:
1626
+ # Standard text-only extraction
1627
+ data = generate_json_with_retry(
1628
+ self.model,
1629
+ prompt,
1630
+ temperature=self.cfg.model_reaction_temperature,
1631
+ debug_dir=self.debug_dir,
1632
+ tag="model_reaction"
1633
+ )
1634
+
1635
+ # Handle the new array format for substrates/products
1636
+ if isinstance(data, dict):
1637
+ # If we have compound mappings, enhance the IUPAC names
1638
+ if compound_ids and compound_mappings:
1639
+ # Try to map substrate/product lists through compound IDs
1640
+ substrate_list = data.get("substrate_iupac_list", [])
1641
+ if isinstance(substrate_list, list):
1642
+ enhanced_substrates = []
1643
+ for item in substrate_list:
1644
+ # Check if it's a compound ID that we can map
1645
+ mapping = compound_mappings.get(str(item).lower().strip())
1646
+ if mapping and mapping.iupac_name:
1647
+ enhanced_substrates.append(mapping.iupac_name)
1648
+ elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
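+ # The pattern matches bare compound IDs such as "6a" or "S12",
+ # which are labels, not IUPAC names.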
1649
+ # Keep valid IUPAC names
1650
+ enhanced_substrates.append(str(item))
1651
+ data["substrate_iupac_list"] = enhanced_substrates
1652
+
1653
+ product_list = data.get("product_iupac_list", [])
1654
+ if isinstance(product_list, list):
1655
+ enhanced_products = []
1656
+ for item in product_list:
1657
+ # Check if it's a compound ID that we can map
1658
+ mapping = compound_mappings.get(str(item).lower().strip())
1659
+ if mapping and mapping.iupac_name:
1660
+ enhanced_products.append(mapping.iupac_name)
1661
+ elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
1662
+ # Keep valid IUPAC names
1663
+ enhanced_products.append(str(item))
1664
+ data["product_iupac_list"] = enhanced_products
1665
+
1666
+ # Validate and convert arrays to semicolon-separated strings for CSV compatibility
1667
+ if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
1668
+ # Filter out non-IUPAC names (abbreviations like "1a", "S1", etc.)
1669
+ valid_substrates = [s for s in data["substrate_iupac_list"]
1670
+ if s and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', s)]
1671
+ # Join with semicolons instead of JSON encoding
1672
+ data["substrate_iupac_list"] = "; ".join(valid_substrates) if valid_substrates else ""
1673
+ else:
1674
+ data["substrate_iupac_list"] = ""
1675
+
1676
+ if "product_iupac_list" in data and isinstance(data["product_iupac_list"], list):
1677
+ # Filter out non-IUPAC names
1678
+ valid_products = [p for p in data["product_iupac_list"]
1679
+ if p and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', p)]
1680
+ # Join with semicolons instead of JSON encoding
1681
+ data["product_iupac_list"] = "; ".join(valid_products) if valid_products else ""
1682
+ else:
1683
+ data["product_iupac_list"] = ""
1684
+
1685
+ except Exception as exc:
1686
+ LOGGER.error("Failed to extract model reaction: %s", exc)
1687
+ data = {
1688
+ "substrate_iupac_list": None,
1689
+ "product_iupac_list": None,
1690
+ "reaction_substrate_concentration": None,
1691
+ "cofactor": None,
1692
+ "reaction_temperature": None,
1693
+ "reaction_ph": None,
1694
+ "reaction_buffer": None,
1695
+ "reaction_other_conditions": None,
1696
+ "error": str(exc)
1697
+ }
1698
+
1699
+ # Ensure all expected keys are present
1700
+ expected_keys = [
1701
+ "substrate_list", "substrate_iupac_list", "product_list", "product_iupac_list",
1702
+ "reaction_substrate_concentration", "cofactor", "reaction_temperature",
1703
+ "reaction_ph", "reaction_buffer", "reaction_other_conditions"
1704
+ ]
1705
+ for key in expected_keys:
1706
+ data.setdefault(key, None)
1707
+
1708
+ return data
1709
+
1710
+ def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
1711
+ """Process a single lineage case - still extract based on location."""
1712
+ # Even for single lineage, use location-based extraction
1713
+ lineage_analysis = {
1714
+ 'has_multiple_lineages': False,
1715
+ 'lineage_groups': [{
1716
+ 'group_id': self._get_base_location(location['location']),
1717
+ 'data_location': location['location'],
1718
+ 'lineage_hint': location.get('lineage_hint', ''),
1719
+ 'caption': location.get('caption', ''),
1720
+ 'confidence': location.get('confidence', 0)
1721
+ }]
1722
+ }
1723
+
1724
+ return self._process_multiple_lineages([location], enzyme_df, lineage_analysis)
1725
+
1726
+ def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
1727
+ enzyme_df: pd.DataFrame,
1728
+ lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
1729
+ """Process multiple lineages by confidence, detecting which enzymes belong to which campaign."""
1730
+ # Get all enzyme IDs
1731
+ all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else enzyme_df['enzyme'].tolist()
1732
+ all_variants = set(all_enzyme_ids)
1733
+ variants_with_data = set()
1734
+ all_results = []
1735
+
1736
+ # If enzyme_df has campaign_id column, we can use it to filter
1737
+ has_campaign_info = 'campaign_id' in enzyme_df.columns
1738
+
1739
+ # Process locations in order of confidence
1740
+ for location in locations:
1741
+ if len(variants_with_data) >= len(all_variants):
1742
+ LOGGER.info("All variants have data, stopping extraction")
1743
+ break
1744
+
1745
+ LOGGER.info("\nProcessing location %s (confidence: %d%%)",
1746
+ location['location'], location.get('confidence', 0))
1747
+
1748
+ # Extract metrics from this location for ALL enzymes
1749
+ metrics_rows = self.extract_metrics_batch(all_enzyme_ids, location['location'])
1750
+
1751
+ # Filter to valid metrics
1752
+ valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
1753
+
1754
+ if not valid_metrics:
1755
+ LOGGER.warning("No valid metrics found in %s", location['location'])
1756
+ continue
1757
+
1758
+ LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), location['location'])
1759
+
1760
+ # Create DataFrame for this location
1761
+ df_location = pd.DataFrame(valid_metrics)
1762
+
1763
+ # Track which variants we got data for
1764
+ new_variants = set(df_location['enzyme'].tolist()) - variants_with_data
1765
+ LOGGER.info("Found data for %d new variants in %s", len(new_variants), location['location'])
1766
+ variants_with_data.update(new_variants)
1767
+
1768
+ # Determine which campaign/lineage this location represents
1769
+ # by checking which variants are present
1770
+ location_variants = set(df_location['enzyme'].tolist())
1771
+
1772
+ # If we have campaign info, determine the campaign for this location
1773
+ campaign_id = None
1774
+ if has_campaign_info:
1775
+ # Find which campaign(s) these variants belong to
1776
+ if 'enzyme_id' in enzyme_df.columns:
1777
+ variant_campaigns = enzyme_df[enzyme_df['enzyme_id'].isin(location_variants)]['campaign_id'].unique()
1778
+ else:
1779
+ variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
1780
+ if len(variant_campaigns) == 1:
1781
+ campaign_id = variant_campaigns[0]
1782
+ LOGGER.info("Location %s contains variants from campaign: %s",
1783
+ location['location'], campaign_id)
1784
+ elif len(variant_campaigns) > 1:
1785
+ LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
1786
+ location['location'], variant_campaigns)
1787
+
1788
+ # Extract model reaction specific to this location/campaign
1789
+ location_context = f"Location: {location['location']}"
1790
+ if location.get('caption'):
1791
+ location_context += f"\nCaption: {location['caption']}"
1792
+
1793
+ # Try to find model reaction for this specific lineage
1794
+ location_model_reaction = self.find_lineage_model_reaction(location['location'], location_context)
1795
+
1796
+ # Get full model reaction info with IUPAC names
1797
+ if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
1798
+ model_info = self._extract_lineage_model_info(location_model_reaction)
1799
+ else:
1800
+ # Fall back to general model reaction extraction
1801
+ # Pass the enzyme variants from this location
1802
+ location_enzymes = df_location['enzyme'].unique().tolist()
1803
+ model_info = self.gather_model_reaction_info(location_enzymes)
1804
+
1805
+ # Add model reaction info to all enzymes from this location
1806
+ for key, value in model_info.items():
1807
+ if isinstance(value, list):
1808
+ value = "; ".join(str(v) for v in value) if value else None
1809
+ df_location[key] = value
1810
+
1811
+ # Add location and campaign info
1812
+ df_location['data_location'] = location['location']
1813
+ df_location['location_type'] = location.get('type', 'unknown')
1814
+ df_location['location_confidence'] = location.get('confidence', 0)
1815
+ # lineage_group column intentionally omitted - the grouping hint proved unreliable
1816
+ # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')
1817
+
1818
+ all_results.append(df_location)
1819
+
1820
+ # Log progress
1821
+ LOGGER.info("Progress: %d/%d variants have data",
1822
+ len(variants_with_data), len(all_variants))
1823
+
1824
+ if all_results:
1825
+ # Combine all results
1826
+ df_combined = pd.concat(all_results, ignore_index=True)
1827
+
1828
+ # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
1829
+ if df_combined.duplicated(subset=['enzyme']).any():
1830
+ LOGGER.info("Removing duplicates, keeping highest confidence data")
1831
+ df_combined = df_combined.sort_values(
1832
+ ['enzyme', 'location_confidence'],
1833
+ ascending=[True, False]
1834
+ ).drop_duplicates(subset=['enzyme'], keep='first')
1835
+
1836
+ # Log extraction summary
1837
+ LOGGER.info("Extraction complete: %d unique variants from %d locations",
1838
+ len(df_combined), len(all_results))
1839
+
1840
+ if 'data_location' in df_combined.columns:
1841
+ for location in df_combined['data_location'].unique():
1842
+ location_enzymes = df_combined[df_combined['data_location'] == location]
1843
+ LOGGER.info(" - %s: %d enzymes", location, len(location_enzymes))
1844
+
1845
+ return df_combined
1846
+ else:
1847
+ LOGGER.warning("No metrics extracted from any location")
1848
+ return pd.DataFrame()
1849
+
1850
+ def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
1851
+ enzyme_df: pd.DataFrame,
1852
+ lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
1853
+ """Process multiple lineages where each location represents a different model reaction."""
1854
+ all_metrics = []
1855
+ lineage_groups = lineage_analysis.get('lineage_groups', [])
1856
+
1857
+ # Get all enzyme IDs for extraction attempts
1858
+ all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []
1859
+
1860
+ for group in lineage_groups:
1861
+ group_location = group.get('data_location')
1862
+ group_id = group.get('group_id')
1863
+
1864
+ # Find the location info
1865
+ location_info = next((loc for loc in locations if loc['location'] == group_location), None)
1866
+ if not location_info:
1867
+ LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
1868
+ continue
1869
+
1870
+ LOGGER.info("Processing location %s (%s)", group_location, group_id)
1871
+
1872
+ # Extract metrics from this location for ALL enzymes
1873
+ # The extractor will return only those that actually have data
1874
+ metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)
1875
+
1876
+ # Filter to enzymes that actually had data in this location
1877
+ valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
1878
+
1879
+ if not valid_metrics:
1880
+ LOGGER.warning("No valid metrics found in %s", group_location)
1881
+ continue
1882
+
1883
+ LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)
1884
+
1885
+ # Create DataFrame for this location
1886
+ df_location = pd.DataFrame(valid_metrics)
1887
+
1888
+ # Extract model reaction specific to this location
1889
+ # Different locations = different model reactions
1890
+ location_context = f"Location: {group_location}"
1891
+ if group.get('caption'):
1892
+ location_context += f"\nCaption: {group['caption']}"
1893
+
1894
+ location_model_reaction = self.find_lineage_model_reaction(group_location, location_context)
1895
+
1896
+ # Get full model reaction info with IUPAC names
1897
+ if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
1898
+ model_info = self._extract_lineage_model_info(location_model_reaction)
1899
+ else:
1900
+ # Try to extract model reaction from this specific location
1901
+ # Pass the enzyme variants that have data in this location
1902
+ location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
1903
+ model_info = self.gather_model_reaction_info(location_enzymes)
1904
+
1905
+ # Add model reaction info to all enzymes from this location
1906
+ for key, value in model_info.items():
1907
+ if isinstance(value, list):
1908
+ value = "; ".join(str(v) for v in value) if value else None
1909
+ df_location[key] = value
1910
+
1911
+ # Add location identifier
1912
+ df_location['data_location'] = group_location
1913
+ # lineage_group column intentionally omitted - the grouping hint proved unreliable
1914
+ # df_location['lineage_group'] = group.get('lineage_hint', group_id)
1915
+
1916
+ all_metrics.append(df_location)
1917
+
1918
+ if all_metrics:
1919
+ # Combine all metrics
1920
+ df_combined = pd.concat(all_metrics, ignore_index=True)
1921
+
1922
+ # Log extraction summary
1923
+ LOGGER.info("Extraction complete: %d total enzymes from %d locations",
1924
+ len(df_combined), len(all_metrics))
1925
+
1926
+ if 'data_location' in df_combined.columns:
1927
+ for location in df_combined['data_location'].unique():
1928
+ location_enzymes = df_combined[df_combined['data_location'] == location]
1929
+ LOGGER.info(" - %s: %d enzymes", location, len(location_enzymes))
1930
+
1931
+ return df_combined
1932
+ else:
1933
+ LOGGER.warning("No metrics extracted from any location")
1934
+ return pd.DataFrame()
1935
+
1936
+ def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
1937
+ """Check if a metrics row contains any valid performance data."""
1938
+ metric_fields = ['yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity']
1939
+
1940
+ for field in metric_fields:
1941
+ if metrics_row.get(field) is not None:
1942
+ return True
1943
+
1944
+ # Also check other_metrics
1945
+ if metrics_row.get('other_metrics') and isinstance(metrics_row['other_metrics'], dict):
1946
+ if metrics_row['other_metrics']: # Non-empty dict
1947
+ return True
1948
+
1949
+ return False
1950
+
1951
+ def _filter_locations_by_campaign(self, locations: List[Dict[str, Any]],
1952
+ enzyme_df: pd.DataFrame) -> List[Dict[str, Any]]:
1953
+ """Filter locations to only those relevant to the current campaign."""
1954
+ if not self.campaign_filter or 'campaign_id' not in enzyme_df.columns:
1955
+ return locations
1956
+
1957
+ # Get enzyme names for this campaign
1958
+ enzyme_col = 'enzyme_id' if 'enzyme_id' in enzyme_df.columns else 'enzyme'
+ campaign_enzymes = enzyme_df[enzyme_df['campaign_id'] == self.campaign_filter][enzyme_col].tolist()
1959
+
1960
+ # Extract any common patterns from enzyme names
1961
+ enzyme_patterns = set()
1962
+ for enzyme in campaign_enzymes:
1963
+ # Extract any uppercase abbreviations (e.g., 'PYS', 'INS')
1964
+ matches = re.findall(r'[A-Z]{2,}', enzyme)
1965
+ enzyme_patterns.update(matches)
1966
+
1967
+ LOGGER.info("Campaign %s has enzyme patterns: %s", self.campaign_filter, enzyme_patterns)
1968
+
1969
+ # Get campaign description keywords from the campaign data if available
1970
+ campaign_keywords = set()
1971
+ # Extract keywords from campaign_id (e.g., 'pyrrolidine_synthase_evolution' -> ['pyrrolidine', 'synthase'])
1972
+ words = self.campaign_filter.lower().replace('_', ' ').split()
1973
+ # Filter out generic words
1974
+ generic_words = {'evolution', 'campaign', 'synthase', 'enzyme', 'variant'}
1975
+ campaign_keywords.update(word for word in words if word not in generic_words and len(word) > 3)
1976
+
1977
+ LOGGER.info("Campaign keywords: %s", campaign_keywords)
1978
+
1979
+ # Filter locations based on campaign clues
1980
+ filtered = []
1981
+ for loc in locations:
1982
+ # Check caption and clues for campaign indicators
1983
+ caption = loc.get('caption', '').lower()
1984
+ campaign_clues = loc.get('campaign_clues', '').lower()
1985
+ lineage_hint = loc.get('lineage_hint', '').lower()
1986
+ combined_text = caption + ' ' + campaign_clues + ' ' + lineage_hint
1987
+
1988
+ # Check if location is relevant to this campaign
1989
+ is_relevant = False
1990
+
1991
+ # Check for enzyme patterns
1992
+ for pattern in enzyme_patterns:
1993
+ if pattern.lower() in combined_text:
1994
+ is_relevant = True
1995
+ break
1996
+
1997
+ # Check for campaign keywords
1998
+ if not is_relevant:
1999
+ for keyword in campaign_keywords:
2000
+ if keyword in combined_text:
2001
+ is_relevant = True
2002
+ break
2003
+
2004
+ # Check if any campaign enzymes are explicitly mentioned
2005
+ if not is_relevant:
2006
+ for enzyme in campaign_enzymes[:5]: # Check first few enzymes
2007
+ if enzyme.lower() in combined_text:
2008
+ is_relevant = True
2009
+ break
2010
+
2011
+ if is_relevant:
2012
+ filtered.append(loc)
2013
+ LOGGER.info("Location %s is relevant to campaign %s",
2014
+ loc.get('location'), self.campaign_filter)
2015
+ else:
2016
+ LOGGER.debug("Location %s filtered out for campaign %s",
2017
+ loc.get('location'), self.campaign_filter)
2018
+
2019
+ return filtered
2020
+
2021
+ def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
2022
+ """Extract full model reaction info including IUPAC names for a lineage."""
2023
+ # Get substrate/product IDs from lineage-specific extraction
2024
+ substrate_ids = lineage_reaction.get('substrate_ids', [])
2025
+ product_ids = lineage_reaction.get('product_ids', [])
2026
+
2027
+ # Get general model reaction info for conditions
2028
+ general_info = self.gather_model_reaction_info()
2029
+
2030
+ # Override substrate/product lists with lineage-specific ones only if they contain actual compound IDs
2031
+ model_info = general_info.copy()
2032
+
2033
+ # Check if substrate_ids contain actual compound IDs (not generic terms like "alkyl azide")
2034
+ if substrate_ids and any(re.match(r'^[0-9]+[a-z]?$|^[A-Z][0-9]+$', sid) for sid in substrate_ids):
2035
+ model_info['substrate_list'] = substrate_ids
2036
+ elif not substrate_ids and general_info.get('substrate_list'):
2037
+ # Keep the general info if lineage extraction found nothing
2038
+ pass
2039
+ else:
2040
+ model_info['substrate_list'] = substrate_ids
2041
+
2042
+ # Check if product_ids contain actual compound IDs (not generic terms like "pyrrolidine")
2043
+ if product_ids and any(re.match(r'^[0-9]+[a-z]?$|^[A-Z][0-9]+$', pid) for pid in product_ids):
2044
+ model_info['product_list'] = product_ids
2045
+ elif not product_ids and general_info.get('product_list'):
2046
+ # Keep the general info if lineage extraction found nothing
2047
+ pass
2048
+ else:
2049
+ # If we only have generic terms, try to keep general info if available
2050
+ if general_info.get('product_list') and all(len(pid) > 5 for pid in product_ids):
2051
+ # Likely generic terms like "pyrrolidine", keep general info
2052
+ pass
2053
+ else:
2054
+ model_info['product_list'] = product_ids
2055
+
2056
+ # Extract IUPAC names for the compounds we're actually using
2057
+ # Use the IDs from model_info (which may have been preserved from general extraction)
2058
+ final_substrate_ids = model_info.get('substrate_list', [])
2059
+ final_product_ids = model_info.get('product_list', [])
2060
+ all_compound_ids = final_substrate_ids + final_product_ids
2061
+
2062
+ if all_compound_ids:
2063
+ compound_mappings = self._extract_compound_mappings_adaptive(all_compound_ids)
2064
+
2065
+ # Map substrate IUPAC names
2066
+ substrate_iupacs = []
2067
+ for sid in final_substrate_ids:
2068
+ mapping = compound_mappings.get(str(sid).lower().strip())
2069
+ if mapping and mapping.iupac_name:
2070
+ substrate_iupacs.append(mapping.iupac_name)
2071
+ # Only update if we found IUPAC names
2072
+ if substrate_iupacs:
2073
+ model_info['substrate_iupac_list'] = substrate_iupacs
2074
+
2075
+ # Map product IUPAC names
2076
+ product_iupacs = []
2077
+ for pid in final_product_ids:
2078
+ mapping = compound_mappings.get(str(pid).lower().strip())
2079
+ if mapping and mapping.iupac_name:
2080
+ product_iupacs.append(mapping.iupac_name)
2081
+ # Only update if we found IUPAC names
2082
+ if product_iupacs:
2083
+ model_info['product_iupac_list'] = product_iupacs
2084
+
2085
+ return model_info
2086
+
2087
+ def _process_single_lineage_by_confidence(self, locations: List[Dict[str, Any]],
2088
+ enzyme_df: pd.DataFrame) -> pd.DataFrame:
2089
+ """Process single lineage by confidence, stopping when all variants have data."""
2090
+ # Get list of all variants we need data for
2091
+ all_variants = set(enzyme_df['enzyme'].tolist() if 'enzyme' in enzyme_df.columns else
2092
+ enzyme_df['enzyme_id'].tolist())
2093
+ variants_with_data = set()
2094
+ all_results = []
2095
+
2096
+ # Process locations in order of confidence
2097
+ for location in locations:
2098
+ if len(variants_with_data) >= len(all_variants):
2099
+ LOGGER.info("All variants have data, stopping extraction")
2100
+ break
2101
+
2102
+ LOGGER.info("\nProcessing location %s (confidence: %d%%)",
2103
+ location['location'], location.get('confidence', 0))
2104
+
2105
+ # Extract metrics from this location
2106
+ metrics_rows = self.extract_metrics_batch(list(all_variants), location['location'])
2107
+
2108
+ # Filter to valid metrics
2109
+ valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
2110
+
2111
+ if not valid_metrics:
2112
+ LOGGER.warning("No valid metrics found in %s", location['location'])
2113
+ continue
2114
+
2115
+ # Create DataFrame for this location
2116
+ df_location = pd.DataFrame(valid_metrics)
2117
+
2118
+ # Track which variants we got data for
2119
+ new_variants = set(df_location['enzyme'].tolist()) - variants_with_data
2120
+ LOGGER.info("Found data for %d new variants in %s", len(new_variants), location['location'])
2121
+ variants_with_data.update(new_variants)
2122
+
2123
+ # Add location info
2124
+ df_location['data_location'] = location['location']
2125
+ df_location['location_type'] = location.get('type', 'unknown')
2126
+ df_location['location_confidence'] = location.get('confidence', 0)
2127
+
2128
+ all_results.append(df_location)
2129
+
2130
+ # Log progress
2131
+ LOGGER.info("Progress: %d/%d variants have data",
2132
+ len(variants_with_data), len(all_variants))
2133
+
2134
+ if all_results:
2135
+ # Combine all results
2136
+ df_combined = pd.concat(all_results, ignore_index=True)
2137
+
2138
+ # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
2139
+ if df_combined.duplicated(subset=['enzyme']).any():
2140
+ LOGGER.info("Removing duplicates, keeping highest confidence data")
2141
+ df_combined = df_combined.sort_values(
2142
+ ['enzyme', 'location_confidence'],
2143
+ ascending=[True, False]
2144
+ ).drop_duplicates(subset=['enzyme'], keep='first')
2145
+
2146
+ # Extract model reaction info once
2147
+ # Pass the enzyme variants we're processing
2148
+ enzyme_list = df_combined['enzyme'].unique().tolist()
2149
+ model_info = self.gather_model_reaction_info(enzyme_list)
2150
+
2151
+ # Add model reaction info to all rows
2152
+ for key, value in model_info.items():
2153
+ if isinstance(value, list):
2154
+ value = "; ".join(str(v) for v in value) if value else None
2155
+ df_combined[key] = value
2156
+
2157
+ LOGGER.info("Extraction complete: %d unique variants with data", len(df_combined))
2158
+
2159
+ return df_combined
2160
+ else:
2161
+ LOGGER.warning("No metrics extracted from any location")
2162
+ return pd.DataFrame()
2163
+
2164
+ # ------------------------------------------------------------------
2165
+ # 6.5 Public orchestrator
2166
+ # ------------------------------------------------------------------
2167
+
2168
+ def run(self, enzyme_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
2169
+ # This module should always have enzyme CSV provided
2170
+ if enzyme_df is None:
2171
+ LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
2172
+ return pd.DataFrame()
2173
+
2174
+ # Check if we have campaign_id column - if so, process each campaign separately
2175
+ if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
2176
+ campaigns = enzyme_df['campaign_id'].unique()
2177
+ if len(campaigns) > 1:
2178
+ LOGGER.info("Detected %d campaigns in enzyme data - processing each separately", len(campaigns))
2179
+ all_campaign_results = []
2180
+
2181
+ for campaign_id in campaigns:
2182
+ LOGGER.info("\n" + "="*60)
2183
+ LOGGER.info("Processing campaign: %s", campaign_id)
2184
+ LOGGER.info("="*60)
2185
+
2186
+ # Create a new extractor instance for this campaign
2187
+ campaign_extractor = ReactionExtractor(
2188
+ manuscript=self.manuscript,
2189
+ si=self.si,
2190
+ cfg=self.cfg,
2191
+ debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
2192
+ campaign_filter=campaign_id
2193
+ )
2194
+
2195
+ # Run extraction for this campaign
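+ # The child instance receives the full enzyme_df and filters it internally via its campaign_filter.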
2196
+ campaign_df = campaign_extractor.run(enzyme_df)
2197
+
2198
+ if not campaign_df.empty:
2199
+ # Add campaign identifier
2200
+ campaign_df['campaign_id'] = campaign_id
2201
+ all_campaign_results.append(campaign_df)
2202
+ LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)
2203
+
2204
+ # Combine results from all campaigns
2205
+ if all_campaign_results:
2206
+ combined_df = pd.concat(all_campaign_results, ignore_index=True)
2207
+ LOGGER.info("\nCombined extraction complete: %d total reactions across %d campaigns",
2208
+ len(combined_df), len(campaigns))
2209
+ return combined_df
2210
+ else:
2211
+ LOGGER.warning("No reactions extracted from any campaign")
2212
+ return pd.DataFrame()
2213
+
2214
+ # Filter by campaign if specified
2215
+ if self.campaign_filter and 'campaign_id' in enzyme_df.columns:
2216
+ LOGGER.info("Filtering enzymes for campaign: %s", self.campaign_filter)
2217
+ enzyme_df = enzyme_df[enzyme_df['campaign_id'] == self.campaign_filter].copy()
2218
+ LOGGER.info("Found %d enzymes for campaign %s", len(enzyme_df), self.campaign_filter)
2219
+ if len(enzyme_df) == 0:
2220
+ LOGGER.warning("No enzymes found for campaign %s", self.campaign_filter)
2221
+ return pd.DataFrame()
2222
+
2223
+ # Find all locations with performance data
2224
+ locations = self.find_reaction_locations()
2225
+ if not locations:
2226
+ LOGGER.error("Failed to find reaction data locations")
2227
+ return pd.DataFrame()
2228
+
2229
+ # Filter locations by campaign if specified
2230
+ if self.campaign_filter:
2231
+ filtered_locations = self._filter_locations_by_campaign(locations, enzyme_df)
2232
+ if filtered_locations:
2233
+ LOGGER.info("Filtered to %d locations for campaign %s",
2234
+ len(filtered_locations), self.campaign_filter)
2235
+ locations = filtered_locations
2236
+ else:
2237
+ LOGGER.warning("No locations found specifically for campaign %s, using all locations",
2238
+ self.campaign_filter)
2239
+
2240
+ # Sort locations by confidence (highest first) and prefer tables over figures
2241
+ locations_sorted = sorted(locations, key=lambda x: (
2242
+ x.get('confidence', 0),
2243
+ 1 if x.get('type') == 'table' else 0 # Prefer tables when confidence is equal
2244
+ ), reverse=True)
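+ # e.g. a table at 90% confidence sorts before a figure at 90%, which sorts before anything at 80%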
2245
+
2246
+ LOGGER.info("Found %d reaction data location(s), sorted by confidence:", len(locations_sorted))
2247
+ for loc in locations_sorted:
2248
+ LOGGER.info(" - %s (%s, confidence: %d%%)",
2249
+ loc.get('location'),
2250
+ loc.get('type'),
2251
+ loc.get('confidence', 0))
2252
+
2253
+ # Analyze if we have multiple lineages
2254
+ lineage_analysis = self.analyze_lineage_groups(locations_sorted, enzyme_df)
2255
+ has_multiple_lineages = lineage_analysis.get('has_multiple_lineages', False)
2256
+
2257
+ if has_multiple_lineages:
2258
+ LOGGER.info("Multiple lineage groups detected")
2259
+ return self._process_multiple_lineages_by_confidence(locations_sorted, enzyme_df, lineage_analysis)
2260
+ else:
2261
+ LOGGER.info("Single lineage detected, using confidence-based processing")
2262
+ return self._process_single_lineage_by_confidence(locations_sorted, enzyme_df)
2263
+
2264
+ ###############################################################################
2265
+ # 7 - MERGE WITH LINEAGE CSV + SAVE
2266
+ ###############################################################################
2267
+
2268
+ def merge_with_lineage_data(
2269
+ df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
2270
+ ) -> pd.DataFrame:
2271
+ """Outer-merge on 'enzyme' column. Left CSV defines desired row order."""
2272
+
2273
+ # Handle both 'enzyme' and 'enzyme_id' column names
2274
+ if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
2275
+ df_lineage = df_lineage.rename(columns={"enzyme_id": "enzyme"})
2276
+
2277
+ if "enzyme" not in df_lineage.columns:
2278
+ raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")
2279
+
2280
+ merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
2281
+ return merged
2282
+
2283
+ ###############################################################################
2284
+ # 8 - CLI ENTRY-POINT
2285
+ ###############################################################################
2286
+
2287
+ def build_parser() -> argparse.ArgumentParser:
2288
+ p = argparse.ArgumentParser(
2289
+ description="Extract enzyme reaction metrics from chemistry PDFs",
2290
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
2291
+ )
2292
+ p.add_argument("--manuscript", required=True, type=Path)
2293
+ p.add_argument("--si", type=Path, help="Supporting-information PDF")
2294
+ p.add_argument("--lineage-csv", type=Path)
2295
+ p.add_argument("--output", type=Path, default=Path("reaction_metrics.csv"))
2296
+ p.add_argument("--verbose", action="store_true")
2297
+ p.add_argument(
2298
+ "--debug-dir",
2299
+ metavar="DIR",
2300
+ help="Write ALL intermediate artefacts (prompts, raw Gemini replies) to DIR",
2301
+ )
2302
+ return p
2303
+
2304
+ def main() -> None:
2305
+ args = build_parser().parse_args()
2306
+ if args.verbose:
2307
+ LOGGER.setLevel(logging.DEBUG)
2308
+ cfg = Config()
2309
+ extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)
2310
+
2311
+ # Load enzyme data from CSV if provided
2312
+ enzyme_df = None
2313
+ if args.lineage_csv and args.lineage_csv.exists():
2314
+ LOGGER.info("Loading enzyme data from CSV…")
2315
+ enzyme_df = pd.read_csv(args.lineage_csv)
2316
+
2317
+ # Run extraction with enzyme data
2318
+ df_metrics = extractor.run(enzyme_df)
2319
+
2320
+ if args.lineage_csv and args.lineage_csv.exists() and not df_metrics.empty:
2321
+ LOGGER.info("Merging with lineage CSV…")
2322
+ df_final = merge_with_lineage_data(enzyme_df, df_metrics)
2323
+ else:
2324
+ df_final = df_metrics
2325
+
2326
+ df_final.to_csv(args.output, index=False)
2327
+ LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
2328
+
2329
+ if __name__ == "__main__":
2330
+ main()
2331
+
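+ # Example invocation (file names are illustrative):
+ # python reaction_info_extractor_clean.py \
+ # --manuscript manuscript.pdf --si si.pdf \
+ # --lineage-csv enzyme_lineage.csv --output reaction_metrics.csv \
+ # --debug-dir debug_artifacts/ --verbose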