debase 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,7 @@ Key June 2025 additions
  from __future__ import annotations

  import argparse
+ import hashlib
  import json
  import logging
  import os
@@ -31,7 +32,9 @@ import re
  import sys
  import time
  from base64 import b64encode, b64decode
+ from collections import OrderedDict
  from dataclasses import dataclass, field
+ from functools import lru_cache
  from pathlib import Path
  from textwrap import dedent
  from typing import Any, Dict, List, Optional, Tuple
@@ -40,7 +43,6 @@ import fitz # PyMuPDF - for image extraction
  import google.generativeai as genai  # type: ignore
  import pandas as pd
  from PyPDF2 import PdfReader
- import PIL.Image
  import io

  ###############################################################################
@@ -51,7 +53,7 @@ import io
  class Config:
      """Centralised tunables so tests can override them easily."""

-     model_name: str = "gemini-1.5-pro-latest"
+     model_name: str = "gemini-2.5-flash"
      location_temperature: float = 0.2
      extract_temperature: float = 0.0
      model_reaction_temperature: float = 0.0
@@ -117,6 +119,144 @@ def get_model(cfg: Config):
      genai.configure(api_key=api_key)
      return genai.GenerativeModel(cfg.model_name)

+ # Bounded LRU caches to store prompt/image content by hash (prevents memory leaks)
+
+ class LRUCache:
+     """Simple LRU cache implementation."""
+     def __init__(self, maxsize: int):
+         self.maxsize = maxsize
+         self.cache = OrderedDict()
+
+     def get(self, key: str) -> Optional[str]:
+         if key in self.cache:
+             # Move to end (most recently used)
+             self.cache.move_to_end(key)
+             return self.cache[key]
+         return None
+
+     def put(self, key: str, value: str) -> None:
+         if key in self.cache:
+             # Update existing
+             self.cache.move_to_end(key)
+         else:
+             # Add new, evict oldest if needed
+             if len(self.cache) >= self.maxsize:
+                 self.cache.popitem(last=False)
+         self.cache[key] = value
+
+     def __len__(self) -> int:
+         return len(self.cache)
+
+ # Global bounded caches
+ _PROMPT_CACHE = LRUCache(maxsize=1000)
+ _IMAGE_CACHE = LRUCache(maxsize=500)  # Images are larger, so smaller cache
+
+ def get_cache_stats() -> Dict[str, Any]:
+     """Get cache statistics for debugging."""
+     return {
+         "gemini_cache_info": _cached_gemini_call.cache_info(),
+         "prompt_cache_size": len(_PROMPT_CACHE),
+         "image_cache_size": len(_IMAGE_CACHE),
+     }
+
+ @lru_cache(maxsize=1000)
+ def _cached_gemini_call(
+     model_name: str,
+     prompt_hash: str,
+     image_hash: Optional[str],
+     temperature: float,
+     max_retries: int,
+ ) -> str:
+     """Pure cached function for Gemini API calls using only hash keys.
+
+     Args:
+         model_name: Name of the Gemini model
+         prompt_hash: SHA256 hash of the prompt
+         image_hash: SHA256 hash of the image (if any)
+         temperature: Temperature for generation
+         max_retries: Maximum number of retries
+
+     Returns:
+         Raw response text from Gemini
+     """
+     # Retrieve actual content from LRU cache
+     prompt = _PROMPT_CACHE.get(prompt_hash)
+     image_b64 = _IMAGE_CACHE.get(image_hash) if image_hash else None
+
+     if prompt is None:
+         raise RuntimeError(f"Prompt content not found for hash {prompt_hash}")
+
+     # Configure API key (this is idempotent)
+     api_key = os.getenv("GEMINI_API_KEY")
+     if not api_key:
+         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
+     genai.configure(api_key=api_key)
+
+     # Create model instance (not cached since it's lightweight)
+     model = genai.GenerativeModel(model_name)
+
+     for attempt in range(1, max_retries + 1):
+         try:
+             # Handle image if provided
+             if image_b64:
+                 # Decode base64 string to bytes for Gemini API
+                 image_bytes = b64decode(image_b64)
+                 parts = [prompt, {"mime_type": "image/png", "data": image_bytes}]
+             else:
+                 parts = [prompt]
+
+             resp = model.generate_content(
+                 parts,
+                 generation_config={
+                     "temperature": temperature,
+                     "max_output_tokens": 8192,
+                 }
+             )
+             # Track token usage if available
+             try:
+                 if hasattr(resp, 'usage_metadata'):
+                     input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+                     output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+                     if input_tokens or output_tokens:
+                         try:
+                             from .wrapper import add_token_usage
+                             add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                         except ImportError:
+                             pass  # wrapper not available
+             except Exception:
+                 pass  # token tracking is best-effort
+
+             return resp.text.strip()
+         except Exception as exc:
+             if attempt == max_retries:
+                 raise
+             time.sleep(2 ** attempt)
+
+     # Should never reach here
+     raise RuntimeError("Max retries exceeded")
+
+ def _normalize_prompt_for_caching(prompt: str) -> str:
+     """Normalize prompt for better cache hit rates by removing boilerplate and collapsing whitespace."""
+     # Remove common boilerplate lines that don't affect the core query
+     lines = prompt.split('\n')
+     normalized_lines = []
+
+     for line in lines:
+         # Skip timestamp and debug lines
+         if any(skip in line.lower() for skip in ['timestamp:', 'length:', 'characters', '===', '***']):
+             continue
+         # Skip lines that are just separators
+         if line.strip() and not line.strip().replace('=', '').replace('-', '').replace('*', ''):
+             continue
+         # Collapse whitespace but preserve structure
+         normalized_lines.append(' '.join(line.split()))
+
+     # Join and collapse multiple newlines
+     normalized = '\n'.join(normalized_lines)
+     normalized = re.sub(r'\n\s*\n+', '\n\n', normalized)
+
+     return normalized.strip()
+
  def generate_json_with_retry(
      model,
      prompt: str,
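
Worth noting for readers of this hunk: functools.lru_cache keys must be hashable and ideally small, so the new code keys the call on SHA-256 digests and parks the bulky prompt/image payloads in the bounded side caches. A minimal sketch of the same pattern, with a hypothetical expensive_api() standing in for the Gemini call:

    import hashlib
    from functools import lru_cache

    _PAYLOADS: dict = {}  # stand-in for the bounded _PROMPT_CACHE above

    @lru_cache(maxsize=128)
    def _call_by_hash(payload_hash: str) -> str:
        # The lru_cache key stays small; the payload is recovered by hash.
        return expensive_api(_PAYLOADS[payload_hash])

    def call(payload: str) -> str:
        digest = hashlib.sha256(payload.encode()).hexdigest()
        _PAYLOADS[digest] = payload
        return _call_by_hash(digest)

One caveat that applies to the real code too: if the side cache evicts a payload before a later cache miss re-runs the call, the lookup fails (a KeyError here; the explicit RuntimeError in _cached_gemini_call above).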
@@ -129,9 +269,17 @@ def generate_json_with_retry(
      image_b64: Optional[str] = None,
  ):
      """Call Gemini with retries & exponential back-off, returning parsed JSON."""
+     # Generate cache keys based on normalized prompt and image content
+     normalized_prompt = _normalize_prompt_for_caching(prompt)
+     prompt_hash = hashlib.sha256(normalized_prompt.encode()).hexdigest()
+     image_hash = hashlib.sha256(image_b64.encode()).hexdigest() if image_b64 else None
+
      # Log prompt details
      LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
      LOGGER.info("Prompt length: %d characters", len(prompt))
+     LOGGER.info("Prompt hash: %s", prompt_hash[:16])
+     if image_hash:
+         LOGGER.info("Image hash: %s", image_hash[:16])
      LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])

      # Save full prompt to debug directory if provided
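
Cache hits therefore hinge on _normalize_prompt_for_caching. A hand-traced example (assumed input, checked against the helper as added above):

    raw = "=== PROMPT ===\nTimestamp: 2025-06-01\n\n  Extract   yield\n\nfor variants"
    _normalize_prompt_for_caching(raw)
    # -> "Extract yield\n\nfor variants"
    # The '===' and 'Timestamp:' lines are dropped, per-line whitespace is
    # collapsed, and runs of blank lines are squeezed to a single one.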
@@ -139,107 +287,142 @@ def generate_json_with_retry(
          debug_path = Path(debug_dir)
          debug_path.mkdir(parents=True, exist_ok=True)
          prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
-         _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\n{'='*80}\n\n{prompt}",
+         _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{prompt}",
                prompt_file)
          LOGGER.info("Full prompt saved to: %s", prompt_file)

-     fence_re = re.compile(r"```json|```", re.I)
-     for attempt in range(1, max_retries + 1):
-         try:
-             LOGGER.info("Calling Gemini API (attempt %d/%d)...", attempt, max_retries)
-
-             # Handle image if provided
-             if image_b64:
-                 parts = [prompt, {"mime_type": "image/png", "data": image_b64}]
-             else:
-                 parts = [prompt]
-
-             resp = model.generate_content(
-                 parts,
-                 generation_config={
-                     "temperature": temperature,
-                     "max_output_tokens": 4096,
-                 }
-             )
-             raw = resp.text.strip()
-
-             # Log response
-             LOGGER.info("Gemini response length: %d characters", len(raw))
-             LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
-
-             # Save full response to debug directory
-             if debug_dir:
-                 response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
-                 _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}",
-                       response_file)
-                 LOGGER.info("Full response saved to: %s", response_file)
+     try:
+         # Store content in bounded LRU caches for the cached function to retrieve
+         _PROMPT_CACHE.put(prompt_hash, prompt)
+         if image_hash and image_b64:
+             _IMAGE_CACHE.put(image_hash, image_b64)
+
+         # Check if this will be a cache hit
+         cache_info_before = _cached_gemini_call.cache_info()
+
+         # Use cached Gemini call (only with hash keys)
+         LOGGER.info("Calling cached Gemini API...")
+         raw = _cached_gemini_call(
+             model_name=model.model_name,
+             prompt_hash=prompt_hash,
+             image_hash=image_hash,
+             temperature=temperature,
+             max_retries=max_retries,
+         )
+
+         # Log cache performance
+         cache_info_after = _cached_gemini_call.cache_info()
+         if cache_info_after.hits > cache_info_before.hits:
+             LOGGER.info("✓ Cache HIT for prompt hash %s", prompt_hash[:16])
+         else:
+             LOGGER.info("✗ Cache MISS for prompt hash %s", prompt_hash[:16])
+
+         # Log response
+         LOGGER.info("Gemini response length: %d characters", len(raw))
+         LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
+
+         # Save full response to debug directory
+         if debug_dir:
+             response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+             _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{raw}",
+                   response_file)
+             LOGGER.info("Full response saved to: %s", response_file)

-             # Remove common Markdown fences
-             if raw.startswith("```"):
-                 raw = fence_re.sub("", raw).strip()
-
-             # Try to find JSON in the response
-             # First, try to parse as-is
+         # Remove common Markdown fences more carefully
+         if raw.startswith("```json"):
+             raw = raw[7:].strip()  # Remove ```json
+         elif raw.startswith("```"):
+             raw = raw[3:].strip()  # Remove ```
+
+         if raw.endswith("```"):
+             raw = raw[:-3].strip()  # Remove trailing ```
+
+
+         # Simple JSON parsing approach
+         # Try direct parsing first
+         LOGGER.debug(f"Raw JSON length: {len(raw)}")
+         LOGGER.debug(f"Raw JSON first 200 chars: {raw[:200]}")
+         LOGGER.debug(f"Raw JSON last 200 chars: {raw[-200:]}")
+
+         try:
+             parsed = json.loads(raw)
+         except json.JSONDecodeError as e:
+             LOGGER.error(f"JSON parsing failed at position {e.pos}: {e}")
+             LOGGER.error(f"Character at error: {repr(raw[e.pos] if e.pos < len(raw) else 'END')}")
+             LOGGER.error(f"Context: {repr(raw[max(0, e.pos-20):e.pos+20])}")
+
+             # Count braces and quotes for debugging
+             open_braces = raw.count('{')
+             close_braces = raw.count('}')
+             quotes = raw.count('"')
+             LOGGER.error(f"Braces: {open_braces} open, {close_braces} close. Quotes: {quotes}")
+
+             # If that fails, try to extract JSON from the response using a simpler method
              try:
-                 parsed = json.loads(raw)
-             except json.JSONDecodeError:
-                 # If that fails, look for JSON array or object
-                 # Find the first '[' or '{' and the matching closing bracket
-                 json_start = -1
-                 json_end = -1
-                 bracket_stack = []
-                 in_string = False
-                 escape_next = False
+                 # Look for the JSON object start and end
+                 start_idx = raw.find('{')
+                 if start_idx == -1:
+                     raise json.JSONDecodeError("No JSON object found", raw, 0)

-                 for i, char in enumerate(raw):
-                     if escape_next:
-                         escape_next = False
-                         continue
+                 # Find the matching closing brace by counting
+                 brace_count = 0
+                 end_idx = -1
+                 for i in range(start_idx, len(raw)):
+                     if raw[i] == '{':
+                         brace_count += 1
+                     elif raw[i] == '}':
+                         brace_count -= 1
+                         if brace_count == 0:
+                             end_idx = i + 1
+                             break
+
+                 if end_idx == -1:
+                     raise json.JSONDecodeError("No matching closing brace found", raw, 0)
+
+                 json_str = raw[start_idx:end_idx]
+                 LOGGER.debug(f"Extracted JSON string: {json_str[:200]}...")
+                 parsed = json.loads(json_str)
+
+             except json.JSONDecodeError:
+                 # Final fallback - try to use eval as a last resort (unsafe but functional)
+                 try:
+                     # Replace problematic characters and try to parse as Python dict
+                     safe_raw = raw.replace('null', 'None').replace('true', 'True').replace('false', 'False')
+                     start_idx = safe_raw.find('{')
+                     if start_idx == -1:
+                         raise ValueError("No dict found")

-                     if char == '\\':
-                         escape_next = True
-                         continue
-
-                     if char == '"' and not escape_next:
-                         in_string = not in_string
-                         continue
+                     brace_count = 0
+                     end_idx = -1
+                     for i in range(start_idx, len(safe_raw)):
+                         if safe_raw[i] == '{':
+                             brace_count += 1
+                         elif safe_raw[i] == '}':
+                             brace_count -= 1
+                             if brace_count == 0:
+                                 end_idx = i + 1
+                                 break

-                     if in_string:
-                         continue
+                     if end_idx == -1:
+                         raise ValueError("No matching closing brace found")

-                     if char in '[{':
-                         if json_start == -1:
-                             json_start = i
-                         bracket_stack.append(char)
-                     elif char in ']}':
-                         if bracket_stack:
-                             opening = bracket_stack.pop()
-                             if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
-                                 if not bracket_stack:  # Found complete JSON
-                                     json_end = i + 1
-                                     break
-
-                 if json_start >= 0 and json_end > json_start:
-                     # Extract the JSON portion
-                     json_str = raw[json_start:json_end]
-                     parsed = json.loads(json_str)
-                 else:
-                     # Look for simple [] in the response
+                     dict_str = safe_raw[start_idx:end_idx]
+                     parsed = eval(dict_str)  # This is unsafe but we trust our own generated content
+                     LOGGER.warning("Used eval() fallback for JSON parsing")
+
+                 except Exception:
+                     # If all else fails, return empty dict
+                     LOGGER.error("All JSON parsing methods failed")
                      if '[]' in raw:
                          parsed = []
                      else:
-                         # No JSON structure found, re-raise the original error
                          raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
-             LOGGER.info("Successfully parsed JSON response")
-             return parsed
-         except Exception as exc:
-             LOGGER.warning(
-                 "Gemini call failed (attempt %d/%d): %s",
-                 attempt, max_retries, exc,
-             )
-             if attempt == max_retries:
-                 raise
-             time.sleep(2 ** attempt)
+
+         LOGGER.info("Successfully parsed JSON response")
+         return parsed
+     except Exception as exc:
+         LOGGER.error("Cached Gemini call failed: %s", exc)
+         raise


  ###############################################################################
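
The new fallback parser trades the old string-aware bracket scanner for plain brace counting. A self-contained sketch of that technique:

    import json

    def extract_first_json_object(raw: str):
        # Mirrors the brace-counting fallback in generate_json_with_retry.
        start = raw.find('{')
        if start == -1:
            raise ValueError("no JSON object found")
        depth = 0
        for i in range(start, len(raw)):
            if raw[i] == '{':
                depth += 1
            elif raw[i] == '}':
                depth -= 1
                if depth == 0:
                    return json.loads(raw[start:i + 1])
        raise ValueError("unbalanced braces")

The trade-off is visible in the sketch: a bare counter also counts braces inside string values, so a response like {"note": "a stray } here"} truncates the candidate slice and json.loads then fails; the removed scanner tracked in_string/escape_next precisely to avoid that. The eval() fallback that follows in the diff executes model output as Python and is only defensible for fully trusted responses.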
@@ -248,14 +431,14 @@ def generate_json_with_retry(
  PROMPT_FIND_LOCATIONS = dedent("""
  You are an expert reader of protein engineering manuscripts.
- Given the following article captions and section titles, identify ALL locations
+ Given the following article captions and section titles, identify the most promising locations
  (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
- activity, etc.) for enzyme variants.
+ activity, etc.) for enzyme variants. Use your best judgement to include locations showing full evolution lineage data.

  IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
  performance data locations. Pay careful attention to:
  - The caption text to identify which campaign/lineage the data is for
- - Enzyme name prefixes (e.g., PYS vs INS) that indicate different campaigns
+ - Enzyme name prefixes that indicate different campaigns
  - Different substrate/product types mentioned in captions

  Respond with a JSON array where each element contains:
@@ -267,8 +450,10 @@ Respond with a JSON array where each element contains:
  - "lineage_hint": any indication of which enzyme group this data is for (or null)
  - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)

- Tables are preferred over figures when both contain the same data.
-
+ Tables are generally preferred over figures unless you are convinced that only the figure you found has the complete lineage reaction matrix information.
+ Do not include too many sources; just return 2 or 3.
+ Adjust confidence by comparing all the locations you return, and only rank a figure highest when you are absolutely certain the table won't contain complete information.
+ When returning confidence scores, be accurate and avoid scores that are too close together.
  Respond ONLY with **minified JSON**. NO markdown fences.

  Example:
@@ -280,7 +465,8 @@ You are given either (a) the PNG image of a figure panel, or (b) the caption /
  text excerpt that contains numeric reaction performance data for an enzyme.

  Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
- Return a JSON object with the following keys (use **null** if not found):
+
+ Return a JSON object with the following keys (use **null** only if the value is not mentioned at all):
  * "yield" - yield as percentage with ONE decimal place precision
  * "ttn" - turnover number (total turnovers)
  * "ton" - turnover number if TTN not available
@@ -296,45 +482,66 @@ IMPORTANT:
  - Do NOT extract substrate/product names - these will come from SI
  - Do NOT extract reaction conditions (temperature, pH, time, solvent)
  - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
+ - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)

  Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
  """)

  PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
- You are analyzing a figure showing enzyme reaction performance data for multiple variants.
-
- Extract performance metrics for ALL the following enzyme variants:
+ STEP 1: First, identify ALL X-axis labels in the figure
+ - Read each X-axis label from left to right
+ - List exactly what text appears under each bar/data point
+ - Note: Labels may be abbreviated or use different naming conventions
+
+ STEP 2: Match X-axis labels to target enzyme variants
+ - Compare each X-axis label against the target enzyme list below
+ - Look for partial matches, abbreviations, or similar naming patterns
+ - If an X-axis label doesn't match any target enzyme, still include it for completeness
+
+ STEP 3: Identify Y-axis scales and what they measure
+ - Look at the Y-axis labels and tick marks to understand what each axis measures
+ - If there are multiple Y-axes (left and right), read the axis labels and units
+ - Note the minimum and maximum values on each axis scale
+ - Identify which visual elements (bars, dots, lines) correspond to which axis
+
+ STEP 4: Extract values for each matched variant
+ - For each X-axis position, identify which visual elements belong to that position
+ - LEFT Y-axis (bars): Measure bar height against the left scale by reading tick marks
+ - RIGHT Y-axis (dots): Measure dot position against the right scale by reading tick marks
+ - CRITICAL: Read actual scale values from the axis labels and tick marks
+ - Verify: taller bars should have higher values, higher dots should have higher values
+
+ Target enzymes to find and extract:
  {enzyme_names}

- Steps:
- 1. CHECK THE Y-AXIS SCALE: What is the maximum value? (e.g., 10%, 30%, 50%, 100%)
- 2. For each enzyme variant listed above:
-    - Find its position on the X-axis
-    - Read the bar height or data point value
-    - Calculate the actual value based on the Y-axis scale
- 3. Compare all bars to understand relative performance
-
- Return a JSON object with enzyme names as keys, each containing:
- * "yield" - yield with ONE decimal place precision
- * "ttn" - turnover number if shown
- * "ton" - turnover number if TTN not available
- * "selectivity" - ee or er value with unit
- * "conversion" - conversion percentage if different from yield
- * "tof" - turnover frequency if provided
- * "activity" - specific activity if provided
- * "other_metrics" - dictionary of any other metrics
- * "notes" - any relevant notes (including reaction type if different reactions are shown)
-
- CRITICAL:
- - Read ALL pages provided in the image
- - If different enzymes are tested for different reactions (e.g., pyrrolidine vs indoline synthesis), note this in "notes"
- - For tables, check if data continues beyond what's shown
- - Read the Y-axis scale carefully for figures
-
- Example format:
- {{"ApePgb LVQ": {{"yield": 0.0, "ttn": null, "notes": "pyrrolidine synthesis", ...}}, ...}}
-
- Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
+ Instructions:
+ 1. First, list ALL X-axis labels you can see in the figure
+ 2. Match each X-axis label to the target enzyme variants
+ 3. For matched variants, extract both bar heights (left Y-axis) and dot positions (right Y-axis)
+ 4. Return data only for variants that have clear X-axis labels and are matched to targets
+
+ Return JSON with the identified enzyme variant names as keys containing:
+ * "x_axis_label" - the exact text from the X-axis for this variant
+ * "yield" - percentage from left Y-axis bar height measurement
+ * "ttn" - turnover number from right Y-axis dot position measurement
+ * "ton" - if TTN not available
+ * "selectivity" - if shown
+ * "conversion" - if different from yield
+ * "tof" - if provided
+ * "activity" - if provided
+ * "other_metrics" - other metrics
+ * "notes" - REQUIRED: Describe the X-axis label, bar position, and dot position (e.g., "X-axis shows P411-CIS, leftmost bar is very short, dot is at bottom")
+
+ CRITICAL: Return ONLY valid JSON in this exact format:
+ {{"enzyme_name": {{"x_axis_label": "label", "yield": number, "ttn": number, "notes": "description"}}}}
+
+ Rules:
+ - Use double quotes for all strings
+ - No markdown, no commentary, no explanations
+ - All values must be properly formatted
+ - Ensure JSON is complete and valid
+ - Do not truncate or cut off the response
+ - IMPORTANT: When extracting data, prioritize the most complete source that shows data for ALL variants. If there are conflicting values between different sources (e.g., bar graph vs text values), use the source that provides complete data for all target enzymes and ignore partial or conflicting values from other sources
  """)

  # Removed substrate scope IUPAC extraction - now handled in model reaction only
@@ -519,13 +726,25 @@ class ReactionExtractor:
      _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)

      def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
-                  campaign_filter: Optional[str] = None):
+                  campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
          self.manuscript = manuscript
          self.si = si
          self.cfg = cfg
          self.model = get_model(cfg)
          self.debug_dir = debug_dir
          self.campaign_filter = campaign_filter  # Filter for specific campaign
+         self.all_campaigns = all_campaigns or []  # List of all campaigns for context
+
+         # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
+         self._figure_cache = LRUCache(maxsize=100)  # Figures are large, so smaller cache
+         self._model_reaction_locations_cache = LRUCache(maxsize=50)
+
+         # Cache for compound mappings to avoid repeated API calls (bounded to prevent memory leaks)
+         self._compound_mapping_cache = LRUCache(maxsize=1000)
+         self._compound_mapping_text_cache = LRUCache(maxsize=500)  # Cache text extractions too
+
+         # Cache for reaction locations to avoid repeated API calls (bounded to prevent memory leaks)
+         self._reaction_locations_cache = LRUCache(maxsize=50)

          # Create debug directory if specified
          if self.debug_dir:
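
A quick sketch of the eviction behaviour these bounded caches rely on, using the LRUCache class added earlier in this diff:

    cache = LRUCache(maxsize=2)
    cache.put("a", "1")
    cache.put("b", "2")
    cache.get("a")       # touching "a" makes it the most recently used entry
    cache.put("c", "3")  # evicts "b", the least recently used entry
    assert cache.get("b") is None
    assert cache.get("a") == "1" and cache.get("c") == "3"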
@@ -551,23 +770,40 @@ class ReactionExtractor:
      # ------------------------------------------------------------------

      def _collect_captions_and_titles(self) -> str:
-         # Simpler pattern: match any line starting with Table or Figure
-         # This catches all variations like "Table S 2", "Table.", "Figure S1", etc.
-         cap_pattern = re.compile(r"^(Table|Figure).*", re.I | re.M)
+         # Pattern to match Table or Figure with optional leading whitespace
+         # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
+         # Also handles cases where there's whitespace before the caption
+         cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
          captions: List[str] = []

          # Collect from all pages
          all_text = "\n".join(self.all_pages)

-         # Find all figure/table captions
+         # Find all figure/table captions with more context
          for match in cap_pattern.finditer(all_text):
              caption_start = match.start()
-             # Get up to 1200 chars or until double newline
+
+             # Include some context before the caption (up to 200 chars)
+             context_start = max(0, caption_start - 200)
+             # Find the start of the sentence/paragraph before the caption
+             context_text = all_text[context_start:caption_start]
+             last_period = context_text.rfind('.')
+             if last_period != -1:
+                 context_start = context_start + last_period + 1
+
+             # For tables, include much more content after the caption to show actual table data
+             # For figures, keep the original limit
+             is_table = match.group(1).lower() == 'table'
+             max_chars = 5000 if is_table else 3000
+
+             # Get up to max_chars or until double newline
              caption_end = all_text.find("\n\n", caption_start)
-             if caption_end == -1 or caption_end - caption_start > 1200:
-                 caption_end = caption_start + 1200
-             caption = all_text[caption_start:caption_end].strip()
-             captions.append(caption)
+             if caption_end == -1 or caption_end - caption_start > max_chars:
+                 caption_end = caption_start + max_chars
+
+             # Include the context and full caption with table content
+             full_caption = all_text[context_start:caption_end].strip()
+             captions.append(full_caption)

          # Also look for SI section titles
          si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
@@ -584,15 +820,47 @@ class ReactionExtractor:

      def find_reaction_locations(self) -> List[Dict[str, Any]]:
          """Find all locations containing reaction performance data."""
-         # Add campaign context if available
+         # Create cache key based on campaign filter
+         cache_key = f"locations_{self.campaign_filter or 'all'}"
+
+         # Check cache first
+         cached_result = self._reaction_locations_cache.get(cache_key)
+         if cached_result is not None:
+             LOGGER.info("Using cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+             return cached_result
+
+         # Add campaign context - always provide context to help model understanding
          campaign_context = ""
          if self.campaign_filter:
+             campaigns_warning = ""
+             if self.all_campaigns:
+                 campaigns_warning = f"""
+ ALL CAMPAIGNS IN THIS PAPER:
+ {chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+ Be extremely careful to only extract data for the {self.campaign_filter} campaign.
+ """
+
              campaign_context = f"""
- IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
- Only return locations that contain data for this specific campaign.
- Ignore locations that contain data for other campaigns.
+ IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
+ Only return locations that contain data for this specific campaign.
+ Ignore locations that contain data for other campaigns.
+ {campaigns_warning}

- """
+ """
+         else:
+             # Even for single campaigns, provide context about what to look for
+             campaign_context = f"""
+ IMPORTANT: You are looking for performance data showing enzyme evolution progression.
+ Look for locations that contain actual performance metrics (yield, TTN, TON, activity, etc.)
+ for multiple enzyme variants, not just mutation lists or method descriptions.
+
+ Tables may only contain mutation information without performance data - check the actual
+ table content below the caption to verify if performance metrics are present.
+ Figures with evolutionary lineage data often contain the actual performance matrix.
+
+ """

          prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
          try:
@@ -604,13 +872,20 @@ Ignore locations that contain data for other campaigns.
                  tag="find_locations"
              )
              # Handle both single dict (backwards compatibility) and list
+             result = []
              if isinstance(data, dict):
-                 return [data]
+                 result = [data]
              elif isinstance(data, list):
-                 return data
+                 result = data
              else:
                  LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
-                 return []
+                 result = []
+
+             # Cache the result
+             self._reaction_locations_cache.put(cache_key, result)
+             LOGGER.info("Cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+
+             return result
          except Exception as e:
              LOGGER.error("Failed to find reaction locations: %s", e)
              return []
@@ -686,13 +961,27 @@ Ignore locations that contain data for other campaigns.
          }

      def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-         """Find the model reaction for a specific lineage group."""
+         """Find the model reaction for a specific lineage group.
+         Returns early if no relevant text is found to avoid unnecessary API calls."""
+
          # Gather relevant text near this location
          page_text = self._page_with_reference(location) or ""

+         # Early exit if no text found for this location
+         if not page_text or len(page_text.strip()) < 100:
+             LOGGER.info("No sufficient text found for location %s, skipping lineage-specific extraction", location)
+             return {}
+
          # Also check manuscript introduction for model reaction info
          intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""

+         # Quick relevance check - look for reaction-related keywords
+         reaction_keywords = ["substrate", "product", "reaction", "compound", "synthesis", "procedure", "method"]
+         combined_text = (page_text + intro_text).lower()
+         if not any(keyword in combined_text for keyword in reaction_keywords):
+             LOGGER.info("No reaction-related keywords found for location %s, skipping lineage extraction", location)
+             return {}
+
          # Build the prompt with location and context
          prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
              location=location,
@@ -702,6 +991,7 @@ Ignore locations that contain data for other campaigns.
              prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"

          # If we have model reaction locations, include text from those locations too
+         text_added = False
          if model_reaction_locations:
              # Add text from model reaction location
              if model_reaction_locations.get("model_reaction_location", {}).get("location"):
@@ -709,6 +999,7 @@ Ignore locations that contain data for other campaigns.
                  model_text = self._get_text_around_location(model_loc)
                  if model_text:
                      prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
+                     text_added = True

              # Add text from conditions location (often contains reaction details)
              if model_reaction_locations.get("conditions_location", {}).get("location"):
@@ -716,8 +1007,15 @@ Ignore locations that contain data for other campaigns.
                  cond_text = self._get_text_around_location(cond_loc)
                  if cond_text:
                      prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
+                     text_added = True
+
+         # If we didn't find any model reaction locations and the page text is sparse, skip
+         if not text_added and len(page_text.strip()) < 500:
+             LOGGER.info("Insufficient context for lineage model reaction extraction at %s", location)
+             return {}

          try:
+             LOGGER.info("Attempting lineage-specific model reaction extraction for %s", location)
              data = generate_json_with_retry(
                  self.model,
                  prompt,
@@ -725,7 +1023,15 @@ Ignore locations that contain data for other campaigns.
                  debug_dir=self.debug_dir,
                  tag=f"lineage_model_reaction_{location.replace(' ', '_')}"
              )
-             return data if isinstance(data, dict) else {}
+
+             # Validate the response has useful information
+             if isinstance(data, dict) and (data.get('substrate_ids') or data.get('product_ids')):
+                 LOGGER.info("Lineage model reaction extraction successful for %s", location)
+                 return data
+             else:
+                 LOGGER.info("Lineage model reaction extraction returned empty results for %s", location)
+                 return {}
+
          except Exception as e:
              LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
              return {}
@@ -777,67 +1083,174 @@ Ignore locations that contain data for other campaigns.
          If False, extracts the entire page (useful for tables).
          Returns a base64-encoded PNG or None."""

+         # Check cache first
+         cache_key = f"{ref}_{extract_figure_only}"
+         cached_result = self._figure_cache.get(cache_key)
+         if cached_result is not None:
+             LOGGER.debug("Using cached figure for %s", ref)
+             return cached_result
+
          # For table extraction, use multi-page approach
          if not extract_figure_only:
              pages_with_ref = self._find_pages_with_reference(ref)
              if pages_with_ref:
                  LOGGER.debug(f"Found {len(pages_with_ref)} pages containing {ref}")
-                 return self._extract_multiple_pages_png(pages_with_ref)
+                 return self._extract_multiple_pages_png(pages_with_ref, ref)
              return None

-         # For figure extraction, search both documents
+         # For figure extraction, search both documents for actual figure captions
          for doc in filter(None, [self.ms_doc, self.si_doc]):
              for page_number in range(doc.page_count):
                  page = doc.load_page(page_number)
                  page_text = page.get_text()
-                 if ref.lower() not in page_text.lower():
-                     continue
-                 # Get caption bbox
-                 text_instances = page.search_for(ref, quads=False)
-                 if not text_instances:
+
+                 # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
+                 # For subfigures like "Figure 1C", extract the main figure "Figure 1"
+                 figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                 # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                 main_figure_num = re.match(r'^(\d+)', figure_num)
+                 if main_figure_num:
+                     main_figure_num = main_figure_num.group(1)
+                 else:
+                     main_figure_num = figure_num
+
+                 caption_patterns = [
+                     rf"^Figure\s+{re.escape(main_figure_num)}\.",        # "Figure 1."
+                     rf"^Figure\s+{re.escape(main_figure_num)}:",         # "Figure 1:"
+                     rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
+                     rf"^Figure\s+{re.escape(main_figure_num)}\s*$",      # "Figure 1" at end of line
+                     rf"Figure\s+{re.escape(main_figure_num)}\s*\.",      # "Figure 1." anywhere in line
+                     rf"Figure\s+{re.escape(main_figure_num)}\s*:",       # "Figure 1:" anywhere in line
+                 ]
+
+                 LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
+                              main_figure_num, ref, caption_patterns)
+
+                 caption_found = False
+                 cap_rect = None
+
+                 for pattern in caption_patterns:
+                     matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                     if matches:
+                         LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
+                         # Found actual figure caption, get its position
+                         caption_text = matches.group(0)
+                         text_instances = page.search_for(caption_text, quads=False)
+                         if text_instances:
+                             cap_rect = text_instances[0]
+                             caption_found = True
+                             LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
+                             break
+
+                 if not caption_found:
+                     # Debug: show what figure-related text is actually on this page
+                     figure_mentions = [line.strip() for line in page_text.split('\n')
+                                        if 'figure' in line.lower() and main_figure_num.lower() in line.lower()]
+                     if figure_mentions:
+                         LOGGER.debug("Page %d has figure mentions but no caption match: %s",
+                                      page_number, figure_mentions[:3])
                      continue
-                 cap_rect = text_instances[0]  # first match

                  if extract_figure_only:
-                     # Sort images by y0 (top) coordinate ascending
-                     images = sorted(page.get_images(full=True), key=lambda im: im[7])
-                     # Find first image whose bottom y is **above** caption top y
-                     for img in images:
-                         xref = img[0]
-                         # Get image rectangles to find position
-                         img_rects = page.get_image_rects(xref)
-                         if img_rects:
-                             img_rect = img_rects[0]  # First rectangle
-                             if img_rect.y1 < cap_rect.y0:  # fully above caption
-                                 # Extract image bytes
-                                 pix = fitz.Pixmap(doc, xref)
-                                 pix = self._ensure_rgb_pixmap(pix)
-                                 img_bytes = pix.tobytes("png")
-                                 return b64encode(img_bytes).decode()
+                     # Extract only the area above the caption (the actual figure)
+                     # This excludes caption text and focuses on visual elements
+                     LOGGER.info("Extracting figure area above caption for %s", ref)
+
+                     # Get the page dimensions
+                     page_rect = page.rect
+
+                     # Extract the area above the caption
+                     if cap_rect:
+                         # Extract from top of page to top of caption
+                         figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
+                         LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
+                     else:
+                         # If no caption found, use top 80% of page
+                         figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
+                         LOGGER.debug("No caption found, using top 80% of page: %s", figure_rect)
+
+                     # Extract the figure area only
+                     mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                     pix = page.get_pixmap(matrix=mat, clip=figure_rect)
+                     pix = self._ensure_rgb_pixmap(pix)
+                     img_bytes = pix.tobytes("png")
+
+                     # Save PNG to debug directory if available
+                     if self.debug_dir:
+                         timestamp = int(time.time())
+                         png_file = self.debug_dir / f"figure_{ref.replace(' ', '_')}_{timestamp}.png"
+                         with open(png_file, 'wb') as f:
+                             f.write(img_bytes)
+                         LOGGER.info("Saved figure page to: %s", png_file)
+
+                     result = b64encode(img_bytes).decode()
+                     # Cache the result
+                     self._figure_cache.put(cache_key, result)
+                     return result
                  else:
                      # Extract the entire page as an image
-                     mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
+                     mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
                      pix = page.get_pixmap(matrix=mat)
                      pix = self._ensure_rgb_pixmap(pix)
                      img_bytes = pix.tobytes("png")
-                     return b64encode(img_bytes).decode()
+
+                     # Save PNG to debug directory if available
+                     if self.debug_dir:
+                         timestamp = int(time.time())
+                         png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                         with open(png_file, 'wb') as f:
+                             f.write(img_bytes)
+                         LOGGER.info("Saved page image to: %s", png_file)
+
+                     result = b64encode(img_bytes).decode()
+                     # Cache the result
+                     self._figure_cache.put(cache_key, result)
+                     return result
          return None

      def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
          """Find all pages containing the reference across documents.
+         Prioritizes pages with actual captions over just references.
          Returns list of (document, page_number) tuples."""
          pages_found = []
+         caption_pages = []

          for doc in filter(None, [self.ms_doc, self.si_doc]):
              for page_number in range(doc.page_count):
                  page = doc.load_page(page_number)
                  page_text = page.get_text()
+
+                 # Check for actual figure caption first
+                 if ref.lower().startswith('figure'):
+                     figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                     # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                     main_figure_num = re.match(r'^(\d+)', figure_num)
+                     if main_figure_num:
+                         main_figure_num = main_figure_num.group(1)
+                     else:
+                         main_figure_num = figure_num
+
+                     caption_patterns = [
+                         rf"^Figure\s+{re.escape(main_figure_num)}\.",
+                         rf"^Figure\s+{re.escape(main_figure_num)}:",
+                         rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]"
+                     ]
+
+                     for pattern in caption_patterns:
+                         if re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE):
+                             caption_pages.append((doc, page_number))
+                             break
+
+                 # Fallback to any mention of the reference
                  if ref.lower() in page_text.lower():
                      pages_found.append((doc, page_number))
-
-         return pages_found
+
+         # Return caption pages first, then other pages
+         return caption_pages + [p for p in pages_found if p not in caption_pages]

-     def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
+     def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]], ref: str = "unknown") -> Optional[str]:
          """Extract multiple pages as a combined PNG image."""
          if not pages:
              return None
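
The caption patterns above are easy to sanity-check in isolation. An assumed example of how a subfigure reference such as "Figure 1C" resolves to the main caption "Figure 1.":

    import re

    ref = "Figure 1C"
    num = re.match(r'^(\d+)', ref.replace('Figure ', '')).group(1)  # "1"
    pattern = rf"^Figure\s+{re.escape(num)}\."
    page_text = "Results\nFigure 1. Directed evolution of the lineage.\n..."
    assert re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)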
@@ -854,12 +1267,7 @@ Ignore locations that contain data for other campaigns.
              pix = self._ensure_rgb_pixmap(pix)
              all_images.append(pix)

-             # If this is the last page with the reference, also add the next page
-             if i == len(pages) - 1 and page_num + 1 < doc.page_count:
-                 next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
-                 next_pix = self._ensure_rgb_pixmap(next_pix)
-                 all_images.append(next_pix)
-                 LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
+             # Only extract the page containing the reference (removed next page logic)

          if not all_images:
              return None
@@ -867,7 +1275,17 @@ Ignore locations that contain data for other campaigns.
          # If only one page, return it directly
          if len(all_images) == 1:
              pix = self._ensure_rgb_pixmap(all_images[0])
-             return b64encode(pix.tobytes("png")).decode()
+             img_bytes = pix.tobytes("png")
+
+             # Save debug file if available
+             if self.debug_dir:
+                 timestamp = int(time.time())
+                 png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                 with open(png_file, 'wb') as f:
+                     f.write(img_bytes)
+                 LOGGER.info("Saved multi-page image to: %s", png_file)
+
+             return b64encode(img_bytes).decode()

          # Combine multiple pages vertically
          if not all_images:
@@ -914,7 +1332,7 @@ Ignore locations that contain data for other campaigns.
              y_offset += pix.height * scale

          # Convert the page to a pixmap
-         mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
+         mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for quality
          combined_pix = page.get_pixmap(matrix=mat)
          combined_pix = self._ensure_rgb_pixmap(combined_pix)

@@ -922,17 +1340,95 @@ Ignore locations that contain data for other campaigns.
          img_bytes = combined_pix.tobytes("png")
          output_doc.close()

+         # Save debug file if available
+         if self.debug_dir:
+             timestamp = int(time.time())
+             png_file = self.debug_dir / f"combined_pages_{ref.replace(' ', '_')}_{timestamp}.png"
+             with open(png_file, 'wb') as f:
+                 f.write(img_bytes)
+             LOGGER.info("Saved combined multi-page image to: %s", png_file)
+
          return b64encode(img_bytes).decode()

      # ------------------------------------------------------------------
      # 6.3 Extract metrics in batch
      # ------------------------------------------------------------------
+
+     def _validate_location_exists(self, ref: str) -> bool:
+         """Verify that the referenced location actually exists in the document."""
+         # Search for the actual reference in both manuscript and SI documents
+         docs_to_check = [self.ms_doc]
+         if self.si_doc:
+             docs_to_check.append(self.si_doc)
+
+         for doc in docs_to_check:
+             for page_num in range(len(doc)):
+                 page = doc[page_num]
+                 text = page.get_text()
+
+                 # Look for table references like "Table 1", "Table S1", etc.
+                 if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
+                     return True
+
+         return False
+
+     def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
+         """Validate that the context contains meaningful content for extraction."""
+         if not snippet or len(snippet.strip()) < 50:
+             LOGGER.warning("Insufficient context for extraction from %s - skipping", ref)
+             return False
+
+         # Check if context actually mentions the enzymes we're looking for
+         enzyme_mentions = sum(1 for enzyme in enzyme_list if enzyme.lower() in snippet.lower())
+         if enzyme_mentions == 0:
+             LOGGER.warning("No enzyme mentions found in context for %s - skipping", ref)
+             return False
+
+         # Check for performance-related keywords
+         performance_keywords = ['yield', 'selectivity', 'conversion', 'ee', 'er', 'ttn', 'ton', 'tof', '%', 'percent']
+         has_performance_data = any(keyword in snippet.lower() for keyword in performance_keywords)
+
+         if not has_performance_data:
+             LOGGER.warning("No performance metrics found in context for %s - skipping", ref)
+             return False
+
+         LOGGER.info("Context validated for %s: %d chars, %d enzyme mentions", ref, len(snippet), enzyme_mentions)
+         return True
+
+     def _validate_response(self, data: Dict, enzyme_list: List[str], ref: str) -> bool:
+         """Validate that the response contains meaningful data for the requested enzymes."""
+         if not data or not isinstance(data, dict):
+             LOGGER.warning("Invalid response format from %s - skipping", ref)
+             return False
+
+         # Check if we got data for at least one enzyme
+         enzymes_with_data = 0
+         for enzyme in enzyme_list:
+             enzyme_data = data.get(enzyme, {})
+             if isinstance(enzyme_data, dict) and enzyme_data:
+                 # Check if there's at least one non-null metric
+                 metrics = ['yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity']
+                 has_metric = any(enzyme_data.get(metric) is not None for metric in metrics)
+                 if has_metric:
+                     enzymes_with_data += 1
+
+         if enzymes_with_data == 0:
+             LOGGER.warning("No valid metrics found in response from %s - skipping", ref)
+             return False
+
+         LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
+         return True

      def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
          """Extract performance metrics for multiple enzymes from the identified location in batch."""
          ref_lc = ref.lower()
          image_b64: Optional[str] = None

+         # First, validate that the location actually exists in the document
+         if not self._validate_location_exists(ref):
+             LOGGER.warning("Location %s not found in document - skipping", ref)
+             return []
+
          # Add campaign context if available
          campaign_context = ""
          if self.campaign_filter:
@@ -945,19 +1441,55 @@ Ignore locations that contain data for other campaigns.
              LOGGER.debug("No page image found for %s - using full page text", ref)
              snippet = self._extract_table_context(ref)
          elif self._FIG_RE.search(ref_lc):
-             # For figures, extract just the figure image
+             # For figures, extract just the figure image (same logic as compound mapping)
              image_b64 = self._extract_page_png(ref, extract_figure_only=True)
              if not image_b64:
                  LOGGER.debug("No figure image found for %s - using caption text", ref)
                  snippet = self._extract_figure_caption(ref)
+             else:
+                 # If figure is found, ignore text information - use image only
+                 snippet = ""
          else:
              snippet = self._page_with_reference(ref) or ""

-         enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
+         # For figures with images, skip text validation and proceed with image extraction
+         if image_b64 and self._FIG_RE.search(ref_lc):
+             LOGGER.info("Using figure image for %s - ignoring text context", ref)
+         elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
+             return []
+
+         # Create enhanced enzyme descriptions with parent/mutation context
+         if hasattr(self, 'enzyme_df') and self.enzyme_df is not None:
+             enzyme_descriptions = []
+             for enzyme in enzyme_list:
+                 # Find this enzyme in the dataframe
+                 enzyme_row = None
+                 if 'enzyme_id' in self.enzyme_df.columns:
+                     enzyme_row = self.enzyme_df[self.enzyme_df['enzyme_id'] == enzyme]
+                 elif 'enzyme' in self.enzyme_df.columns:
+                     enzyme_row = self.enzyme_df[self.enzyme_df['enzyme'] == enzyme]
+
+                 if enzyme_row is not None and len(enzyme_row) > 0:
+                     row = enzyme_row.iloc[0]
+                     parent = row.get('parent_enzyme_id', '')
+                     mutations = row.get('mutations', '')
+
+                     desc = f"- {enzyme}"
+                     if parent and str(parent).strip() and str(parent) != 'nan':
+                         desc += f" (parent: {parent})"
+                     if mutations and str(mutations).strip() and str(mutations) != 'nan':
+                         desc += f" (mutations: {mutations})"
+                     enzyme_descriptions.append(desc)
+                 else:
+                     enzyme_descriptions.append(f"- {enzyme}")
+             enzyme_names = "\n".join(enzyme_descriptions)
+         else:
+             enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])

          if image_b64:
              # Use batch extraction prompt for image analysis
-             prompt = campaign_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
+             location_context = f"\n\nIMPORTANT: You are extracting data from {ref}, which has been identified as the PRIMARY LOCATION containing the most reliable performance data for these enzymes.\n"
+             prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
              LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
              tag = f"extract_metrics_batch_vision"
          else:
@@ -977,6 +1509,35 @@ Ignore locations that contain data for other campaigns.
              image_b64=image_b64
          )

+         # Validate response has meaningful data
+         if not self._validate_response(data, enzyme_list, ref):
+             # If figure extraction failed and we have a figure, try falling back to text
+             if image_b64 and self._FIG_RE.search(ref_lc):
+                 LOGGER.warning("Figure extraction from %s returned empty results - falling back to text", ref)
+                 snippet = self._extract_figure_caption(ref)
+                 if self._validate_context(snippet, enzyme_list, ref):
+                     # Retry with text extraction
+                     format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
+                     prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
+                     LOGGER.info("Gemini: retrying with text extraction for %d enzymes from %s…", len(enzyme_list), ref)
+
+                     data = generate_json_with_retry(
+                         self.model,
+                         prompt,
+                         temperature=self.cfg.extract_temperature,
+                         debug_dir=self.debug_dir,
+                         tag=f"extract_metrics_batch_text_fallback",
+                         image_b64=None
+                     )
+
+                     # Validate the text extraction response
+                     if not self._validate_response(data, enzyme_list, ref):
+                         return []
+                 else:
+                     return []
+             else:
+                 return []
+
          # Handle the response format - expecting a dict with enzyme names as keys
          results = []
          if isinstance(data, dict):
@@ -1027,6 +1588,15 @@ Ignore locations that contain data for other campaigns.
1027
1588
 
1028
1589
  def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
1029
1590
  """Find locations for model reaction scheme, conditions, and IUPAC names."""
1591
+ # Create cache key based on campaign filter and enzyme variants
1592
+ cache_key = f"{self.campaign_filter}_{hash(tuple(sorted(enzyme_variants)) if enzyme_variants else ())}"
1593
+
1594
+ # Check cache first
1595
+ cached_result = self._model_reaction_locations_cache.get(cache_key)
1596
+ if cached_result is not None:
1597
+ LOGGER.info("Using cached model reaction locations for campaign: %s", self.campaign_filter)
1598
+ return cached_result
1599
+
1030
1600
  # Collect all text including section titles, captions, and schemes
1031
1601
  all_text = self._collect_captions_and_titles()
1032
1602
 
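
For reference, the cache key built above hashes the sorted variant list so that key equality is order-insensitive. A sketch of the scheme (note that Python string hashing is salted per process, so these keys are only stable within a single run, which is fine for an in-memory cache):

from typing import List, Optional

def make_locations_key(campaign_filter: Optional[str], variants: Optional[List[str]]) -> str:
    # sorted() makes the key independent of variant order; hash() keeps it short.
    variant_hash = hash(tuple(sorted(variants)) if variants else ())
    return f"{campaign_filter}_{variant_hash}"

assert make_locations_key("campaign_A", ["VarB", "VarA"]) == \
       make_locations_key("campaign_A", ["VarA", "VarB"])
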
@@ -1037,13 +1607,25 @@ Ignore locations that contain data for other campaigns.
1037
1607
  # Add enzyme context if provided
1038
1608
  enzyme_context = ""
1039
1609
  if enzyme_variants and self.campaign_filter:
1610
+ campaigns_context = ""
1611
+ if self.all_campaigns:
1612
+ campaigns_context = f"""
1613
+ ALL CAMPAIGNS IN THIS PAPER:
1614
+ {chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
1615
+
1616
+ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
1617
+ - Different campaigns may use similar enzyme names but different substrates
1618
+ - Be extremely careful to only extract data for the {self.campaign_filter} campaign
1619
+ - Ignore data from other campaigns even if they seem similar
1620
+ """
1621
+
1040
1622
  enzyme_context = f"""
1041
1623
  IMPORTANT CONTEXT:
1042
1624
  You are looking for the model reaction used specifically for these enzyme variants:
1043
1625
  {', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
1044
1626
 
1045
1627
  These variants belong to campaign: {self.campaign_filter}
1046
-
1628
+ {campaigns_context}
1047
1629
  Focus on finding the model reaction that was used to evaluate THESE specific variants.
1048
1630
  Different campaigns may use different model reactions.
1049
1631
  """
@@ -1061,6 +1643,11 @@ Different campaigns may use different model reactions.
1061
1643
  if not isinstance(data, dict):
1062
1644
  LOGGER.error("Expected dict from Gemini, got: %s", type(data))
1063
1645
  return None
1646
+
1647
+ # Cache the result
1648
+ self._model_reaction_locations_cache.put(cache_key, data)
1649
+ LOGGER.info("Cached model reaction locations for campaign: %s", self.campaign_filter)
1650
+
1064
1651
  return data
1065
1652
  except Exception as e:
1066
1653
  LOGGER.error("Failed to find model reaction locations: %s", e)
@@ -1159,9 +1746,12 @@ Different campaigns may use different model reactions.
1159
1746
  extraction_text: str,
1160
1747
  compound_ids: List[str] = None,
1161
1748
  tag_suffix: str = "",
1749
+ campaign_filter: Optional[str] = None,
1162
1750
  ) -> Dict[str, CompoundMapping]:
1163
1751
  """Helper function to extract compound mappings from provided text."""
1164
1752
  prompt = PROMPT_COMPOUND_MAPPING
1753
+ if campaign_filter:
1754
+ prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
1165
1755
  if compound_ids:
1166
1756
  prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
1167
1757
  prompt += "\n\nTEXT:\n" + extraction_text
@@ -1209,6 +1799,7 @@ Different campaigns may use different model reactions.
1209
1799
  compound_ids: List[str],
1210
1800
  figure_images: Dict[str, str],
1211
1801
  tag_suffix: str = "",
1802
+ campaign_filter: Optional[str] = None,
1212
1803
  ) -> Dict[str, CompoundMapping]:
1213
1804
  """Extract compound mappings using multimodal approach with figures."""
1214
1805
  # Enhanced prompt for figure-based extraction
@@ -1220,8 +1811,26 @@ Use your best knowledge, Look carefully in:
1220
1811
  1. The chemical structures shown in figures - infer IUPAC names from drawn structures
1221
1812
  2. Figure captions that may define compounds
1222
1813
  3. Text that refers to these compound numbers
1223
- 4. Reaction schemes showing transformations
1814
+ 4. Reaction schemes showing transformations"""
1815
+
1816
+ if campaign_filter:
1817
+ campaigns_warning = ""
1818
+ if self.all_campaigns:
1819
+ campaigns_warning = f"""
1820
+ ALL CAMPAIGNS IN THIS PAPER:
1821
+ {chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
1822
+
1823
+ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
1824
+ """
1825
+
1826
+ prompt += f"""
1224
1827
 
1828
+ IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system.
1829
+ {campaigns_warning}
1830
+ Different campaigns may use different numbering systems for compounds.
1831
+ Do NOT include compound information from other campaigns."""
1832
+
1833
+ prompt += """
1225
1834
 
1226
1835
  IMPORTANT:
1227
1836
  - Only provide IUPAC names you can determine from the figures or text
@@ -1251,9 +1860,10 @@ TEXT FROM MANUSCRIPT:
1251
1860
  for fig_ref, fig_base64 in figure_images.items():
1252
1861
  try:
1253
1862
  img_bytes = b64decode(fig_base64)
1254
- image = PIL.Image.open(io.BytesIO(img_bytes))
1863
+ # Format image for Gemini API
1864
+ image_part = {"mime_type": "image/png", "data": img_bytes}
1255
1865
  content_parts.append(f"\n[Figure: {fig_ref}]")
1256
- content_parts.append(image)
1866
+ content_parts.append(image_part)
1257
1867
  LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
1258
1868
  except Exception as e:
1259
1869
  LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)
@@ -1283,6 +1893,21 @@ TEXT FROM MANUSCRIPT:
1283
1893
 
1284
1894
  # Make multimodal API call
1285
1895
  response = self.model.generate_content(content_parts)
1896
+
1897
+ # Track token usage if available
1898
+ try:
1899
+ if hasattr(response, 'usage_metadata'):
1900
+ input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
1901
+ output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
1902
+ if input_tokens or output_tokens:
1903
+ try:
1904
+ from .wrapper import add_token_usage
1905
+ add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
1906
+ except ImportError:
1907
+ pass # wrapper not available
1908
+ except Exception:
1909
+ pass # token tracking is best-effort
1910
+
1286
1911
  raw_text = response.text.strip()
1287
1912
 
1288
1913
  # Log response
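
Token accounting above is deliberately best-effort: `usage_metadata` may be absent depending on SDK version, so every attribute access is guarded. The same guard, factored into a standalone helper (name illustrative):

from typing import Any, Tuple

def extract_token_counts(response: Any) -> Tuple[int, int]:
    # Returns (input_tokens, output_tokens); (0, 0) when metadata is absent.
    usage = getattr(response, "usage_metadata", None)
    if usage is None:
        return 0, 0
    return (getattr(usage, "prompt_token_count", 0),
            getattr(usage, "candidates_token_count", 0))
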
@@ -1329,18 +1954,37 @@ TEXT FROM MANUSCRIPT:
1329
1954
  self,
1330
1955
  compound_ids: List[str],
1331
1956
  initial_sections: List[str] = None,
1957
+ campaign_filter: Optional[str] = None,
1332
1958
  ) -> Dict[str, CompoundMapping]:
1333
- """Extract compound ID to IUPAC name mappings using adaptive 3-tier strategy.
1959
+ """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
1334
1960
 
1335
- 1. First attempts extraction from standard sections
1336
- 2. Expands search to additional sections if compounds are missing
1337
- 3. Uses multimodal figure analysis as final fallback
1961
+ 1. First attempts extraction from specific SI sections + 10 manuscript pages
1962
+ 2. If compounds missing, uses full manuscript + SI with multimodal figure analysis
1338
1963
  """
1339
1964
  if not compound_ids:
1340
1965
  return {}
1341
1966
 
1342
- LOGGER.info("Starting adaptive compound mapping for %d compounds: %s",
1343
- len(compound_ids), sorted(compound_ids))
1967
+ # Check cache first - return cached results for compounds we've already processed
1968
+ cached_mappings = {}
1969
+ uncached_compound_ids = []
1970
+
1971
+ for cid in compound_ids:
1972
+ # Include campaign filter in cache key to prevent cross-campaign contamination
1973
+ cache_key = f"{campaign_filter}_{cid.lower().strip()}" if campaign_filter else cid.lower().strip()
1974
+ cached_mapping = self._compound_mapping_cache.get(cache_key)
1975
+ if cached_mapping is not None:
1976
+ cached_mappings[cid.lower().strip()] = cached_mapping
1977
+ LOGGER.info("Using cached compound mapping for: %s (campaign: %s)", cid, campaign_filter)
1978
+ else:
1979
+ uncached_compound_ids.append(cid)
1980
+
1981
+ # If all compounds are cached, return immediately
1982
+ if not uncached_compound_ids:
1983
+ LOGGER.info("All %d compounds found in cache, skipping API calls", len(compound_ids))
1984
+ return cached_mappings
1985
+
1986
+ LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
1987
+ len(uncached_compound_ids), sorted(uncached_compound_ids))
1344
1988
 
1345
1989
  # Tier 1: Standard sections (manuscript + initial SI sections)
1346
1990
  initial_sections = initial_sections or [
@@ -1351,118 +1995,118 @@ TEXT FROM MANUSCRIPT:
1351
1995
  # Include manuscript pages (first 10) for model reaction context
1352
1996
  manuscript_text = "\n\n".join(self.ms_pages[:10])
1353
1997
 
1998
+ # Add campaign context if provided
1999
+ campaign_context = ""
2000
+ if campaign_filter:
2001
+ campaigns_warning = ""
2002
+ if self.all_campaigns:
2003
+ campaigns_warning = f"""
2004
+ ALL CAMPAIGNS IN THIS PAPER:
2005
+ {chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
2006
+
2007
+ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
2008
+ """
2009
+
2010
+ campaign_context = f"""
2011
+
2012
+ IMPORTANT CAMPAIGN CONTEXT:
2013
+ You are extracting compound information specifically for the {campaign_filter} campaign.
2014
+ {campaigns_warning}
2015
+ Focus ONLY on compound information relevant to the {campaign_filter} campaign/reaction system.
2016
+ Do NOT include compound information from other campaigns.
2017
+
2018
+ """
2019
+
1354
2020
  # Extract from initial sections
1355
2021
  extraction_text = self._extract_sections_by_title(initial_sections)
1356
2022
  if extraction_text:
1357
- extraction_text = manuscript_text + "\n\n" + extraction_text
2023
+ extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
1358
2024
  else:
1359
- extraction_text = manuscript_text
2025
+ extraction_text = manuscript_text + campaign_context
1360
2026
 
1361
- # First extraction attempt
2027
+ # First extraction attempt - only for uncached compounds
1362
2028
  mappings = self._extract_compound_mappings_from_text(
1363
- extraction_text[:50000], compound_ids, tag_suffix="initial"
2029
+ extraction_text[:50000], uncached_compound_ids, tag_suffix="initial", campaign_filter=campaign_filter
1364
2030
  )
1365
2031
  LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))
1366
2032
 
1367
2033
  # Check for missing compounds
1368
2034
  missing_compounds = []
1369
- for cid in compound_ids:
2035
+ for cid in uncached_compound_ids:
1370
2036
  mapping = mappings.get(cid.lower().strip())
1371
2037
  if not mapping or not mapping.iupac_name:
1372
2038
  missing_compounds.append(cid)
1373
2039
 
1374
- # Tier 2: Expanded search + multimodal with figures
2040
+ # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
1375
2041
  if missing_compounds:
1376
- LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
2042
+ LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
1377
2043
  len(missing_compounds), sorted(missing_compounds))
1378
2044
 
1379
- # Additional sections to search
1380
- additional_sections = [
1381
- "Engineering strategy", "Evolution campaign",
1382
- "Screening", "Optimization", "Substrate synthesis",
1383
- "Supporting Information", "Supplementary Methods"
1384
- ]
1385
-
1386
- # Extract from additional sections
1387
- additional_text = self._extract_sections_by_title(additional_sections)
1388
-
1389
- # Also extract any figures that might contain compound structures
2045
+ # Get all available figures for compound structure analysis
1390
2046
  figure_images = {}
1391
- figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Scheme 1", "Scheme 2"]
2047
+
2048
+ # Extract main manuscript figures
2049
+ figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
1392
2050
  for ref in figure_refs:
1393
2051
  img_b64 = self._extract_page_png(ref, extract_figure_only=True)
1394
2052
  if img_b64:
1395
2053
  figure_images[ref] = img_b64
1396
- LOGGER.info("Extracted %s for compound mapping", ref)
1397
-
1398
- # Try multimodal approach with figures and expanded text
1399
- if figure_images or additional_text:
1400
- combined_text = additional_text[:30000] if additional_text else ""
1401
- expanded_mappings = self._extract_compound_mappings_with_figures(
1402
- combined_text, missing_compounds, figure_images, tag_suffix="tier2"
1403
- )
2054
+ LOGGER.info("Retrieved %s for compound mapping", ref)
1404
2055
 
1405
- # Merge new mappings
1406
- new_found = 0
1407
- for key, mapping in expanded_mappings.items():
1408
- if key not in mappings or not mappings[key].iupac_name:
1409
- if mapping.iupac_name:
1410
- mappings[key] = mapping
1411
- new_found += 1
1412
- LOGGER.info("Found IUPAC name for '%s': %s",
1413
- key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
1414
-
1415
- LOGGER.info("Tier 2: Found %d additional compound mappings", new_found)
1416
-
1417
- # Check again for still missing compounds
1418
- still_missing = []
1419
- for cid in missing_compounds:
1420
- mapping = mappings.get(cid.lower().strip())
1421
- if not mapping or not mapping.iupac_name:
1422
- still_missing.append(cid)
1423
-
1424
- # Tier 3: Full manuscript search with all available figures
1425
- if still_missing:
1426
- LOGGER.info("Tier 3: %d compounds still missing, trying full manuscript search",
1427
- len(still_missing))
1428
-
1429
- # Get all SI figures
1430
- si_figure_refs = []
1431
- for page in self.si_pages[:5]: # Check first 5 SI pages
1432
- matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
1433
- si_figure_refs.extend(matches[:5]) # Limit to 5 figures
1434
-
1435
- # Extract SI figures
1436
- for ref in set(si_figure_refs):
1437
- if ref not in figure_images:
1438
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
1439
- if img_b64:
1440
- figure_images[ref] = img_b64
1441
- LOGGER.info("Extracted %s for final compound mapping", ref)
1442
-
1443
- # Full text search including all pages
1444
- full_text = "\n\n".join(self.all_pages[:30]) # First 30 pages
1445
-
1446
- final_mappings = self._extract_compound_mappings_with_figures(
1447
- full_text[:50000], still_missing, figure_images, tag_suffix="tier3"
1448
- )
1449
-
1450
- # Merge final mappings
1451
- final_found = 0
1452
- for key, mapping in final_mappings.items():
1453
- if key not in mappings or not mappings[key].iupac_name:
1454
- if mapping.iupac_name:
1455
- mappings[key] = mapping
1456
- final_found += 1
1457
- LOGGER.info("Found IUPAC name for '%s' in final search: %s",
1458
- key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
2056
+ # Get SI figures
2057
+ si_figure_refs = []
2058
+ for page in self.si_pages[:10]: # Check first 10 SI pages
2059
+ matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
2060
+ si_figure_refs.extend(matches[:10]) # Limit to 10 figures per page
2061
+
2062
+ # Extract SI figures
2063
+ for ref in set(si_figure_refs):
2064
+ if ref not in figure_images:
2065
+ img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2066
+ if img_b64:
2067
+ figure_images[ref] = img_b64
2068
+ LOGGER.info("Extracted %s for compound mapping", ref)
2069
+
2070
+ # Full text search including all pages
2071
+ full_text = "\n\n".join(self.all_pages[:40]) # First 40 pages (more comprehensive)
2072
+
2073
+ final_mappings = self._extract_compound_mappings_with_figures(
2074
+ full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
2075
+ )
2076
+
2077
+ # Merge newly found mappings, filling in only compounds that still lack IUPAC names
2078
+ final_found = 0
2079
+ for key, mapping in final_mappings.items():
2080
+ if key not in mappings or not mappings[key].iupac_name:
2081
+ if mapping.iupac_name:
2082
+ mappings[key] = mapping
2083
+ final_found += 1
2084
+ iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
2085
+ LOGGER.info("Found IUPAC name for '%s' in full search: %s", key, iupac_display)
2086
+
2087
+ LOGGER.info("Tier 2: Found %d additional compound mappings", final_found)
2088
+
2089
+ # Cache all newly found mappings using campaign-aware cache key
2090
+ for key, mapping in mappings.items():
2091
+ cache_key = f"{campaign_filter}_{key}" if campaign_filter else key
2092
+ if self._compound_mapping_cache.get(cache_key) is None:
2093
+ self._compound_mapping_cache.put(cache_key, mapping)
2094
+ iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
2095
+ LOGGER.info("Cached compound mapping for: %s -> %s (campaign: %s)", key, iupac_display, campaign_filter)
1459
2096
 
1460
- LOGGER.info("Tier 3: Found %d additional compound mappings", final_found)
2097
+ # Also cache without campaign prefix for backward compatibility during integration
2098
+ if campaign_filter:
2099
+ self._compound_mapping_cache.put(key, mapping)
2100
+
2101
+ # Combine cached and new mappings
2102
+ final_mappings = cached_mappings.copy()
2103
+ final_mappings.update(mappings)
1461
2104
 
1462
- LOGGER.info("Adaptive compound mapping complete: %d total mappings", len(mappings))
1463
- return mappings
2105
+ LOGGER.info("Adaptive compound mapping complete: %d total mappings (%d cached, %d new)",
2106
+ len(final_mappings), len(cached_mappings), len(mappings))
2107
+ return final_mappings
1464
2108
 
1465
- def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
2109
+ def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None, lineage_compound_ids: Optional[Dict[str, List[str]]] = None) -> Dict[str, Any]:
1466
2110
  """Extract model reaction information using identified locations and 3-tier compound mapping."""
1467
2111
  # First find the best locations
1468
2112
  locations = self.find_model_reaction_locations(enzyme_variants)
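
The compound-mapping cache introduced above prefixes each normalised compound ID with the campaign, so an ID like "6a" that appears in two campaigns cannot collide. A sketch of the key scheme (function name illustrative):

from typing import Optional

def compound_cache_key(compound_id: str, campaign_filter: Optional[str]) -> str:
    # Normalise first so "6a " and "6A"-style variations share one entry.
    normalised = compound_id.lower().strip()
    return f"{campaign_filter}_{normalised}" if campaign_filter else normalised

assert compound_cache_key("6a ", "campaign_A") != compound_cache_key("6a", "campaign_B")
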
@@ -1585,9 +2229,15 @@ TEXT FROM MANUSCRIPT:
1585
2229
  figure_images[fig_ref] = img_b64
1586
2230
  LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)
1587
2231
 
1588
- # Extract compound IDs from locations
2232
+ # Extract compound IDs from locations or use lineage-specific ones
1589
2233
  compound_ids = []
1590
- if locations and locations.get("model_reaction_location", {}).get("compound_ids"):
2234
+ if lineage_compound_ids:
2235
+ # Use lineage-specific compound IDs if provided
2236
+ substrate_ids = lineage_compound_ids.get("substrate_ids", [])
2237
+ product_ids = lineage_compound_ids.get("product_ids", [])
2238
+ compound_ids = substrate_ids + product_ids
2239
+ LOGGER.info("Using lineage-specific compound IDs: %s", compound_ids)
2240
+ elif locations and locations.get("model_reaction_location", {}).get("compound_ids"):
1591
2241
  compound_ids = locations["model_reaction_location"]["compound_ids"]
1592
2242
  LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)
1593
2243
 
@@ -1595,7 +2245,7 @@ TEXT FROM MANUSCRIPT:
1595
2245
  compound_mappings = {}
1596
2246
  if compound_ids:
1597
2247
  LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
1598
- compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)
2248
+ compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
1599
2249
 
1600
2250
  # Add the mapped IUPAC names to the context for better extraction
1601
2251
  if compound_mappings:
@@ -1606,8 +2256,35 @@ TEXT FROM MANUSCRIPT:
1606
2256
  mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
1607
2257
  text_context += mapping_text
1608
2258
 
2259
+ # Add campaign context if available
2260
+ campaign_context = ""
2261
+ if enzyme_variants and self.campaign_filter:
2262
+ campaigns_context = ""
2263
+ if self.all_campaigns:
2264
+ campaigns_context = f"""
2265
+ ALL CAMPAIGNS IN THIS PAPER:
2266
+ {chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
2267
+
2268
+ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
2269
+ - Different campaigns may use similar enzyme names but different substrates
2270
+ - Be extremely careful to only extract data for the {self.campaign_filter} campaign
2271
+ - Ignore data from other campaigns even if they seem similar
2272
+ """
2273
+
2274
+ campaign_context = f"""
2275
+ IMPORTANT CONTEXT:
2276
+ You are extracting the model reaction used specifically for these enzyme variants:
2277
+ {', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
2278
+
2279
+ These variants belong to campaign: {self.campaign_filter}
2280
+ {campaigns_context}
2281
+ Focus on extracting the model reaction that was used to evaluate THESE specific variants.
2282
+ Different campaigns may use different model reactions and substrates.
2283
+
2284
+ """
2285
+
1609
2286
  # Include both manuscript and SI text for better coverage
1610
- prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
2287
+ prompt = campaign_context + PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
1611
2288
 
1612
2289
  try:
1613
2290
  # Use multimodal extraction if we have figure images
@@ -1620,9 +2297,10 @@ TEXT FROM MANUSCRIPT:
1620
2297
  for fig_ref, fig_base64 in figure_images.items():
1621
2298
  try:
1622
2299
  img_bytes = b64decode(fig_base64)
1623
- image = PIL.Image.open(io.BytesIO(img_bytes))
2300
+ # Format image for Gemini API
2301
+ image_part = {"mime_type": "image/png", "data": img_bytes}
1624
2302
  content_parts.append(f"\n[Figure: {fig_ref}]")
1625
- content_parts.append(image)
2303
+ content_parts.append(image_part)
1626
2304
  except Exception as e:
1627
2305
  LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)
1628
2306
 
@@ -1641,6 +2319,20 @@ TEXT FROM MANUSCRIPT:
1641
2319
 
1642
2320
  response = model.generate_content(content_parts)
1643
2321
 
2322
+ # Track token usage if available
2323
+ try:
2324
+ if hasattr(response, 'usage_metadata'):
2325
+ input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
2326
+ output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
2327
+ if input_tokens or output_tokens:
2328
+ try:
2329
+ from .wrapper import add_token_usage
2330
+ add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
2331
+ except ImportError:
2332
+ pass # wrapper not available
2333
+ except Exception:
2334
+ pass # token tracking is best-effort
2335
+
1644
2336
  # Parse JSON from response
1645
2337
  if response and response.text:
1646
2338
  # Save debug output
@@ -1681,32 +2373,59 @@ TEXT FROM MANUSCRIPT:
1681
2373
  if isinstance(data, dict):
1682
2374
  # If we have compound mappings, enhance the IUPAC names
1683
2375
  if compound_ids and compound_mappings:
2376
+ LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
2377
+ list(compound_mappings.keys()))
2378
+
1684
2379
  # Try to map substrate/product lists through compound IDs
1685
- substrate_list = data.get("substrate_iupac_list", [])
2380
+ substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
1686
2381
  if isinstance(substrate_list, list):
1687
2382
  enhanced_substrates = []
1688
2383
  for item in substrate_list:
2384
+ item_str = str(item).lower().strip()
1689
2385
  # Check if it's a compound ID that we can map
1690
- mapping = compound_mappings.get(str(item).lower().strip())
2386
+ mapping = compound_mappings.get(item_str)
1691
2387
  if mapping and mapping.iupac_name:
1692
2388
  enhanced_substrates.append(mapping.iupac_name)
2389
+ LOGGER.info("Mapped substrate '%s' -> '%s'", item, mapping.iupac_name)
1693
2390
  elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
1694
- # Keep valid IUPAC names
2391
+ # Keep valid IUPAC names that aren't compound IDs
1695
2392
  enhanced_substrates.append(str(item))
2393
+ LOGGER.info("Kept substrate IUPAC name: '%s'", item)
2394
+ else:
2395
+ LOGGER.warning("Could not map substrate compound ID '%s'", item)
1696
2396
  data["substrate_iupac_list"] = enhanced_substrates
1697
2397
 
1698
- product_list = data.get("product_iupac_list", [])
2398
+ product_list = data.get("product_iupac_list", []) or data.get("product_list", [])
1699
2399
  if isinstance(product_list, list):
1700
2400
  enhanced_products = []
1701
2401
  for item in product_list:
2402
+ item_str = str(item).lower().strip()
1702
2403
  # Check if it's a compound ID that we can map
1703
- mapping = compound_mappings.get(str(item).lower().strip())
2404
+ mapping = compound_mappings.get(item_str)
1704
2405
  if mapping and mapping.iupac_name:
1705
2406
  enhanced_products.append(mapping.iupac_name)
2407
+ LOGGER.info("Mapped product '%s' -> '%s'", item, mapping.iupac_name)
1706
2408
  elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
1707
- # Keep valid IUPAC names
2409
+ # Keep valid IUPAC names that aren't compound IDs
1708
2410
  enhanced_products.append(str(item))
2411
+ LOGGER.info("Kept product IUPAC name: '%s'", item)
2412
+ else:
2413
+ LOGGER.warning("Could not map product compound ID '%s'", item)
1709
2414
  data["product_iupac_list"] = enhanced_products
2415
+
2416
+ # Also try to enhance using both substrate_list and product_list if they contain compound IDs
2417
+ for list_key, target_key in [("substrate_list", "substrate_iupac_list"), ("product_list", "product_iupac_list")]:
2418
+ if list_key in data and isinstance(data[list_key], list):
2419
+ if target_key not in data or not data[target_key]:
2420
+ enhanced_list = []
2421
+ for item in data[list_key]:
2422
+ item_str = str(item).lower().strip()
2423
+ mapping = compound_mappings.get(item_str)
2424
+ if mapping and mapping.iupac_name:
2425
+ enhanced_list.append(mapping.iupac_name)
2426
+ LOGGER.info("Enhanced %s: mapped '%s' -> '%s'", target_key, item, mapping.iupac_name)
2427
+ if enhanced_list:
2428
+ data[target_key] = enhanced_list
1710
2429
 
1711
2430
  # Validate and convert arrays to semicolon-separated strings for CSV compatibility
1712
2431
  if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
@@ -1753,8 +2472,8 @@ TEXT FROM MANUSCRIPT:
1753
2472
  return data
1754
2473
 
1755
2474
  def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
1756
- """Process a single lineage case - still extract based on location."""
1757
- # Even for single lineage, use location-based extraction
2475
+ """Process a single lineage case - use confidence-based processing."""
2476
+ # Create lineage analysis for single location
1758
2477
  lineage_analysis = {
1759
2478
  'has_multiple_lineages': False,
1760
2479
  'lineage_groups': [{
@@ -1766,7 +2485,7 @@ TEXT FROM MANUSCRIPT:
1766
2485
  }]
1767
2486
  }
1768
2487
 
1769
- return self._process_multiple_lineages([location], enzyme_df, lineage_analysis)
2488
+ return self._process_multiple_lineages_by_confidence([location], enzyme_df, lineage_analysis)
1770
2489
 
1771
2490
  def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
1772
2491
  enzyme_df: pd.DataFrame,
@@ -1781,186 +2500,69 @@ TEXT FROM MANUSCRIPT:
1781
2500
  # If enzyme_df has campaign_id column, we can use it to filter
1782
2501
  has_campaign_info = 'campaign_id' in enzyme_df.columns
1783
2502
 
1784
- # Process locations in order of confidence
1785
- for location in locations:
1786
- if len(variants_with_data) >= len(all_variants):
1787
- LOGGER.info("All variants have data, stopping extraction")
1788
- break
1789
-
1790
- LOGGER.info("\nProcessing location %s (confidence: %d%%)",
1791
- location['location'], location.get('confidence', 0))
2503
+ # Select the most confident source only
2504
+ best_location = None
2505
+ if locations:
2506
+ # Sort by confidence only
2507
+ locations_sorted = sorted(locations, key=lambda x: -x.get('confidence', 0))
2508
+ best_location = locations_sorted[0]
1792
2509
 
1793
- # Extract metrics from this location for ALL enzymes
1794
- metrics_rows = self.extract_metrics_batch(all_enzyme_ids, location['location'])
2510
+ LOGGER.info("Selected primary location: %s (type: %s, confidence: %d%%)",
2511
+ best_location['location'],
2512
+ best_location.get('type', 'unknown'),
2513
+ best_location.get('confidence', 0))
2514
+
2515
+ # Extract metrics from the most confident source only
2516
+ metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
1795
2517
 
1796
2518
  # Filter to valid metrics
1797
2519
  valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
1798
2520
 
1799
2521
  if not valid_metrics:
1800
- LOGGER.warning("No valid metrics found in %s", location['location'])
1801
- continue
2522
+ LOGGER.warning("No valid metrics found in primary location %s", best_location['location'])
2523
+ return pd.DataFrame()
1802
2524
 
1803
- LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), location['location'])
2525
+ LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), best_location['location'])
1804
2526
 
1805
- # Create DataFrame for this location
2527
+ # Create DataFrame for the single best location
1806
2528
  df_location = pd.DataFrame(valid_metrics)
1807
2529
 
1808
- # Track which variants we got data for
1809
- new_variants = set(df_location['enzyme'].tolist()) - variants_with_data
1810
- LOGGER.info("Found data for %d new variants in %s", len(new_variants), location['location'])
1811
- variants_with_data.update(new_variants)
2530
+ # Add metadata about the location
2531
+ df_location['data_location'] = best_location['location']
2532
+ df_location['confidence'] = best_location.get('confidence', 0)
1812
2533
 
1813
- # Determine which campaign/lineage this location represents
1814
- # by checking which variants are present
1815
- location_variants = set(df_location['enzyme'].tolist())
2534
+ LOGGER.info("Successfully extracted data for %d enzymes from primary location", len(df_location))
1816
2535
 
1817
- # If we have campaign info, determine the campaign for this location
1818
- campaign_id = None
1819
- if has_campaign_info:
1820
- # Find which campaign(s) these variants belong to
1821
- if 'enzyme_id' in enzyme_df.columns:
1822
- variant_campaigns = enzyme_df[enzyme_df['enzyme_id'].isin(location_variants)]['campaign_id'].unique()
1823
- else:
1824
- variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
1825
- if len(variant_campaigns) == 1:
1826
- campaign_id = variant_campaigns[0]
1827
- LOGGER.info("Location %s contains variants from campaign: %s",
1828
- location['location'], campaign_id)
1829
- elif len(variant_campaigns) > 1:
1830
- LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
1831
- location['location'], variant_campaigns)
1832
-
1833
- # Extract model reaction specific to this location/campaign
1834
- location_context = f"Location: {location['location']}"
1835
- if location.get('caption'):
1836
- location_context += f"\nCaption: {location['caption']}"
1837
-
1838
- # First find model reaction locations for this campaign/enzyme group
2536
+ # Extract model reaction info once for this location
2537
+ location_context = f"Location: {best_location['location']}"
2538
+ if best_location.get('caption'):
2539
+ location_context += f"\nCaption: {best_location['caption']}"
2540
+
2541
+ # Get enzyme list for model reaction
1839
2542
  location_enzymes = df_location['enzyme'].unique().tolist()
2543
+ # Get model reaction locations for this campaign
1840
2544
  model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
1841
2545
 
1842
- # Try to find model reaction for this specific lineage, passing the locations
1843
- location_model_reaction = self.find_lineage_model_reaction(
1844
- location['location'],
1845
- location_context,
1846
- model_reaction_locations
1847
- )
1848
-
1849
- # Get full model reaction info with IUPAC names
1850
- if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
1851
- model_info = self._extract_lineage_model_info(location_model_reaction)
1852
- else:
1853
- # Fall back to general model reaction extraction
1854
- # Pass the enzyme variants from this location
1855
- model_info = self.gather_model_reaction_info(location_enzymes)
1856
-
1857
- # Add model reaction info to all enzymes from this location
1858
- for key, value in model_info.items():
1859
- if isinstance(value, list):
1860
- value = "; ".join(str(v) for v in value) if value else None
1861
- df_location[key] = value
1862
-
1863
- # Add location and campaign info
1864
- df_location['data_location'] = location['location']
1865
- df_location['location_type'] = location.get('type', 'unknown')
1866
- df_location['location_confidence'] = location.get('confidence', 0)
1867
- # Remove lineage_group column - not working properly
1868
- # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')
1869
-
1870
- all_results.append(df_location)
1871
-
1872
- # Log progress
1873
- LOGGER.info("Progress: %d/%d variants have data",
1874
- len(variants_with_data), len(all_variants))
1875
-
1876
- if all_results:
1877
- # Combine all results
1878
- df_combined = pd.concat(all_results, ignore_index=True)
1879
-
1880
- # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
1881
- if df_combined.duplicated(subset=['enzyme']).any():
1882
- LOGGER.info("Removing duplicates, keeping highest confidence data")
1883
- df_combined = df_combined.sort_values(
1884
- ['enzyme', 'location_confidence'],
1885
- ascending=[True, False]
1886
- ).drop_duplicates(subset=['enzyme'], keep='first')
1887
-
1888
- # Log extraction summary
1889
- LOGGER.info("Extraction complete: %d unique variants from %d locations",
1890
- len(df_combined), len(all_results))
2546
+ # Extract model reaction for this location - use unified approach
2547
+ LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
1891
2548
 
1892
- if 'data_location' in df_combined.columns:
1893
- for location in df_combined['data_location'].unique():
1894
- location_enzymes = df_combined[df_combined['data_location'] == location]
1895
- LOGGER.info(" - %s: %d enzymes", location, len(location_enzymes))
1896
-
1897
- return df_combined
1898
- else:
1899
- LOGGER.warning("No metrics extracted from any location")
1900
- return pd.DataFrame()
1901
-
1902
- def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
1903
- enzyme_df: pd.DataFrame,
1904
- lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
1905
- """Process multiple lineages where each location represents a different model reaction."""
1906
- all_metrics = []
1907
- lineage_groups = lineage_analysis.get('lineage_groups', [])
1908
-
1909
- # Get all enzyme IDs for extraction attempts
1910
- all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []
1911
-
1912
- for group in lineage_groups:
1913
- group_location = group.get('data_location')
1914
- group_id = group.get('group_id')
1915
-
1916
- # Find the location info
1917
- location_info = next((loc for loc in locations if loc['location'] == group_location), None)
1918
- if not location_info:
1919
- LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
1920
- continue
1921
-
1922
- LOGGER.info("Processing location %s (%s)", group_location, group_id)
1923
-
1924
- # Extract metrics from this location for ALL enzymes
1925
- # The extractor will return only those that actually have data
1926
- metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)
1927
-
1928
- # Filter to enzymes that actually had data in this location
1929
- valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
1930
-
1931
- if not valid_metrics:
1932
- LOGGER.warning("No valid metrics found in %s", group_location)
1933
- continue
1934
-
1935
- LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)
1936
-
1937
- # Create DataFrame for this location
1938
- df_location = pd.DataFrame(valid_metrics)
1939
-
1940
- # Extract model reaction specific to this location
1941
- # Different locations = different model reactions
1942
- location_context = f"Location: {group_location}"
1943
- if group.get('caption'):
1944
- location_context += f"\nCaption: {group['caption']}"
1945
-
1946
- # First find model reaction locations for this enzyme group
1947
- location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
1948
- model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
1949
-
1950
- # Try to find model reaction for this specific lineage, passing the locations
2549
+ # Try lineage-specific extraction first
1951
2550
  location_model_reaction = self.find_lineage_model_reaction(
1952
- group_location,
2551
+ best_location['location'],
1953
2552
  location_context,
1954
2553
  model_reaction_locations
1955
2554
  )
1956
2555
 
1957
- # Get full model reaction info with IUPAC names
2556
+ # Check if lineage extraction was successful
1958
2557
  if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
1959
- model_info = self._extract_lineage_model_info(location_model_reaction)
2558
+ LOGGER.info("Using lineage-specific model reaction data")
2559
+ model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
1960
2560
  else:
1961
- # Try to extract model reaction from this specific location
1962
- # Pass the enzyme variants that have data in this location
2561
+ LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
2562
+ # Use the comprehensive multimodal approach as fallback
1963
2563
  model_info = self.gather_model_reaction_info(location_enzymes)
2564
+
2565
+ LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
1964
2566
 
1965
2567
  # Add model reaction info to all enzymes from this location
1966
2568
  for key, value in model_info.items():
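
The rewritten path above replaces the old iterate-over-all-locations loop with a single highest-confidence pick. The selection rule on its own (function name illustrative; ties go to the first candidate, matching a stable descending sort):

from typing import Any, Dict, List, Optional

def pick_primary_location(locations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    if not locations:
        return None
    # Highest confidence wins; missing confidence counts as 0.
    return max(locations, key=lambda loc: loc.get("confidence", 0))

locs = [{"location": "Table S2", "confidence": 70},
        {"location": "Figure 3", "confidence": 95}]
assert pick_primary_location(locs)["location"] == "Figure 3"
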
@@ -1968,30 +2570,18 @@ TEXT FROM MANUSCRIPT:
1968
2570
  value = "; ".join(str(v) for v in value) if value else None
1969
2571
  df_location[key] = value
1970
2572
 
1971
- # Add location identifier
1972
- df_location['data_location'] = group_location
1973
- # Remove lineage_group column - not working properly
1974
- # df_location['lineage_group'] = group.get('lineage_hint', group_id)
1975
-
1976
- all_metrics.append(df_location)
1977
-
1978
- if all_metrics:
1979
- # Combine all metrics
1980
- df_combined = pd.concat(all_metrics, ignore_index=True)
2573
+ # Add additional location metadata (data_location already set above)
2574
+ df_location['location_type'] = best_location.get('type', 'unknown')
2575
+ df_location['location_confidence'] = best_location.get('confidence', 0)
1981
2576
 
1982
- # Log extraction summary
1983
- LOGGER.info("Extraction complete: %d total enzymes from %d locations",
1984
- len(df_combined), len(all_metrics))
2577
+ LOGGER.info("Extraction complete: %d variants from primary location %s",
2578
+ len(df_location), best_location['location'])
1985
2579
 
1986
- if 'data_location' in df_combined.columns:
1987
- for location in df_combined['data_location'].unique():
1988
- location_enzymes = df_combined[df_combined['data_location'] == location]
1989
- LOGGER.info(" - %s: %d enzymes", location, len(location_enzymes))
1990
-
1991
- return df_combined
1992
- else:
1993
- LOGGER.warning("No metrics extracted from any location")
1994
- return pd.DataFrame()
2580
+ return df_location
2581
+
2582
+ # No locations found
2583
+ LOGGER.warning("No valid locations found for extraction")
2584
+ return pd.DataFrame()
1995
2585
 
1996
2586
  def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
1997
2587
  """Check if a metrics row contains any valid performance data."""
@@ -2078,14 +2668,18 @@ TEXT FROM MANUSCRIPT:
2078
2668
 
2079
2669
  return filtered
2080
2670
 
2081
- def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
2671
+ def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any], enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
2082
2672
  """Extract full model reaction info including IUPAC names for a lineage."""
2083
2673
  # Get substrate/product IDs from lineage-specific extraction
2084
2674
  substrate_ids = lineage_reaction.get('substrate_ids', [])
2085
2675
  product_ids = lineage_reaction.get('product_ids', [])
2086
2676
 
2087
- # Get general model reaction info for conditions
2088
- general_info = self.gather_model_reaction_info()
2677
+ # Get general model reaction info for conditions, using lineage-specific compound IDs
2678
+ lineage_ids = {
2679
+ "substrate_ids": substrate_ids,
2680
+ "product_ids": product_ids
2681
+ }
2682
+ general_info = self.gather_model_reaction_info(enzyme_variants, lineage_compound_ids=lineage_ids)
2089
2683
 
2090
2684
  # Override substrate/product lists with lineage-specific ones only if they contain actual compound IDs
2091
2685
  model_info = general_info.copy()
@@ -2231,6 +2825,9 @@ TEXT FROM MANUSCRIPT:
2231
2825
  LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
2232
2826
  return pd.DataFrame()
2233
2827
 
2828
+ # Store enzyme_df for use in extract_metrics_batch
2829
+ self.enzyme_df = enzyme_df
2830
+
2234
2831
  # Check if we have campaign_id column - if so, process each campaign separately
2235
2832
  if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
2236
2833
  campaigns = enzyme_df['campaign_id'].unique()
@@ -2249,15 +2846,16 @@ TEXT FROM MANUSCRIPT:
2249
2846
  si=self.si,
2250
2847
  cfg=self.cfg,
2251
2848
  debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
2252
- campaign_filter=campaign_id
2849
+ campaign_filter=campaign_id,
2850
+ all_campaigns=campaigns.tolist()
2253
2851
  )
2254
2852
 
2255
2853
  # Run extraction for this campaign
2256
2854
  campaign_df = campaign_extractor.run(enzyme_df)
2257
2855
 
2258
2856
  if not campaign_df.empty:
2259
- # Add campaign identifier
2260
- campaign_df['campaign_id'] = campaign_id
2857
+ # Add a temporary campaign identifier for merging
2858
+ campaign_df['_extraction_campaign'] = campaign_id
2261
2859
  all_campaign_results.append(campaign_df)
2262
2860
  LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)
2263
2861
 
@@ -2328,7 +2926,7 @@ TEXT FROM MANUSCRIPT:
2328
2926
  def merge_with_lineage_data(
2329
2927
  df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
2330
2928
  ) -> pd.DataFrame:
2331
- """Outer-merge on 'enzyme' column. Left CSV defines desired row order."""
2929
+ """Merge lineage and metrics data ensuring one-to-one mapping per campaign."""
2332
2930
 
2333
2931
  # Handle both 'enzyme' and 'enzyme_id' column names
2334
2932
  if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
@@ -2337,7 +2935,19 @@ def merge_with_lineage_data(
2337
2935
  if "enzyme" not in df_lineage.columns:
2338
2936
  raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")
2339
2937
 
2340
- merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
2938
+ # Check if we have campaign information to match on
2939
+ if "campaign_id" in df_lineage.columns and "_extraction_campaign" in df_metrics.columns:
2940
+ # Match on both enzyme and campaign to ensure correct pairing
2941
+ df_metrics_temp = df_metrics.copy()
2942
+ df_metrics_temp['campaign_id'] = df_metrics_temp['_extraction_campaign']
2943
+ df_metrics_temp = df_metrics_temp.drop('_extraction_campaign', axis=1)
2944
+ merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
2945
+ else:
2946
+ # Simple merge on enzyme only
2947
+ if "_extraction_campaign" in df_metrics.columns:
2948
+ df_metrics = df_metrics.drop('_extraction_campaign', axis=1)
2949
+ merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
2950
+
2341
2951
  return merged
2342
2952
 
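
A toy illustration of why the campaign-aware merge above matters: the same enzyme name in two campaigns must pick up metrics from its own campaign only, which a plain merge on `enzyme` cannot guarantee (the data below is made up):

import pandas as pd

df_lineage = pd.DataFrame({"enzyme": ["VarA", "VarA"],
                           "campaign_id": ["campaign_A", "campaign_B"]})
df_metrics = pd.DataFrame({"enzyme": ["VarA", "VarA"],
                           "_extraction_campaign": ["campaign_A", "campaign_B"],
                           "yield": ["90%", "15%"]})

# Rename the temporary marker column, then merge on both keys.
df_metrics = df_metrics.rename(columns={"_extraction_campaign": "campaign_id"})
merged = df_lineage.merge(df_metrics, on=["enzyme", "campaign_id"], how="left")
print(merged)  # each row carries the yield from its own campaign
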
2343
2953
  ###############################################################################
@@ -2366,25 +2976,103 @@ def main() -> None:
2366
2976
  if args.verbose:
2367
2977
  LOGGER.setLevel(logging.DEBUG)
2368
2978
  cfg = Config()
2369
- extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)
2370
2979
 
2371
- # Load enzyme data from CSV if provided
2980
+ # Load enzyme data from CSV if provided to detect campaign information
2372
2981
  enzyme_df = None
2982
+ campaign_filter = None
2983
+ all_campaigns = None
2984
+
2373
2985
  if args.lineage_csv and args.lineage_csv.exists():
2374
2986
  LOGGER.info("Loading enzyme data from CSV…")
2375
2987
  enzyme_df = pd.read_csv(args.lineage_csv)
2376
-
2377
- # Run extraction with enzyme data
2378
- df_metrics = extractor.run(enzyme_df)
2988
+
2989
+ # Detect campaign information from the enzyme CSV
2990
+ if 'campaign_id' in enzyme_df.columns:
2991
+ all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
2992
+ if len(all_campaigns) == 1:
2993
+ campaign_filter = all_campaigns[0]
2994
+ LOGGER.info("Detected single campaign: %s", campaign_filter)
2995
+
2996
+ extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
2997
+ campaign_filter=campaign_filter, all_campaigns=all_campaigns)
2998
+ df_metrics = extractor.run(enzyme_df)
2999
+
3000
+ elif len(all_campaigns) > 1:
3001
+ LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
3002
+ all_results = []
3003
+
3004
+ # Process each campaign separately
3005
+ for campaign in all_campaigns:
3006
+ LOGGER.info("Processing campaign: %s", campaign)
3007
+
3008
+ # Filter enzyme_df to this campaign
3009
+ campaign_df = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
3010
+ LOGGER.info("Found %d enzymes for campaign %s", len(campaign_df), campaign)
3011
+
3012
+ if len(campaign_df) == 0:
3013
+ LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
3014
+ continue
3015
+
3016
+ # Create extractor for this campaign
3017
+ extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
3018
+ campaign_filter=campaign, all_campaigns=all_campaigns)
3019
+
3020
+ # Run extraction for this campaign
3021
+ campaign_metrics = extractor.run(campaign_df)
3022
+
3023
+ if not campaign_metrics.empty:
3024
+ # Merge with lineage data for this campaign
3025
+ campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
3026
+ if "enzyme_id" in campaign_lineage.columns and "enzyme" not in campaign_lineage.columns:
3027
+ campaign_lineage = campaign_lineage.rename(columns={"enzyme_id": "enzyme"})
3028
+
3029
+ # Merge campaign metrics with lineage data
3030
+ campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
3031
+
3032
+ # Save campaign-specific file immediately
3033
+ output_dir = args.output.parent
3034
+ base_name = args.output.stem
3035
+ campaign_file = output_dir / f"{base_name}_{campaign}.csv"
3036
+ campaign_final.to_csv(campaign_file, index=False)
3037
+ LOGGER.info("Saved %d rows for campaign %s -> %s", len(campaign_final), campaign, campaign_file)
3038
+
3039
+ # Add the merged data (not just metrics) to final results
3040
+ all_results.append(campaign_final)
3041
+ LOGGER.info("Added %d merged results for campaign %s", len(campaign_final), campaign)
3042
+ else:
3043
+ LOGGER.warning("No results extracted for campaign %s", campaign)
3044
+
3045
+ # Still save a campaign file containing lineage data only (no metrics)
3046
+ campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
3047
+ if not campaign_lineage.empty:
3048
+ output_dir = args.output.parent
3049
+ base_name = args.output.stem
3050
+ campaign_file = output_dir / f"{base_name}_{campaign}.csv"
3051
+ campaign_lineage.to_csv(campaign_file, index=False)
3052
+ LOGGER.info("Saved %d rows (lineage only) for campaign %s -> %s", len(campaign_lineage), campaign, campaign_file)
3053
+
3054
+ # Combine all campaign results
3055
+ if all_results:
3056
+ df_metrics = pd.concat(all_results, ignore_index=True)
3057
+ LOGGER.info("Combined results from %d campaigns: %d total rows", len(all_results), len(df_metrics))
3058
+ else:
3059
+ LOGGER.warning("No results from any campaign")
3060
+ df_metrics = pd.DataFrame()
3061
+ else:
3062
+ # No campaign information, process all enzymes together
3063
+ extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
3064
+ campaign_filter=campaign_filter, all_campaigns=all_campaigns)
3065
+ df_metrics = extractor.run(enzyme_df)
2379
3066
 
2380
- if args.lineage_csv and args.lineage_csv.exists() and not df_metrics.empty:
2381
- LOGGER.info("Merging with lineage CSV…")
2382
- df_final = merge_with_lineage_data(enzyme_df, df_metrics)
2383
- else:
2384
- df_final = df_metrics
3067
+ # Skip the final lineage merge: the multi-campaign path above already merged per campaign,
3068
+ # so re-merging here would duplicate enzymes that appear in more than one campaign
3069
+ df_final = df_metrics
3070
+ LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
2385
3071
 
2386
3072
  df_final.to_csv(args.output, index=False)
2387
3073
  LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
3074
+
3075
+ # Campaign-specific files are already saved during processing above
2388
3076
 
2389
3077
  if __name__ == "__main__":
2390
3078
  main()
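
For reference, the per-campaign CSVs written in main() sit next to the combined output file and append the campaign ID to its stem; the path rule in isolation (function name illustrative):

from pathlib import Path

def campaign_output_path(output: Path, campaign_id: str) -> Path:
    # <output dir>/<output stem>_<campaign>.csv
    return output.parent / f"{output.stem}_{campaign_id}.csv"

assert campaign_output_path(Path("results/metrics.csv"), "campaign_A") == \
       Path("results/metrics_campaign_A.csv")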