debase 0.1.19__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,7 @@ Key June 2025 additions
 from __future__ import annotations
 
 import argparse
+import hashlib
 import json
 import logging
 import os
@@ -31,7 +32,9 @@ import re
 import sys
 import time
 from base64 import b64encode, b64decode
+from collections import OrderedDict
 from dataclasses import dataclass, field
+from functools import lru_cache
 from pathlib import Path
 from textwrap import dedent
 from typing import Any, Dict, List, Optional, Tuple
@@ -40,7 +43,6 @@ import fitz # PyMuPDF - for image extraction
 import google.generativeai as genai  # type: ignore
 import pandas as pd
 from PyPDF2 import PdfReader
-import PIL.Image
 import io
 
 ###############################################################################
@@ -51,7 +53,7 @@ import io
 class Config:
     """Centralised tunables so tests can override them easily."""
 
-    model_name: str = "gemini-1.5-pro-latest"
+    model_name: str = "gemini-2.5-flash"
     location_temperature: float = 0.2
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
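Because Config keeps its tunables as plain defaults, a caller can adjust them per instance before building a model handle. A minimal usage sketch (hypothetical call site; only Config, get_model, and the GEMINI_API_KEY requirement appear in this diff):

    cfg = Config()
    cfg.model_name = "gemini-2.5-pro"   # override the new default for one run
    cfg.extract_temperature = 0.2       # loosen the deterministic extraction setting
    model = get_model(cfg)              # needs GEMINI_API_KEY in the environment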
@@ -117,6 +119,144 @@ def get_model(cfg: Config):
     genai.configure(api_key=api_key)
     return genai.GenerativeModel(cfg.model_name)
 
+# Bounded LRU caches to store prompt/image content by hash (prevents memory leaks)
+
+class LRUCache:
+    """Simple LRU cache implementation."""
+    def __init__(self, maxsize: int):
+        self.maxsize = maxsize
+        self.cache = OrderedDict()
+
+    def get(self, key: str) -> Optional[str]:
+        if key in self.cache:
+            # Move to end (most recently used)
+            self.cache.move_to_end(key)
+            return self.cache[key]
+        return None
+
+    def put(self, key: str, value: str) -> None:
+        if key in self.cache:
+            # Update existing
+            self.cache.move_to_end(key)
+        else:
+            # Add new, evict oldest if needed
+            if len(self.cache) >= self.maxsize:
+                self.cache.popitem(last=False)
+        self.cache[key] = value
+
+    def __len__(self) -> int:
+        return len(self.cache)
+
+# Global bounded caches
+_PROMPT_CACHE = LRUCache(maxsize=1000)
+_IMAGE_CACHE = LRUCache(maxsize=500)  # Images are larger, so smaller cache
+
+def get_cache_stats() -> Dict[str, Any]:
+    """Get cache statistics for debugging."""
+    return {
+        "gemini_cache_info": _cached_gemini_call.cache_info(),
+        "prompt_cache_size": len(_PROMPT_CACHE),
+        "image_cache_size": len(_IMAGE_CACHE),
+    }
+
+@lru_cache(maxsize=1000)
+def _cached_gemini_call(
+    model_name: str,
+    prompt_hash: str,
+    image_hash: Optional[str],
+    temperature: float,
+    max_retries: int,
+) -> str:
+    """Pure cached function for Gemini API calls using only hash keys.
+
+    Args:
+        model_name: Name of the Gemini model
+        prompt_hash: SHA256 hash of the prompt
+        image_hash: SHA256 hash of the image (if any)
+        temperature: Temperature for generation
+        max_retries: Maximum number of retries
+
+    Returns:
+        Raw response text from Gemini
+    """
+    # Retrieve actual content from LRU cache
+    prompt = _PROMPT_CACHE.get(prompt_hash)
+    image_b64 = _IMAGE_CACHE.get(image_hash) if image_hash else None
+
+    if prompt is None:
+        raise RuntimeError(f"Prompt content not found for hash {prompt_hash}")
+
+    # Configure API key (this is idempotent)
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
+    genai.configure(api_key=api_key)
+
+    # Create model instance (not cached since it's lightweight)
+    model = genai.GenerativeModel(model_name)
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Handle image if provided
+            if image_b64:
+                # Decode base64 string to bytes for Gemini API
+                image_bytes = b64decode(image_b64)
+                parts = [prompt, {"mime_type": "image/png", "data": image_bytes}]
+            else:
+                parts = [prompt]
+
+            resp = model.generate_content(
+                parts,
+                generation_config={
+                    "temperature": temperature,
+                    "max_output_tokens": 8192,
+                }
+            )
+            # Track token usage if available
+            try:
+                if hasattr(resp, 'usage_metadata'):
+                    input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
+            return resp.text.strip()
+        except Exception as exc:
+            if attempt == max_retries:
+                raise
+            time.sleep(2 ** attempt)
+
+    # Should never reach here
+    raise RuntimeError("Max retries exceeded")
+
+def _normalize_prompt_for_caching(prompt: str) -> str:
+    """Normalize prompt for better cache hit rates by removing boilerplate and collapsing whitespace."""
+    # Remove common boilerplate lines that don't affect the core query
+    lines = prompt.split('\n')
+    normalized_lines = []
+
+    for line in lines:
+        # Skip timestamp and debug lines
+        if any(skip in line.lower() for skip in ['timestamp:', 'length:', 'characters', '===', '***']):
+            continue
+        # Skip lines that are just separators
+        if line.strip() and not line.strip().replace('=', '').replace('-', '').replace('*', ''):
+            continue
+        # Collapse whitespace but preserve structure
+        normalized_lines.append(' '.join(line.split()))
+
+    # Join and collapse multiple newlines
+    normalized = '\n'.join(normalized_lines)
+    normalized = re.sub(r'\n\s*\n+', '\n\n', normalized)
+
+    return normalized.strip()
+
 def generate_json_with_retry(
     model,
     prompt: str,
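The split between functools.lru_cache and the hand-rolled LRUCache is deliberate: lru_cache memoises _cached_gemini_call on small, hashable SHA-256 strings, while the bulky prompt and image payloads live in the bounded LRUCache objects under those same hashes, so neither cache pins large blobs indefinitely. A self-contained sketch of the pattern, with a toy fetch() standing in for the Gemini call:

    import hashlib
    from collections import OrderedDict
    from functools import lru_cache

    class LRUCache:
        def __init__(self, maxsize):
            self.maxsize, self.cache = maxsize, OrderedDict()
        def get(self, key):
            if key in self.cache:
                self.cache.move_to_end(key)      # mark as most recently used
                return self.cache[key]
            return None
        def put(self, key, value):
            if key in self.cache:
                self.cache.move_to_end(key)
            elif len(self.cache) >= self.maxsize:
                self.cache.popitem(last=False)   # evict least recently used
            self.cache[key] = value

    _STORE = LRUCache(maxsize=2)

    @lru_cache(maxsize=8)
    def fetch(key_hash):                 # memoised on the 64-char digest, not the payload
        return f"processed: {_STORE.get(key_hash)}"

    payload = "a very large prompt body"
    h = hashlib.sha256(payload.encode()).hexdigest()
    _STORE.put(h, payload)
    print(fetch(h))   # first call: computes
    print(fetch(h))   # second call: served from lru_cache

One caveat visible in the real code: if a digest is evicted from the content cache before the memoised function first runs, _cached_gemini_call raises RuntimeError rather than silently recomputing, which is why the content caches are sized generously (1000 prompts, 500 images).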
@@ -129,9 +269,17 @@ def generate_json_with_retry(
     image_b64: Optional[str] = None,
 ):
     """Call Gemini with retries & exponential back-off, returning parsed JSON."""
+    # Generate cache keys based on normalized prompt and image content
+    normalized_prompt = _normalize_prompt_for_caching(prompt)
+    prompt_hash = hashlib.sha256(normalized_prompt.encode()).hexdigest()
+    image_hash = hashlib.sha256(image_b64.encode()).hexdigest() if image_b64 else None
+
     # Log prompt details
     LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
     LOGGER.info("Prompt length: %d characters", len(prompt))
+    LOGGER.info("Prompt hash: %s", prompt_hash[:16])
+    if image_hash:
+        LOGGER.info("Image hash: %s", image_hash[:16])
     LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
 
     # Save full prompt to debug directory if provided
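Hashing the normalized prompt rather than the raw text means two calls that differ only in volatile boilerplate (timestamps, length banners, separator rules) collapse onto the same cache key. A small, self-contained illustration with invented prompts:

    import hashlib, re

    def normalize(prompt):
        keep = []
        for line in prompt.split('\n'):
            low = line.lower()
            if any(s in low for s in ['timestamp:', 'length:', 'characters', '===', '***']):
                continue                          # drop volatile boilerplate
            keep.append(' '.join(line.split()))   # collapse runs of whitespace
        return re.sub(r'\n\s*\n+', '\n\n', '\n'.join(keep)).strip()

    a = "Timestamp: 2025-06-01\nExtract   yield data"
    b = "Timestamp: 2025-06-02\nExtract yield data"
    assert hashlib.sha256(normalize(a).encode()).hexdigest() == \
           hashlib.sha256(normalize(b).encode()).hexdigest()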
@@ -139,107 +287,142 @@ def generate_json_with_retry(
         debug_path = Path(debug_dir)
         debug_path.mkdir(parents=True, exist_ok=True)
         prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
-        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\n{'='*80}\n\n{prompt}",
+        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{prompt}",
               prompt_file)
         LOGGER.info("Full prompt saved to: %s", prompt_file)
 
-    fence_re = re.compile(r"```json|```", re.I)
-    for attempt in range(1, max_retries + 1):
-        try:
-            LOGGER.info("Calling Gemini API (attempt %d/%d)...", attempt, max_retries)
-
-            # Handle image if provided
-            if image_b64:
-                parts = [prompt, {"mime_type": "image/png", "data": image_b64}]
-            else:
-                parts = [prompt]
-
-            resp = model.generate_content(
-                parts,
-                generation_config={
-                    "temperature": temperature,
-                    "max_output_tokens": 4096,
-                }
-            )
-            raw = resp.text.strip()
-
-            # Log response
-            LOGGER.info("Gemini response length: %d characters", len(raw))
-            LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
-
-            # Save full response to debug directory
-            if debug_dir:
-                response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
-                _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}",
-                      response_file)
-                LOGGER.info("Full response saved to: %s", response_file)
+    try:
+        # Store content in bounded LRU caches for the cached function to retrieve
+        _PROMPT_CACHE.put(prompt_hash, prompt)
+        if image_hash and image_b64:
+            _IMAGE_CACHE.put(image_hash, image_b64)
+
+        # Check if this will be a cache hit
+        cache_info_before = _cached_gemini_call.cache_info()
+
+        # Use cached Gemini call (only with hash keys)
+        LOGGER.info("Calling cached Gemini API...")
+        raw = _cached_gemini_call(
+            model_name=model.model_name,
+            prompt_hash=prompt_hash,
+            image_hash=image_hash,
+            temperature=temperature,
+            max_retries=max_retries,
+        )
+
+        # Log cache performance
+        cache_info_after = _cached_gemini_call.cache_info()
+        if cache_info_after.hits > cache_info_before.hits:
+            LOGGER.info("✓ Cache HIT for prompt hash %s", prompt_hash[:16])
+        else:
+            LOGGER.info("✗ Cache MISS for prompt hash %s", prompt_hash[:16])
+
+        # Log response
+        LOGGER.info("Gemini response length: %d characters", len(raw))
+        LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
+
+        # Save full response to debug directory
+        if debug_dir:
+            response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+            _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{raw}",
+                  response_file)
+            LOGGER.info("Full response saved to: %s", response_file)
 
-            # Remove common Markdown fences
-            if raw.startswith("```"):
-                raw = fence_re.sub("", raw).strip()
-
-            # Try to find JSON in the response
-            # First, try to parse as-is
+        # Remove common Markdown fences more carefully
+        if raw.startswith("```json"):
+            raw = raw[7:].strip()  # Remove ```json
+        elif raw.startswith("```"):
+            raw = raw[3:].strip()  # Remove ```
+
+        if raw.endswith("```"):
+            raw = raw[:-3].strip()  # Remove trailing ```
+
+        # Simple JSON parsing approach
+        # Try direct parsing first
+        LOGGER.debug(f"Raw JSON length: {len(raw)}")
+        LOGGER.debug(f"Raw JSON first 200 chars: {raw[:200]}")
+        LOGGER.debug(f"Raw JSON last 200 chars: {raw[-200:]}")
+
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError as e:
+            LOGGER.error(f"JSON parsing failed at position {e.pos}: {e}")
+            LOGGER.error(f"Character at error: {repr(raw[e.pos] if e.pos < len(raw) else 'END')}")
+            LOGGER.error(f"Context: {repr(raw[max(0, e.pos-20):e.pos+20])}")
+
+            # Count braces and quotes for debugging
+            open_braces = raw.count('{')
+            close_braces = raw.count('}')
+            quotes = raw.count('"')
+            LOGGER.error(f"Braces: {open_braces} open, {close_braces} close. Quotes: {quotes}")
+
+            # If that fails, try to extract JSON from the response using a simpler method
             try:
-                parsed = json.loads(raw)
-            except json.JSONDecodeError:
-                # If that fails, look for JSON array or object
-                # Find the first '[' or '{' and the matching closing bracket
-                json_start = -1
-                json_end = -1
-                bracket_stack = []
-                in_string = False
-                escape_next = False
+                # Look for the JSON object start and end
+                start_idx = raw.find('{')
+                if start_idx == -1:
+                    raise json.JSONDecodeError("No JSON object found", raw, 0)
 
-                for i, char in enumerate(raw):
-                    if escape_next:
-                        escape_next = False
-                        continue
+                # Find the matching closing brace by counting
+                brace_count = 0
+                end_idx = -1
+                for i in range(start_idx, len(raw)):
+                    if raw[i] == '{':
+                        brace_count += 1
+                    elif raw[i] == '}':
+                        brace_count -= 1
+                        if brace_count == 0:
+                            end_idx = i + 1
+                            break
+
+                if end_idx == -1:
+                    raise json.JSONDecodeError("No matching closing brace found", raw, 0)
+
+                json_str = raw[start_idx:end_idx]
+                LOGGER.debug(f"Extracted JSON string: {json_str[:200]}...")
+                parsed = json.loads(json_str)
+
+            except json.JSONDecodeError:
+                # Final fallback - try to use eval as a last resort (unsafe but functional)
+                try:
+                    # Replace problematic characters and try to parse as Python dict
+                    safe_raw = raw.replace('null', 'None').replace('true', 'True').replace('false', 'False')
+                    start_idx = safe_raw.find('{')
+                    if start_idx == -1:
+                        raise ValueError("No dict found")
 
-                    if char == '\\':
-                        escape_next = True
-                        continue
-
-                    if char == '"' and not escape_next:
-                        in_string = not in_string
-                        continue
+                    brace_count = 0
+                    end_idx = -1
+                    for i in range(start_idx, len(safe_raw)):
+                        if safe_raw[i] == '{':
+                            brace_count += 1
+                        elif safe_raw[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                end_idx = i + 1
+                                break
 
-                    if in_string:
-                        continue
+                    if end_idx == -1:
+                        raise ValueError("No matching closing brace found")
 
-                    if char in '[{':
-                        if json_start == -1:
-                            json_start = i
-                        bracket_stack.append(char)
-                    elif char in ']}':
-                        if bracket_stack:
-                            opening = bracket_stack.pop()
-                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
-                                if not bracket_stack:  # Found complete JSON
-                                    json_end = i + 1
-                                    break
-
-                if json_start >= 0 and json_end > json_start:
-                    # Extract the JSON portion
-                    json_str = raw[json_start:json_end]
-                    parsed = json.loads(json_str)
-                else:
-                    # Look for simple [] in the response
+                    dict_str = safe_raw[start_idx:end_idx]
+                    parsed = eval(dict_str)  # This is unsafe but we trust our own generated content
+                    LOGGER.warning("Used eval() fallback for JSON parsing")
+
+                except Exception:
+                    # If all else fails, return empty dict
+                    LOGGER.error("All JSON parsing methods failed")
                     if '[]' in raw:
                         parsed = []
                     else:
-                        # No JSON structure found, re-raise the original error
                         raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
-            LOGGER.info("Successfully parsed JSON response")
-            return parsed
-        except Exception as exc:
-            LOGGER.warning(
-                "Gemini call failed (attempt %d/%d): %s",
-                attempt, max_retries, exc,
-            )
-            if attempt == max_retries:
-                raise
-            time.sleep(2 ** attempt)
+
+        LOGGER.info("Successfully parsed JSON response")
+        return parsed
+    except Exception as exc:
+        LOGGER.error("Cached Gemini call failed: %s", exc)
+        raise
 
 
 ###############################################################################
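The fence stripping plus brace-counting fallback can be exercised in isolation; a standalone sketch with a made-up model response:

    import json

    def extract_json_object(raw):
        start = raw.find('{')
        if start == -1:
            raise ValueError("no JSON object found")
        depth = 0
        for i in range(start, len(raw)):
            if raw[i] == '{':
                depth += 1
            elif raw[i] == '}':
                depth -= 1
                if depth == 0:
                    return json.loads(raw[start:i + 1])
        raise ValueError("unbalanced braces")

    raw = 'Sure! Here is the data:\n```json\n{"yield": 82.5, "ttn": null}\n```'
    print(extract_json_object(raw))   # {'yield': 82.5, 'ttn': None}

Unlike the removed bracket-stack parser, brace counting does not track string literals, so a stray brace inside a JSON string value can throw the count off. The eval() fallback after it runs on model output, so the code's own "unsafe but functional" comment is worth taking seriously.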
@@ -248,14 +431,14 @@ def generate_json_with_retry(
 
 PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
-Given the following article captions and section titles, identify ALL locations
+Given the following article captions and section titles, identify the most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants.
+activity, etc.) for enzyme variants. Use your best judgement to include locations showing full evolution lineage data.
 
 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
 - The caption text to identify which campaign/lineage the data is for
-- Enzyme name prefixes (e.g., PYS vs INS) that indicate different campaigns
+- Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions
 
 Respond with a JSON array where each element contains:
@@ -267,8 +450,10 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
 
-Tables are preferred over figures when both contain the same data.
-
+Tables are generally preferred over figures, unless you are convinced that only the figure you found has the complete lineage reaction matrix information.
+Do not include too many sources; return just 2 or 3.
+Adjust confidence by comparing all the locations you return, and only rank a figure highest when you are absolutely certain the table won't contain the complete information.
+When returning confidence scores, be precise and avoid scores that are too close together.
 Respond ONLY with **minified JSON**. NO markdown fences.
 
 Example:
@@ -280,7 +465,8 @@ You are given either (a) the PNG image of a figure panel, or (b) the caption /
 text excerpt that contains numeric reaction performance data for an enzyme.
 
 Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
-Return a JSON object with the following keys (use **null** if not found):
+
+Return a JSON object with the following keys (use **null** only if the value is not mentioned at all):
 * "yield" - yield as percentage with ONE decimal place precision
 * "ttn" - turnover number (total turnovers)
 * "ton" - turnover number if TTN not available
@@ -296,45 +482,66 @@ IMPORTANT:
 - Do NOT extract substrate/product names - these will come from SI
 - Do NOT extract reaction conditions (temperature, pH, time, solvent)
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
+- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
 
 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)
 
 PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
-You are analyzing a figure showing enzyme reaction performance data for multiple variants.
-
-Extract performance metrics for ALL the following enzyme variants:
+STEP 1: First, identify ALL X-axis labels in the figure
+- Read each X-axis label from left to right
+- List exactly what text appears under each bar/data point
+- Note: Labels may be abbreviated or use different naming conventions
+
+STEP 2: Match X-axis labels to target enzyme variants
+- Compare each X-axis label against the target enzyme list below
+- Look for partial matches, abbreviations, or similar naming patterns
+- If an X-axis label doesn't match any target enzyme, still include it for completeness
+
+STEP 3: Identify Y-axis scales and what they measure
+- Look at the Y-axis labels and tick marks to understand what each axis measures
+- If there are multiple Y-axes (left and right), read the axis labels and units
+- Note the minimum and maximum values on each axis scale
+- Identify which visual elements (bars, dots, lines) correspond to which axis
+
+STEP 4: Extract values for each matched variant
+- For each X-axis position, identify which visual elements belong to that position
+- LEFT Y-axis (bars): Measure bar height against the left scale by reading tick marks
+- RIGHT Y-axis (dots): Measure dot position against the right scale by reading tick marks
+- CRITICAL: Read actual scale values from the axis labels and tick marks
+- Verify: taller bars should have higher values, higher dots should have higher values
+
+Target enzymes to find and extract:
 {enzyme_names}
 
-Steps:
-1. CHECK THE Y-AXIS SCALE: What is the maximum value? (e.g., 10%, 30%, 50%, 100%)
-2. For each enzyme variant listed above:
-   - Find its position on the X-axis
-   - Read the bar height or data point value
-   - Calculate the actual value based on the Y-axis scale
-3. Compare all bars to understand relative performance
-
-Return a JSON object with enzyme names as keys, each containing:
-* "yield" - yield with ONE decimal place precision
-* "ttn" - turnover number if shown
-* "ton" - turnover number if TTN not available
-* "selectivity" - ee or er value with unit
-* "conversion" - conversion percentage if different from yield
-* "tof" - turnover frequency if provided
-* "activity" - specific activity if provided
-* "other_metrics" - dictionary of any other metrics
-* "notes" - any relevant notes (including reaction type if different reactions are shown)
-
-CRITICAL:
-- Read ALL pages provided in the image
-- If different enzymes are tested for different reactions (e.g., pyrrolidine vs indoline synthesis), note this in "notes"
-- For tables, check if data continues beyond what's shown
-- Read the Y-axis scale carefully for figures
-
-Example format:
-{{"ApePgb LVQ": {{"yield": 0.0, "ttn": null, "notes": "pyrrolidine synthesis", ...}}, ...}}
-
-Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
+Instructions:
+1. First, list ALL X-axis labels you can see in the figure
+2. Match each X-axis label to the target enzyme variants
+3. For matched variants, extract both bar heights (left Y-axis) and dot positions (right Y-axis)
+4. Return data only for variants that have clear X-axis labels and are matched to targets
+
+Return JSON with the identified enzyme variant names as keys containing:
+* "x_axis_label" - the exact text from the X-axis for this variant
+* "yield" - percentage from left Y-axis bar height measurement
+* "ttn" - turnover number from right Y-axis dot position measurement
+* "ton" - if TTN not available
+* "selectivity" - if shown
+* "conversion" - if different from yield
+* "tof" - if provided
+* "activity" - if provided
+* "other_metrics" - other metrics
+* "notes" - REQUIRED: Describe the X-axis label, bar position, and dot position (e.g., "X-axis shows P411-CIS, leftmost bar is very short, dot is at bottom")
+
+CRITICAL: Return ONLY valid JSON in this exact format:
+{{"enzyme_name": {{"x_axis_label": "label", "yield": number, "ttn": number, "notes": "description"}}}}
+
+Rules:
+- Use double quotes for all strings
+- No markdown, no commentary, no explanations
+- All values must be properly formatted
+- Ensure JSON is complete and valid
+- Do not truncate or cut off the response
+- IMPORTANT: When extracting data, prioritize the most complete source that shows data for ALL variants. If there are conflicting values between different sources (e.g., bar graph vs text values), use the source that provides complete data for all target enzymes and ignore partial or conflicting values from other sources
 """)
 
 # Removed substrate scope IUPAC extraction - now handled in model reaction only
@@ -519,13 +726,25 @@ class ReactionExtractor:
     _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
 
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
-                 campaign_filter: Optional[str] = None):
+                 campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
         self.manuscript = manuscript
         self.si = si
         self.cfg = cfg
         self.model = get_model(cfg)
         self.debug_dir = debug_dir
         self.campaign_filter = campaign_filter  # Filter for specific campaign
+        self.all_campaigns = all_campaigns or []  # List of all campaigns for context
+
+        # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
+        self._figure_cache = LRUCache(maxsize=100)  # Figures are large, so smaller cache
+        self._model_reaction_locations_cache = LRUCache(maxsize=50)
+
+        # Cache for compound mappings to avoid repeated API calls (bounded to prevent memory leaks)
+        self._compound_mapping_cache = LRUCache(maxsize=1000)
+        self._compound_mapping_text_cache = LRUCache(maxsize=500)  # Cache text extractions too
+
+        # Cache for reaction locations to avoid repeated API calls (bounded to prevent memory leaks)
+        self._reaction_locations_cache = LRUCache(maxsize=50)
 
         # Create debug directory if specified
         if self.debug_dir:
@@ -551,23 +770,40 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
 
     def _collect_captions_and_titles(self) -> str:
-        # Simpler pattern: match any line starting with Table or Figure
-        # This catches all variations like "Table S 2", "Table.", "Figure S1", etc.
-        cap_pattern = re.compile(r"^(Table|Figure).*", re.I | re.M)
+        # Pattern to match Table or Figure with optional leading whitespace
+        # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
+        # Also handles cases where there's whitespace before the caption
+        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []
 
         # Collect from all pages
         all_text = "\n".join(self.all_pages)
 
-        # Find all figure/table captions
+        # Find all figure/table captions with more context
        for match in cap_pattern.finditer(all_text):
             caption_start = match.start()
-            # Get up to 1200 chars or until double newline
+
+            # Include some context before the caption (up to 200 chars)
+            context_start = max(0, caption_start - 200)
+            # Find the start of the sentence/paragraph before the caption
+            context_text = all_text[context_start:caption_start]
+            last_period = context_text.rfind('.')
+            if last_period != -1:
+                context_start = context_start + last_period + 1
+
+            # For tables, include much more content after the caption to show actual table data
+            # For figures, keep the original limit
+            is_table = match.group(1).lower() == 'table'
+            max_chars = 5000 if is_table else 3000
+
+            # Get up to max_chars or until double newline
             caption_end = all_text.find("\n\n", caption_start)
-            if caption_end == -1 or caption_end - caption_start > 1200:
-                caption_end = caption_start + 1200
-            caption = all_text[caption_start:caption_end].strip()
-            captions.append(caption)
+            if caption_end == -1 or caption_end - caption_start > max_chars:
+                caption_end = caption_start + max_chars
+
+            # Include the context and full caption with table content
+            full_caption = all_text[context_start:caption_end].strip()
+            captions.append(full_caption)
 
         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
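A quick check of the widened caption pattern against strings the old ^(Table|Figure) regex would have missed: indented captions and "Supplementary Table" headings (sample text invented):

    import re

    cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)

    text = (
        "  Table S 2. Kinetic parameters for all variants.\n"
        "Supplementary Table 4 | Evolution lineage.\n"
        "Figure 1. Directed evolution campaign overview."
    )

    for m in cap_pattern.finditer(text):
        print(m.group(0).strip())
    # Table S 2. Kinetic parameters for all variants.
    # Supplementary Table 4 | Evolution lineage.
    # Figure 1. Directed evolution campaign overview.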
@@ -584,15 +820,47 @@ class ReactionExtractor:
 
     def find_reaction_locations(self) -> List[Dict[str, Any]]:
         """Find all locations containing reaction performance data."""
-        # Add campaign context if available
+        # Create cache key based on campaign filter
+        cache_key = f"locations_{self.campaign_filter or 'all'}"
+
+        # Check cache first
+        cached_result = self._reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+            return cached_result
+
+        # Add campaign context - always provide context to help model understanding
         campaign_context = ""
         if self.campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+Be extremely careful to only extract data for the {self.campaign_filter} campaign.
+"""
+
             campaign_context = f"""
-IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
-Only return locations that contain data for this specific campaign.
-Ignore locations that contain data for other campaigns.
+IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
+Only return locations that contain data for this specific campaign.
+Ignore locations that contain data for other campaigns.
+{campaigns_warning}
 
-"""
+"""
+        else:
+            # Even for single campaigns, provide context about what to look for
+            campaign_context = f"""
+IMPORTANT: You are looking for performance data showing enzyme evolution progression.
+Look for locations that contain actual performance metrics (yield, TTN, TON, activity, etc.)
+for multiple enzyme variants, not just mutation lists or method descriptions.
+
+Tables may only contain mutation information without performance data - check the actual
+table content below the caption to verify if performance metrics are present.
+Figures with evolutionary lineage data often contain the actual performance matrix.
+
+"""
 
         prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
         try:
@@ -604,13 +872,20 @@ Ignore locations that contain data for other campaigns.
                 tag="find_locations"
             )
             # Handle both single dict (backwards compatibility) and list
+            result = []
             if isinstance(data, dict):
-                return [data]
+                result = [data]
             elif isinstance(data, list):
-                return data
+                result = data
             else:
                 LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
-                return []
+                result = []
+
+            # Cache the result
+            self._reaction_locations_cache.put(cache_key, result)
+            LOGGER.info("Cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+
+            return result
         except Exception as e:
             LOGGER.error("Failed to find reaction locations: %s", e)
             return []
@@ -686,13 +961,27 @@ Ignore locations that contain data for other campaigns.
         }
 
     def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        """Find the model reaction for a specific lineage group."""
+        """Find the model reaction for a specific lineage group.
+        Returns early if no relevant text is found to avoid unnecessary API calls."""
+
         # Gather relevant text near this location
         page_text = self._page_with_reference(location) or ""
 
+        # Early exit if no text found for this location
+        if not page_text or len(page_text.strip()) < 100:
+            LOGGER.info("No sufficient text found for location %s, skipping lineage-specific extraction", location)
+            return {}
+
         # Also check manuscript introduction for model reaction info
         intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
 
+        # Quick relevance check - look for reaction-related keywords
+        reaction_keywords = ["substrate", "product", "reaction", "compound", "synthesis", "procedure", "method"]
+        combined_text = (page_text + intro_text).lower()
+        if not any(keyword in combined_text for keyword in reaction_keywords):
+            LOGGER.info("No reaction-related keywords found for location %s, skipping lineage extraction", location)
+            return {}
+
         # Build the prompt with location and context
         prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
             location=location,
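The keyword gate is a cheap prefilter that spares a Gemini round-trip when the page is clearly off-topic. The predicate on its own (sample strings invented):

    reaction_keywords = ["substrate", "product", "reaction", "compound",
                         "synthesis", "procedure", "method"]

    def looks_reaction_related(text):
        low = text.lower()
        return any(k in low for k in reaction_keywords)

    print(looks_reaction_related("General procedure for the synthesis of 2a"))  # True
    print(looks_reaction_related("Strain growth and media composition"))        # False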
@@ -702,6 +991,7 @@ Ignore locations that contain data for other campaigns.
         prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
 
         # If we have model reaction locations, include text from those locations too
+        text_added = False
         if model_reaction_locations:
             # Add text from model reaction location
             if model_reaction_locations.get("model_reaction_location", {}).get("location"):
@@ -709,6 +999,7 @@ Ignore locations that contain data for other campaigns.
                 model_text = self._get_text_around_location(model_loc)
                 if model_text:
                     prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
+                    text_added = True
 
             # Add text from conditions location (often contains reaction details)
             if model_reaction_locations.get("conditions_location", {}).get("location"):
@@ -716,8 +1007,15 @@ Ignore locations that contain data for other campaigns.
                 cond_text = self._get_text_around_location(cond_loc)
                 if cond_text:
                     prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
+                    text_added = True
+
+        # If we didn't find any model reaction locations and the page text is sparse, skip
+        if not text_added and len(page_text.strip()) < 500:
+            LOGGER.info("Insufficient context for lineage model reaction extraction at %s", location)
+            return {}
 
         try:
+            LOGGER.info("Attempting lineage-specific model reaction extraction for %s", location)
             data = generate_json_with_retry(
                 self.model,
                 prompt,
@@ -725,7 +1023,15 @@ Ignore locations that contain data for other campaigns.
                 debug_dir=self.debug_dir,
                 tag=f"lineage_model_reaction_{location.replace(' ', '_')}"
             )
-            return data if isinstance(data, dict) else {}
+
+            # Validate the response has useful information
+            if isinstance(data, dict) and (data.get('substrate_ids') or data.get('product_ids')):
+                LOGGER.info("Lineage model reaction extraction successful for %s", location)
+                return data
+            else:
+                LOGGER.info("Lineage model reaction extraction returned empty results for %s", location)
+                return {}
+
         except Exception as e:
             LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
             return {}
@@ -777,67 +1083,174 @@ Ignore locations that contain data for other campaigns.
         If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""
 
+        # Check cache first
+        cache_key = f"{ref}_{extract_figure_only}"
+        cached_result = self._figure_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.debug("Using cached figure for %s", ref)
+            return cached_result
+
         # For table extraction, use multi-page approach
         if not extract_figure_only:
             pages_with_ref = self._find_pages_with_reference(ref)
             if pages_with_ref:
                 LOGGER.debug(f"Found {len(pages_with_ref)} pages containing {ref}")
-                return self._extract_multiple_pages_png(pages_with_ref)
+                return self._extract_multiple_pages_png(pages_with_ref, ref)
             return None
 
-        # For figure extraction, search both documents
+        # For figure extraction, search both documents for actual figure captions
         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
-                if ref.lower() not in page_text.lower():
-                    continue
-                # Get caption bbox
-                text_instances = page.search_for(ref, quads=False)
-                if not text_instances:
+
+                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
+                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
+                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                main_figure_num = re.match(r'^(\d+)', figure_num)
+                if main_figure_num:
+                    main_figure_num = main_figure_num.group(1)
+                else:
+                    main_figure_num = figure_num
+
+                caption_patterns = [
+                    rf"^Figure\s+{re.escape(main_figure_num)}\.",        # "Figure 1."
+                    rf"^Figure\s+{re.escape(main_figure_num)}:",         # "Figure 1:"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",      # "Figure 1" at end of line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",      # "Figure 1." anywhere in line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",       # "Figure 1:" anywhere in line
+                ]
+
+                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
+                             main_figure_num, ref, caption_patterns)
+
+                caption_found = False
+                cap_rect = None
+
+                for pattern in caption_patterns:
+                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                    if matches:
+                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
+                        # Found actual figure caption, get its position
+                        caption_text = matches.group(0)
+                        text_instances = page.search_for(caption_text, quads=False)
+                        if text_instances:
+                            cap_rect = text_instances[0]
+                            caption_found = True
+                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
+                            break
+
+                if not caption_found:
+                    # Debug: show what figure-related text is actually on this page
+                    figure_mentions = [line.strip() for line in page_text.split('\n')
+                                       if 'figure' in line.lower() and main_figure_num.lower() in line.lower()]
+                    if figure_mentions:
+                        LOGGER.debug("Page %d has figure mentions but no caption match: %s",
+                                     page_number, figure_mentions[:3])
                     continue
-                cap_rect = text_instances[0]  # first match
 
                 if extract_figure_only:
-                    # Sort images by y0 (top) coordinate ascending
-                    images = sorted(page.get_images(full=True), key=lambda im: im[7])
-                    # Find first image whose bottom y is **above** caption top y
-                    for img in images:
-                        xref = img[0]
-                        # Get image rectangles to find position
-                        img_rects = page.get_image_rects(xref)
-                        if img_rects:
-                            img_rect = img_rects[0]  # First rectangle
-                            if img_rect.y1 < cap_rect.y0:  # fully above caption
-                                # Extract image bytes
-                                pix = fitz.Pixmap(doc, xref)
-                                pix = self._ensure_rgb_pixmap(pix)
-                                img_bytes = pix.tobytes("png")
-                                return b64encode(img_bytes).decode()
+                    # Extract only the area above the caption (the actual figure)
+                    # This excludes caption text and focuses on visual elements
+                    LOGGER.info("Extracting figure area above caption for %s", ref)
+
+                    # Get the page dimensions
+                    page_rect = page.rect
+
+                    # Extract the area above the caption
+                    if cap_rect:
+                        # Extract from top of page to top of caption
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
+                        LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
+                    else:
+                        # If no caption found, use top 80% of page
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
+                        LOGGER.debug("No caption found, using top 80%% of page: %s", figure_rect)
+
+                    # Extract the figure area only
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat, clip=figure_rect)
+                    pix = self._ensure_rgb_pixmap(pix)
+                    img_bytes = pix.tobytes("png")
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"figure_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved figure page to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
                 else:
                     # Extract the entire page as an image
-                    mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
                     pix = page.get_pixmap(matrix=mat)
                     pix = self._ensure_rgb_pixmap(pix)
                     img_bytes = pix.tobytes("png")
-                    return b64encode(img_bytes).decode()
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved page image to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
         return None
 
     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
         """Find all pages containing the reference across documents.
+        Prioritizes pages with actual captions over just references.
         Returns list of (document, page_number) tuples."""
         pages_found = []
+        caption_pages = []
 
         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+
+                # Check for actual figure caption first
+                if ref.lower().startswith('figure'):
+                    figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                    # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                    main_figure_num = re.match(r'^(\d+)', figure_num)
+                    if main_figure_num:
+                        main_figure_num = main_figure_num.group(1)
+                    else:
+                        main_figure_num = figure_num
+
+                    caption_patterns = [
+                        rf"^Figure\s+{re.escape(main_figure_num)}\.",
+                        rf"^Figure\s+{re.escape(main_figure_num)}:",
+                        rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]"
+                    ]
+
+                    for pattern in caption_patterns:
+                        if re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE):
+                            caption_pages.append((doc, page_number))
+                            break
+
+                # Fallback to any mention of the reference
                 if ref.lower() in page_text.lower():
                     pages_found.append((doc, page_number))
-
-        return pages_found
+
+        # Return caption pages first, then other pages
+        return caption_pages + [p for p in pages_found if p not in caption_pages]
 
-    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
+    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]], ref: str = "unknown") -> Optional[str]:
         """Extract multiple pages as a combined PNG image."""
         if not pages:
             return None
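The subfigure-to-main-figure collapse is a leading-digits match, so "Figure 1C" resolves to the caption for "Figure 1". The helper logic in isolation (refs invented):

    import re

    def main_figure_number(ref):
        figure_num = ref.replace('Figure ', '').replace('figure ', '')
        m = re.match(r'^(\d+)', figure_num)
        return m.group(1) if m else figure_num

    print(main_figure_number("Figure 1C"))   # "1"
    print(main_figure_number("Figure 12"))   # "12"
    print(main_figure_number("Figure S3"))   # "S3" - no leading digit, falls through

SI-style refs such as "Figure S3" keep their full label, which the caption patterns then embed verbatim via re.escape.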
@@ -854,12 +1267,7 @@ Ignore locations that contain data for other campaigns.
             pix = self._ensure_rgb_pixmap(pix)
             all_images.append(pix)
 
-            # If this is the last page with the reference, also add the next page
-            if i == len(pages) - 1 and page_num + 1 < doc.page_count:
-                next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
-                next_pix = self._ensure_rgb_pixmap(next_pix)
-                all_images.append(next_pix)
-                LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
+            # Only extract the page containing the reference (removed next page logic)
 
         if not all_images:
             return None
@@ -867,7 +1275,17 @@ Ignore locations that contain data for other campaigns.
         # If only one page, return it directly
         if len(all_images) == 1:
             pix = self._ensure_rgb_pixmap(all_images[0])
-            return b64encode(pix.tobytes("png")).decode()
+            img_bytes = pix.tobytes("png")
+
+            # Save debug file if available
+            if self.debug_dir:
+                timestamp = int(time.time())
+                png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                with open(png_file, 'wb') as f:
+                    f.write(img_bytes)
+                LOGGER.info("Saved multi-page image to: %s", png_file)
+
+            return b64encode(img_bytes).decode()
 
         # Combine multiple pages vertically
         if not all_images:
@@ -914,7 +1332,7 @@ Ignore locations that contain data for other campaigns.
             y_offset += pix.height * scale
 
         # Convert the page to a pixmap
-        mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
+        mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for quality
         combined_pix = page.get_pixmap(matrix=mat)
         combined_pix = self._ensure_rgb_pixmap(combined_pix)
 
@@ -922,6 +1340,14 @@ Ignore locations that contain data for other campaigns.
         img_bytes = combined_pix.tobytes("png")
         output_doc.close()
 
+        # Save debug file if available
+        if self.debug_dir:
+            timestamp = int(time.time())
+            png_file = self.debug_dir / f"combined_pages_{ref.replace(' ', '_')}_{timestamp}.png"
+            with open(png_file, 'wb') as f:
+                f.write(img_bytes)
+            LOGGER.info("Saved combined multi-page image to: %s", png_file)
+
         return b64encode(img_bytes).decode()
 
     # ------------------------------------------------------------------
@@ -930,14 +1356,19 @@ Ignore locations that contain data for other campaigns.
 
     def _validate_location_exists(self, ref: str) -> bool:
         """Verify that the referenced location actually exists in the document."""
-        # Search for the actual reference in the document
-        for page_num in range(len(self.doc)):
-            page = self.doc[page_num]
-            text = page.get_text()
-
-            # Look for table references like "Table 1", "Table S1", etc.
-            if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
-                return True
+        # Search for the actual reference in both manuscript and SI documents
+        docs_to_check = [self.ms_doc]
+        if self.si_doc:
+            docs_to_check.append(self.si_doc)
+
+        for doc in docs_to_check:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+
+                # Look for table references like "Table 1", "Table S1", etc.
+                if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
+                    return True
 
         return False
 
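The word-boundary search is what keeps a lookup for one table from matching a longer label. The check on its own (page text invented):

    import re

    def location_mentioned(ref, page_texts):
        pattern = rf'\b{re.escape(ref)}\b'
        return any(re.search(pattern, t, re.IGNORECASE) for t in page_texts)

    pages = ["...values are listed in Table S12 below...", "unrelated page"]
    print(location_mentioned("Table S12", pages))  # True
    print(location_mentioned("Table S1", pages))   # False - \b blocks the prefix match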
@@ -1010,23 +1441,55 @@ Ignore locations that contain data for other campaigns.
                 LOGGER.debug("No page image found for %s - using full page text", ref)
                 snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
-            # For figures, extract just the figure image
+            # For figures, extract just the figure image (same logic as compound mapping)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
                 LOGGER.debug("No figure image found for %s - using caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+            else:
+                # If figure is found, ignore text information - use image only
+                snippet = ""
         else:
             snippet = self._page_with_reference(ref) or ""
 
-        # Validate context before sending to Gemini
-        if not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
+        # For figures with images, skip text validation and proceed with image extraction
+        if image_b64 and self._FIG_RE.search(ref_lc):
+            LOGGER.info("Using figure image for %s - ignoring text context", ref)
+        elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
             return []
 
-        enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
+        # Create enhanced enzyme descriptions with parent/mutation context
+        if hasattr(self, 'enzyme_df') and self.enzyme_df is not None:
+            enzyme_descriptions = []
+            for enzyme in enzyme_list:
+                # Find this enzyme in the dataframe
+                enzyme_row = None
+                if 'enzyme_id' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme_id'] == enzyme]
+                elif 'enzyme' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme'] == enzyme]
+
+                if enzyme_row is not None and len(enzyme_row) > 0:
+                    row = enzyme_row.iloc[0]
+                    parent = row.get('parent_enzyme_id', '')
+                    mutations = row.get('mutations', '')
+
+                    desc = f"- {enzyme}"
+                    if parent and str(parent).strip() and str(parent) != 'nan':
+                        desc += f" (parent: {parent})"
+                    if mutations and str(mutations).strip() and str(mutations) != 'nan':
+                        desc += f" (mutations: {mutations})"
+                    enzyme_descriptions.append(desc)
+                else:
+                    enzyme_descriptions.append(f"- {enzyme}")
+            enzyme_names = "\n".join(enzyme_descriptions)
+        else:
+            enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
 
         if image_b64:
             # Use batch extraction prompt for image analysis
-            prompt = campaign_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
+            location_context = f"\n\nIMPORTANT: You are extracting data from {ref}, which has been identified as the PRIMARY LOCATION containing the most reliable performance data for these enzymes.\n"
+            prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
         else:
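Enriching each enzyme name with its parent and mutations gives the vision model extra anchors when matching abbreviated X-axis labels. The same DataFrame lookup in isolation (toy rows; the real code reads the enzyme_id, parent_enzyme_id, and mutations columns):

    import pandas as pd

    enzyme_df = pd.DataFrame({
        'enzyme_id':        ['P411-CIS', 'P411-CIS-T438S'],
        'parent_enzyme_id': [None,       'P411-CIS'],
        'mutations':        [None,       'T438S'],
    })

    descriptions = []
    for enzyme in ['P411-CIS-T438S', 'unknown-variant']:
        rows = enzyme_df[enzyme_df['enzyme_id'] == enzyme]
        desc = f"- {enzyme}"
        if len(rows) > 0:
            row = rows.iloc[0]
            if pd.notna(row['parent_enzyme_id']):
                desc += f" (parent: {row['parent_enzyme_id']})"
            if pd.notna(row['mutations']):
                desc += f" (mutations: {row['mutations']})"
        descriptions.append(desc)

    print("\n".join(descriptions))
    # - P411-CIS-T438S (parent: P411-CIS) (mutations: T438S)
    # - unknown-variant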
@@ -1048,7 +1511,32 @@ Ignore locations that contain data for other campaigns.
 
         # Validate response has meaningful data
         if not self._validate_response(data, enzyme_list, ref):
-            return []
+            # If figure extraction failed and we have a figure, try falling back to text
+            if image_b64 and self._FIG_RE.search(ref_lc):
+                LOGGER.warning("Figure extraction from %s returned empty results - falling back to text", ref)
+                snippet = self._extract_figure_caption(ref)
+                if self._validate_context(snippet, enzyme_list, ref):
+                    # Retry with text extraction
+                    format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
+                    prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
+                    LOGGER.info("Gemini: retrying with text extraction for %d enzymes from %s…", len(enzyme_list), ref)
+
+                    data = generate_json_with_retry(
+                        self.model,
+                        prompt,
+                        temperature=self.cfg.extract_temperature,
+                        debug_dir=self.debug_dir,
+                        tag=f"extract_metrics_batch_text_fallback",
+                        image_b64=None
+                    )
+
+                    # Validate the text extraction response
+                    if not self._validate_response(data, enzyme_list, ref):
+                        return []
+                else:
+                    return []
+            else:
+                return []
 
         # Handle the response format - expecting a dict with enzyme names as keys
         results = []
@@ -1100,6 +1588,15 @@ Ignore locations that contain data for other campaigns.
 
     def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
         """Find locations for model reaction scheme, conditions, and IUPAC names."""
+        # Create cache key based on campaign filter and enzyme variants
+        cache_key = f"{self.campaign_filter}_{hash(tuple(sorted(enzyme_variants)) if enzyme_variants else ())}"
+
+        # Check cache first
+        cached_result = self._model_reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached model reaction locations for campaign: %s", self.campaign_filter)
+            return cached_result
+
         # Collect all text including section titles, captions, and schemes
         all_text = self._collect_captions_and_titles()
 
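One property of this key worth noting: Python's built-in hash() on strings is salted per process (PYTHONHASHSEED), so the key is stable within a run but not across runs, which is fine for an in-memory LRUCache but would break a persisted cache. A deterministic alternative along the lines of the module's hashlib usage (hypothetical helper, not part of the package):

    import hashlib

    def stable_cache_key(campaign_filter, enzyme_variants):
        variants = tuple(sorted(enzyme_variants)) if enzyme_variants else ()
        digest = hashlib.sha256("|".join(variants).encode()).hexdigest()[:16]
        return f"{campaign_filter}_{digest}"

    print(stable_cache_key("PYS", ["P411-CIS", "ApePgb LVQ"]))
    # same key in every process, unlike hash()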
@@ -1110,13 +1607,25 @@ Ignore locations that contain data for other campaigns.
         # Add enzyme context if provided
         enzyme_context = ""
         if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
             enzyme_context = f"""
 IMPORTANT CONTEXT:
 You are looking for the model reaction used specifically for these enzyme variants:
 {', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
 
 These variants belong to campaign: {self.campaign_filter}
-
+{campaigns_context}
 Focus on finding the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions.
 """
@@ -1134,6 +1643,11 @@ Different campaigns may use different model reactions.
             if not isinstance(data, dict):
                 LOGGER.error("Expected dict from Gemini, got: %s", type(data))
                 return None
+
+            # Cache the result
+            self._model_reaction_locations_cache.put(cache_key, data)
+            LOGGER.info("Cached model reaction locations for campaign: %s", self.campaign_filter)
+
             return data
         except Exception as e:
             LOGGER.error("Failed to find model reaction locations: %s", e)
@@ -1232,9 +1746,12 @@ Different campaigns may use different model reactions.
         extraction_text: str,
         compound_ids: List[str] = None,
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Helper function to extract compound mappings from provided text."""
         prompt = PROMPT_COMPOUND_MAPPING
+        if campaign_filter:
+            prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
         if compound_ids:
             prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
         prompt += "\n\nTEXT:\n" + extraction_text
@@ -1282,6 +1799,7 @@ Different campaigns may use different model reactions.
         compound_ids: List[str],
         figure_images: Dict[str, str],
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound mappings using multimodal approach with figures."""
         # Enhanced prompt for figure-based extraction
@@ -1293,8 +1811,26 @@ Use your best knowledge, Look carefully in:
 1. The chemical structures shown in figures - infer IUPAC names from drawn structures
 2. Figure captions that may define compounds
 3. Text that refers to these compound numbers
-4. Reaction schemes showing transformations
+4. Reaction schemes showing transformations"""
+
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            prompt += f"""
+
+IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system.
+{campaigns_warning}
+Different campaigns may use different numbering systems for compounds.
+Do NOT include compound information from other campaigns."""
 
+        prompt += """
 
 IMPORTANT:
 - Only provide IUPAC names you can determine from the figures or text
@@ -1324,9 +1860,10 @@ TEXT FROM MANUSCRIPT:
         for fig_ref, fig_base64 in figure_images.items():
             try:
                 img_bytes = b64decode(fig_base64)
-                image = PIL.Image.open(io.BytesIO(img_bytes))
+                # Format image for Gemini API
+                image_part = {"mime_type": "image/png", "data": img_bytes}
                 content_parts.append(f"\n[Figure: {fig_ref}]")
-                content_parts.append(image)
+                content_parts.append(image_part)
                 LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
             except Exception as e:
                 LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)
@@ -1356,6 +1893,21 @@ TEXT FROM MANUSCRIPT:
 
         # Make multimodal API call
         response = self.model.generate_content(content_parts)
+
+        # Track token usage if available
+        try:
+            if hasattr(response, 'usage_metadata'):
+                input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                if input_tokens or output_tokens:
+                    try:
+                        from .wrapper import add_token_usage
+                        add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                    except ImportError:
+                        pass  # wrapper not available
+        except Exception:
+            pass  # token tracking is best-effort
+
         raw_text = response.text.strip()
 
         # Log response
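
The token-tracking block above (and its twin later in this file) is deliberately fail-open: every attribute lookup is guarded so accounting can never break extraction. A sketch of the same logic factored into a helper; the function name is hypothetical, and the usage_metadata attributes are exactly the ones the diff itself reads:

def track_tokens(response, component: str = "reaction_info_extractor") -> None:
    """Best-effort token accounting; never raises."""
    try:
        meta = getattr(response, "usage_metadata", None)
        if meta is None:
            return
        input_tokens = getattr(meta, "prompt_token_count", 0)
        output_tokens = getattr(meta, "candidates_token_count", 0)
        if not (input_tokens or output_tokens):
            return
        try:
            from .wrapper import add_token_usage  # optional, package-relative
        except ImportError:
            return                                # wrapper not available
        add_token_usage(component, input_tokens, output_tokens)
    except Exception:
        pass                                      # tracking is best-effort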
@@ -1402,18 +1954,37 @@ TEXT FROM MANUSCRIPT:
         self,
         compound_ids: List[str],
         initial_sections: List[str] = None,
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
-        """Extract compound ID to IUPAC name mappings using adaptive 3-tier strategy.
+        """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
-        1. First attempts extraction from standard sections
-        2. Expands search to additional sections if compounds are missing
-        3. Uses multimodal figure analysis as final fallback
+        1. First attempts extraction from specific SI sections + 10 manuscript pages
+        2. If compounds missing, uses full manuscript + SI with multimodal figure analysis
         """
         if not compound_ids:
             return {}
 
-        LOGGER.info("Starting adaptive compound mapping for %d compounds: %s",
-                    len(compound_ids), sorted(compound_ids))
+        # Check cache first - return cached results for compounds we've already processed
+        cached_mappings = {}
+        uncached_compound_ids = []
+
+        for cid in compound_ids:
+            # Include campaign filter in cache key to prevent cross-campaign contamination
+            cache_key = f"{campaign_filter}_{cid.lower().strip()}" if campaign_filter else cid.lower().strip()
+            cached_mapping = self._compound_mapping_cache.get(cache_key)
+            if cached_mapping is not None:
+                cached_mappings[cid.lower().strip()] = cached_mapping
+                LOGGER.info("Using cached compound mapping for: %s (campaign: %s)", cid, campaign_filter)
+            else:
+                uncached_compound_ids.append(cid)
+
+        # If all compounds are cached, return immediately
+        if not uncached_compound_ids:
+            LOGGER.info("All %d compounds found in cache, skipping API calls", len(compound_ids))
+            return cached_mappings
+
+        LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
+                    len(uncached_compound_ids), sorted(uncached_compound_ids))
 
         # Tier 1: Standard sections (manuscript + initial SI sections)
         initial_sections = initial_sections or [
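
The cache probe above keys each compound by campaign, so an ID like "1a" can resolve to different IUPAC names in different campaigns without collisions. A compact sketch of just the partition step, under a hypothetical helper name:

from typing import Dict, List, Optional, Tuple

def split_cached(cache, compound_ids: List[str],
                 campaign_filter: Optional[str] = None) -> Tuple[Dict[str, object], List[str]]:
    """Return (cached_mappings, uncached_ids) using campaign-prefixed keys."""
    cached: Dict[str, object] = {}
    uncached: List[str] = []
    for cid in compound_ids:
        norm = cid.lower().strip()
        key = f"{campaign_filter}_{norm}" if campaign_filter else norm
        hit = cache.get(key)
        if hit is not None:
            cached[norm] = hit       # normalized ID, as in the method above
        else:
            uncached.append(cid)     # keep original casing for the API prompt
    return cached, uncached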
@@ -1424,118 +1995,118 @@ TEXT FROM MANUSCRIPT:
         # Include manuscript pages (first 10) for model reaction context
         manuscript_text = "\n\n".join(self.ms_pages[:10])
 
+        # Add campaign context if provided
+        campaign_context = ""
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            campaign_context = f"""
+
+IMPORTANT CAMPAIGN CONTEXT:
+You are extracting compound information specifically for the {campaign_filter} campaign.
+{campaigns_warning}
+Focus ONLY on compound information relevant to the {campaign_filter} campaign/reaction system.
+Do NOT include compound information from other campaigns.
+
+"""
+
         # Extract from initial sections
         extraction_text = self._extract_sections_by_title(initial_sections)
         if extraction_text:
-            extraction_text = manuscript_text + "\n\n" + extraction_text
+            extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
-            extraction_text = manuscript_text
+            extraction_text = manuscript_text + campaign_context
 
-        # First extraction attempt
+        # First extraction attempt - only for uncached compounds
         mappings = self._extract_compound_mappings_from_text(
-            extraction_text[:50000], compound_ids, tag_suffix="initial"
+            extraction_text[:50000], uncached_compound_ids, tag_suffix="initial", campaign_filter=campaign_filter
        )
         LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))
 
         # Check for missing compounds
         missing_compounds = []
-        for cid in compound_ids:
+        for cid in uncached_compound_ids:
             mapping = mappings.get(cid.lower().strip())
             if not mapping or not mapping.iupac_name:
                 missing_compounds.append(cid)
 
-        # Tier 2: Expanded search + multimodal with figures
+        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
                         len(missing_compounds), sorted(missing_compounds))
 
-            # Additional sections to search
-            additional_sections = [
-                "Engineering strategy", "Evolution campaign",
-                "Screening", "Optimization", "Substrate synthesis",
-                "Supporting Information", "Supplementary Methods"
-            ]
-
-            # Extract from additional sections
-            additional_text = self._extract_sections_by_title(additional_sections)
-
-            # Also extract any figures that might contain compound structures
+            # Get all available figures for compound structure analysis
             figure_images = {}
-            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Scheme 1", "Scheme 2"]
+
+            # Extract main manuscript figures
+            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
             for ref in figure_refs:
                 img_b64 = self._extract_page_png(ref, extract_figure_only=True)
                 if img_b64:
                     figure_images[ref] = img_b64
-                    LOGGER.info("Extracted %s for compound mapping", ref)
-
-            # Try multimodal approach with figures and expanded text
-            if figure_images or additional_text:
-                combined_text = additional_text[:30000] if additional_text else ""
-                expanded_mappings = self._extract_compound_mappings_with_figures(
-                    combined_text, missing_compounds, figure_images, tag_suffix="tier2"
-                )
-
-                # Merge new mappings
-                new_found = 0
-                for key, mapping in expanded_mappings.items():
-                    if key not in mappings or not mappings[key].iupac_name:
-                        if mapping.iupac_name:
-                            mappings[key] = mapping
-                            new_found += 1
-                            LOGGER.info("Found IUPAC name for '%s': %s",
-                                        key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
-
-                LOGGER.info("Tier 2: Found %d additional compound mappings", new_found)
-
-            # Check again for still missing compounds
-            still_missing = []
-            for cid in missing_compounds:
-                mapping = mappings.get(cid.lower().strip())
-                if not mapping or not mapping.iupac_name:
-                    still_missing.append(cid)
-
-            # Tier 3: Full manuscript search with all available figures
-            if still_missing:
-                LOGGER.info("Tier 3: %d compounds still missing, trying full manuscript search",
-                            len(still_missing))
-
-                # Get all SI figures
-                si_figure_refs = []
-                for page in self.si_pages[:5]:  # Check first 5 SI pages
-                    matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
-                    si_figure_refs.extend(matches[:5])  # Limit to 5 figures
+                    LOGGER.info("Retrieved %s for compound mapping", ref)
 
-                # Extract SI figures
-                for ref in set(si_figure_refs):
-                    if ref not in figure_images:
-                        img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                        if img_b64:
-                            figure_images[ref] = img_b64
-                            LOGGER.info("Extracted %s for final compound mapping", ref)
-
-                # Full text search including all pages
-                full_text = "\n\n".join(self.all_pages[:30])  # First 30 pages
-
-                final_mappings = self._extract_compound_mappings_with_figures(
-                    full_text[:50000], still_missing, figure_images, tag_suffix="tier3"
-                )
-
-                # Merge final mappings
-                final_found = 0
-                for key, mapping in final_mappings.items():
-                    if key not in mappings or not mappings[key].iupac_name:
-                        if mapping.iupac_name:
-                            mappings[key] = mapping
-                            final_found += 1
-                            LOGGER.info("Found IUPAC name for '%s' in final search: %s",
-                                        key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
+            # Get SI figures
+            si_figure_refs = []
+            for page in self.si_pages[:10]:  # Check first 10 SI pages
+                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
+                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
+
+            # Extract SI figures
+            for ref in set(si_figure_refs):
+                if ref not in figure_images:
+                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
+                    if img_b64:
+                        figure_images[ref] = img_b64
+                        LOGGER.info("Extracted %s for compound mapping", ref)
+
+            # Full text search including all pages
+            full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+
+            final_mappings = self._extract_compound_mappings_with_figures(
+                full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            )
+
+            # Merge final mappings with better compound ID matching
+            final_found = 0
+            for key, mapping in final_mappings.items():
+                if key not in mappings or not mappings[key].iupac_name:
+                    if mapping.iupac_name:
+                        mappings[key] = mapping
+                        final_found += 1
+                        iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                        LOGGER.info("Found IUPAC name for '%s' in full search: %s", key, iupac_display)
+
+            LOGGER.info("Tier 2: Found %d additional compound mappings", final_found)
+
+        # Cache all newly found mappings using campaign-aware cache key
+        for key, mapping in mappings.items():
+            cache_key = f"{campaign_filter}_{key}" if campaign_filter else key
+            if self._compound_mapping_cache.get(cache_key) is None:
+                self._compound_mapping_cache.put(cache_key, mapping)
+                iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                LOGGER.info("Cached compound mapping for: %s -> %s (campaign: %s)", key, iupac_display, campaign_filter)
 
-                LOGGER.info("Tier 3: Found %d additional compound mappings", final_found)
+                # Also cache without campaign prefix for backward compatibility during integration
+                if campaign_filter:
+                    self._compound_mapping_cache.put(key, mapping)
+
+        # Combine cached and new mappings
+        final_mappings = cached_mappings.copy()
+        final_mappings.update(mappings)
 
-        LOGGER.info("Adaptive compound mapping complete: %d total mappings", len(mappings))
-        return mappings
+        LOGGER.info("Adaptive compound mapping complete: %d total mappings (%d cached, %d new)",
+                    len(final_mappings), len(cached_mappings), len(mappings))
+        return final_mappings
 
-    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
+    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None, lineage_compound_ids: Optional[Dict[str, List[str]]] = None) -> Dict[str, Any]:
         """Extract model reaction information using identified locations and 3-tier compound mapping."""
         # First find the best locations
         locations = self.find_model_reaction_locations(enzyme_variants)
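
Seen as control flow, the rewritten method is now a two-step fallback: one cheap text-only pass over the standard sections, then a single full-document multimodal pass for whatever is still unnamed. A schematic sketch under hypothetical helper names (the real code additionally threads campaign_filter and the caches through both passes):

def two_tier_mapping(extract_from_text, extract_with_figures, compound_ids):
    def unresolved(ids, mappings):
        return [c for c in ids
                if not getattr(mappings.get(c.lower().strip()), "iupac_name", None)]

    mappings = extract_from_text(compound_ids)           # Tier 1: sections + 10 pages
    missing = unresolved(compound_ids, mappings)
    if missing:                                          # Tier 2: full text + figures
        for key, m in extract_with_figures(missing).items():
            if m.iupac_name and not getattr(mappings.get(key), "iupac_name", None):
                mappings[key] = m
    return mappings

Collapsing the old tier-2/tier-3 distinction trades some prompt economy for fewer API round trips: the expensive multimodal call happens at most once per batch.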
@@ -1658,9 +2229,15 @@ TEXT FROM MANUSCRIPT:
                     figure_images[fig_ref] = img_b64
                     LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)
 
-        # Extract compound IDs from locations
+        # Extract compound IDs from locations or use lineage-specific ones
         compound_ids = []
-        if locations and locations.get("model_reaction_location", {}).get("compound_ids"):
+        if lineage_compound_ids:
+            # Use lineage-specific compound IDs if provided
+            substrate_ids = lineage_compound_ids.get("substrate_ids", [])
+            product_ids = lineage_compound_ids.get("product_ids", [])
+            compound_ids = substrate_ids + product_ids
+            LOGGER.info("Using lineage-specific compound IDs: %s", compound_ids)
+        elif locations and locations.get("model_reaction_location", {}).get("compound_ids"):
             compound_ids = locations["model_reaction_location"]["compound_ids"]
             LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)
 
@@ -1668,7 +2245,7 @@ TEXT FROM MANUSCRIPT:
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)
+            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
@@ -1679,8 +2256,35 @@ TEXT FROM MANUSCRIPT:
                     mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
                 text_context += mapping_text
 
+        # Add campaign context if available
+        campaign_context = ""
+        if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
+            campaign_context = f"""
+IMPORTANT CONTEXT:
+You are extracting the model reaction used specifically for these enzyme variants:
+{', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
+
+These variants belong to campaign: {self.campaign_filter}
+{campaigns_context}
+Focus on extracting the model reaction that was used to evaluate THESE specific variants.
+Different campaigns may use different model reactions and substrates.
+
+"""
+
         # Include both manuscript and SI text for better coverage
-        prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
+        prompt = campaign_context + PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
 
         try:
             # Use multimodal extraction if we have figure images
@@ -1693,9 +2297,10 @@ TEXT FROM MANUSCRIPT:
             for fig_ref, fig_base64 in figure_images.items():
                 try:
                     img_bytes = b64decode(fig_base64)
-                    image = PIL.Image.open(io.BytesIO(img_bytes))
+                    # Format image for Gemini API
+                    image_part = {"mime_type": "image/png", "data": img_bytes}
                     content_parts.append(f"\n[Figure: {fig_ref}]")
-                    content_parts.append(image)
+                    content_parts.append(image_part)
                 except Exception as e:
                     LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)
 
@@ -1714,6 +2319,20 @@ TEXT FROM MANUSCRIPT:
 
             response = model.generate_content(content_parts)
 
+            # Track token usage if available
+            try:
+                if hasattr(response, 'usage_metadata'):
+                    input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
             # Parse JSON from response
             if response and response.text:
                 # Save debug output
@@ -1754,32 +2373,59 @@ TEXT FROM MANUSCRIPT:
             if isinstance(data, dict):
                 # If we have compound mappings, enhance the IUPAC names
                 if compound_ids and compound_mappings:
+                    LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
+                                list(compound_mappings.keys()))
+
                     # Try to map substrate/product lists through compound IDs
-                    substrate_list = data.get("substrate_iupac_list", [])
+                    substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                     if isinstance(substrate_list, list):
                         enhanced_substrates = []
                         for item in substrate_list:
+                            item_str = str(item).lower().strip()
                             # Check if it's a compound ID that we can map
-                            mapping = compound_mappings.get(str(item).lower().strip())
+                            mapping = compound_mappings.get(item_str)
                             if mapping and mapping.iupac_name:
                                 enhanced_substrates.append(mapping.iupac_name)
+                                LOGGER.info("Mapped substrate '%s' -> '%s'", item, mapping.iupac_name)
                             elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                # Keep valid IUPAC names
+                                # Keep valid IUPAC names that aren't compound IDs
                                 enhanced_substrates.append(str(item))
+                                LOGGER.info("Kept substrate IUPAC name: '%s'", item)
+                            else:
+                                LOGGER.warning("Could not map substrate compound ID '%s'", item)
                         data["substrate_iupac_list"] = enhanced_substrates
 
-                    product_list = data.get("product_iupac_list", [])
+                    product_list = data.get("product_iupac_list", []) or data.get("product_list", [])
                     if isinstance(product_list, list):
                         enhanced_products = []
                         for item in product_list:
+                            item_str = str(item).lower().strip()
                             # Check if it's a compound ID that we can map
-                            mapping = compound_mappings.get(str(item).lower().strip())
+                            mapping = compound_mappings.get(item_str)
                             if mapping and mapping.iupac_name:
                                 enhanced_products.append(mapping.iupac_name)
+                                LOGGER.info("Mapped product '%s' -> '%s'", item, mapping.iupac_name)
                             elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                # Keep valid IUPAC names
+                                # Keep valid IUPAC names that aren't compound IDs
                                 enhanced_products.append(str(item))
+                                LOGGER.info("Kept product IUPAC name: '%s'", item)
+                            else:
+                                LOGGER.warning("Could not map product compound ID '%s'", item)
                         data["product_iupac_list"] = enhanced_products
+
+                    # Also try to enhance using both substrate_list and product_list if they contain compound IDs
+                    for list_key, target_key in [("substrate_list", "substrate_iupac_list"), ("product_list", "product_iupac_list")]:
+                        if list_key in data and isinstance(data[list_key], list):
+                            if target_key not in data or not data[target_key]:
+                                enhanced_list = []
+                                for item in data[list_key]:
+                                    item_str = str(item).lower().strip()
+                                    mapping = compound_mappings.get(item_str)
+                                    if mapping and mapping.iupac_name:
+                                        enhanced_list.append(mapping.iupac_name)
+                                        LOGGER.info("Enhanced %s: mapped '%s' -> '%s'", target_key, item, mapping.iupac_name)
+                                if enhanced_list:
+                                    data[target_key] = enhanced_list
 
                 # Validate and convert arrays to semicolon-separated strings for CSV compatibility
                 if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
@@ -1826,8 +2472,8 @@ TEXT FROM MANUSCRIPT:
         return data
 
     def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
-        """Process a single lineage case - still extract based on location."""
-        # Even for single lineage, use location-based extraction
+        """Process a single lineage case - use confidence-based processing."""
+        # Create lineage analysis for single location
         lineage_analysis = {
             'has_multiple_lineages': False,
             'lineage_groups': [{
@@ -1839,7 +2485,7 @@ TEXT FROM MANUSCRIPT:
             }]
         }
 
-        return self._process_multiple_lineages([location], enzyme_df, lineage_analysis)
+        return self._process_multiple_lineages_by_confidence([location], enzyme_df, lineage_analysis)
 
     def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
                                                  enzyme_df: pd.DataFrame,
@@ -1854,186 +2500,69 @@ TEXT FROM MANUSCRIPT:
         # If enzyme_df has campaign_id column, we can use it to filter
         has_campaign_info = 'campaign_id' in enzyme_df.columns
 
-        # Process locations in order of confidence
-        for location in locations:
-            if len(variants_with_data) >= len(all_variants):
-                LOGGER.info("All variants have data, stopping extraction")
-                break
-
-            LOGGER.info("\nProcessing location %s (confidence: %d%%)",
-                        location['location'], location.get('confidence', 0))
+        # Select the most confident source only
+        best_location = None
+        if locations:
+            # Sort by confidence only
+            locations_sorted = sorted(locations, key=lambda x: -x.get('confidence', 0))
+            best_location = locations_sorted[0]
+
+            LOGGER.info("Selected primary location: %s (type: %s, confidence: %d%%)",
+                        best_location['location'],
+                        best_location.get('type', 'unknown'),
+                        best_location.get('confidence', 0))
 
-            # Extract metrics from this location for ALL enzymes
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, location['location'])
+            # Extract metrics from the most confident source only
+            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
 
             # Filter to valid metrics
             valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
 
             if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s", location['location'])
-                continue
+                LOGGER.warning("No valid metrics found in primary location %s", best_location['location'])
+                return pd.DataFrame()
 
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), location['location'])
+            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), best_location['location'])
 
-            # Create DataFrame for this location
+            # Create DataFrame for the single best location
             df_location = pd.DataFrame(valid_metrics)
 
-            # Track which variants we got data for
-            new_variants = set(df_location['enzyme'].tolist()) - variants_with_data
-            LOGGER.info("Found data for %d new variants in %s", len(new_variants), location['location'])
-            variants_with_data.update(new_variants)
+            # Add metadata about the location
+            df_location['data_location'] = best_location['location']
+            df_location['confidence'] = best_location.get('confidence', 0)
 
-            # Determine which campaign/lineage this location represents
-            # by checking which variants are present
-            location_variants = set(df_location['enzyme'].tolist())
+            LOGGER.info("Successfully extracted data for %d enzymes from primary location", len(df_location))
 
-            # If we have campaign info, determine the campaign for this location
-            campaign_id = None
-            if has_campaign_info:
-                # Find which campaign(s) these variants belong to
-                if 'enzyme_id' in enzyme_df.columns:
-                    variant_campaigns = enzyme_df[enzyme_df['enzyme_id'].isin(location_variants)]['campaign_id'].unique()
-                else:
-                    variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
-                if len(variant_campaigns) == 1:
-                    campaign_id = variant_campaigns[0]
-                    LOGGER.info("Location %s contains variants from campaign: %s",
-                                location['location'], campaign_id)
-                elif len(variant_campaigns) > 1:
-                    LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
-                                   location['location'], variant_campaigns)
-
-            # Extract model reaction specific to this location/campaign
-            location_context = f"Location: {location['location']}"
-            if location.get('caption'):
-                location_context += f"\nCaption: {location['caption']}"
-
-            # First find model reaction locations for this campaign/enzyme group
+            # Extract model reaction info once for this location
+            location_context = f"Location: {best_location['location']}"
+            if best_location.get('caption'):
+                location_context += f"\nCaption: {best_location['caption']}"
+
+            # Get enzyme list for model reaction
             location_enzymes = df_location['enzyme'].unique().tolist()
+            # Get model reaction locations for this campaign
             model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
 
-            # Try to find model reaction for this specific lineage, passing the locations
-            location_model_reaction = self.find_lineage_model_reaction(
-                location['location'],
-                location_context,
-                model_reaction_locations
-            )
-
-            # Get full model reaction info with IUPAC names
-            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-                model_info = self._extract_lineage_model_info(location_model_reaction)
-            else:
-                # Fall back to general model reaction extraction
-                # Pass the enzyme variants from this location
-                model_info = self.gather_model_reaction_info(location_enzymes)
-
-            # Add model reaction info to all enzymes from this location
-            for key, value in model_info.items():
-                if isinstance(value, list):
-                    value = "; ".join(str(v) for v in value) if value else None
-                df_location[key] = value
-
-            # Add location and campaign info
-            df_location['data_location'] = location['location']
-            df_location['location_type'] = location.get('type', 'unknown')
-            df_location['location_confidence'] = location.get('confidence', 0)
-            # Remove lineage_group column - not working properly
-            # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')
-
-            all_results.append(df_location)
+            # Extract model reaction for this location - use unified approach
+            LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
 
-            # Log progress
-            LOGGER.info("Progress: %d/%d variants have data",
-                        len(variants_with_data), len(all_variants))
-
-        if all_results:
-            # Combine all results
-            df_combined = pd.concat(all_results, ignore_index=True)
-
-            # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
-            if df_combined.duplicated(subset=['enzyme']).any():
-                LOGGER.info("Removing duplicates, keeping highest confidence data")
-                df_combined = df_combined.sort_values(
-                    ['enzyme', 'location_confidence'],
-                    ascending=[True, False]
-                ).drop_duplicates(subset=['enzyme'], keep='first')
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d unique variants from %d locations",
-                        len(df_combined), len(all_results))
-
-            if 'data_location' in df_combined.columns:
-                for location in df_combined['data_location'].unique():
-                    location_enzymes = df_combined[df_combined['data_location'] == location]
-                    LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))
-
-            return df_combined
-        else:
-            LOGGER.warning("No metrics extracted from any location")
-            return pd.DataFrame()
-
-    def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
-                                   enzyme_df: pd.DataFrame,
-                                   lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
-        """Process multiple lineages where each location represents a different model reaction."""
-        all_metrics = []
-        lineage_groups = lineage_analysis.get('lineage_groups', [])
-
-        # Get all enzyme IDs for extraction attempts
-        all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []
-
-        for group in lineage_groups:
-            group_location = group.get('data_location')
-            group_id = group.get('group_id')
-
-            # Find the location info
-            location_info = next((loc for loc in locations if loc['location'] == group_location), None)
-            if not location_info:
-                LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
-                continue
-
-            LOGGER.info("Processing location %s (%s)", group_location, group_id)
-
-            # Extract metrics from this location for ALL enzymes
-            # The extractor will return only those that actually have data
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)
-
-            # Filter to enzymes that actually had data in this location
-            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
-
-            if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s", group_location)
-                continue
-
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)
-
-            # Create DataFrame for this location
-            df_location = pd.DataFrame(valid_metrics)
-
-            # Extract model reaction specific to this location
-            # Different locations = different model reactions
-            location_context = f"Location: {group_location}"
-            if group.get('caption'):
-                location_context += f"\nCaption: {group['caption']}"
-
-            # First find model reaction locations for this enzyme group
-            location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
-            model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
-
-            # Try to find model reaction for this specific lineage, passing the locations
+            # Try lineage-specific extraction first
             location_model_reaction = self.find_lineage_model_reaction(
-                group_location,
+                best_location['location'],
                 location_context,
                 model_reaction_locations
             )
 
-            # Get full model reaction info with IUPAC names
+            # Check if lineage extraction was successful
             if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-                model_info = self._extract_lineage_model_info(location_model_reaction)
+                LOGGER.info("Using lineage-specific model reaction data")
+                model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
             else:
-                # Try to extract model reaction from this specific location
-                # Pass the enzyme variants that have data in this location
+                LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
+                # Use the comprehensive multimodal approach as fallback
                 model_info = self.gather_model_reaction_info(location_enzymes)
+
+            LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
 
             # Add model reaction info to all enzymes from this location
             for key, value in model_info.items():
@@ -2041,30 +2570,18 @@ TEXT FROM MANUSCRIPT:
                     value = "; ".join(str(v) for v in value) if value else None
                 df_location[key] = value
 
-            # Add location identifier
-            df_location['data_location'] = group_location
-            # Remove lineage_group column - not working properly
-            # df_location['lineage_group'] = group.get('lineage_hint', group_id)
-
-            all_metrics.append(df_location)
-
-        if all_metrics:
-            # Combine all metrics
-            df_combined = pd.concat(all_metrics, ignore_index=True)
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d total enzymes from %d locations",
-                        len(df_combined), len(all_metrics))
+            # Add additional location metadata (data_location already set above)
+            df_location['location_type'] = best_location.get('type', 'unknown')
+            df_location['location_confidence'] = best_location.get('confidence', 0)
 
-            if 'data_location' in df_combined.columns:
-                for location in df_combined['data_location'].unique():
-                    location_enzymes = df_combined[df_combined['data_location'] == location]
-                    LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))
+            LOGGER.info("Extraction complete: %d variants from primary location %s",
+                        len(df_location), best_location['location'])
 
-            return df_combined
-        else:
-            LOGGER.warning("No metrics extracted from any location")
-            return pd.DataFrame()
+            return df_location
+
+        # No locations found
+        LOGGER.warning("No valid locations found for extraction")
+        return pd.DataFrame()
 
     def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
         """Check if a metrics row contains any valid performance data."""
@@ -2151,14 +2668,18 @@ TEXT FROM MANUSCRIPT:
 
         return filtered
 
-    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
+    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any], enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
         """Extract full model reaction info including IUPAC names for a lineage."""
         # Get substrate/product IDs from lineage-specific extraction
         substrate_ids = lineage_reaction.get('substrate_ids', [])
         product_ids = lineage_reaction.get('product_ids', [])
 
-        # Get general model reaction info for conditions
-        general_info = self.gather_model_reaction_info()
+        # Get general model reaction info for conditions, using lineage-specific compound IDs
+        lineage_ids = {
+            "substrate_ids": substrate_ids,
+            "product_ids": product_ids
+        }
+        general_info = self.gather_model_reaction_info(enzyme_variants, lineage_compound_ids=lineage_ids)
 
         # Override substrate/product lists with lineage-specific ones only if they contain actual compound IDs
         model_info = general_info.copy()
@@ -2304,6 +2825,9 @@ TEXT FROM MANUSCRIPT:
             LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
             return pd.DataFrame()
 
+        # Store enzyme_df for use in extract_metrics_batch
+        self.enzyme_df = enzyme_df
+
         # Check if we have campaign_id column - if so, process each campaign separately
         if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
             campaigns = enzyme_df['campaign_id'].unique()
@@ -2322,15 +2846,16 @@ TEXT FROM MANUSCRIPT:
                     si=self.si,
                     cfg=self.cfg,
                     debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
-                    campaign_filter=campaign_id
+                    campaign_filter=campaign_id,
+                    all_campaigns=campaigns.tolist()
                 )
 
                 # Run extraction for this campaign
                 campaign_df = campaign_extractor.run(enzyme_df)
 
                 if not campaign_df.empty:
-                    # Add campaign identifier
-                    campaign_df['campaign_id'] = campaign_id
+                    # Add a temporary campaign identifier for merging
+                    campaign_df['_extraction_campaign'] = campaign_id
                     all_campaign_results.append(campaign_df)
                     LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)
 
@@ -2401,7 +2926,7 @@ TEXT FROM MANUSCRIPT:
 def merge_with_lineage_data(
     df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
 ) -> pd.DataFrame:
-    """Outer-merge on 'enzyme' column. Left CSV defines desired row order."""
+    """Merge lineage and metrics data ensuring one-to-one mapping per campaign."""
 
     # Handle both 'enzyme' and 'enzyme_id' column names
     if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
@@ -2410,7 +2935,19 @@ def merge_with_lineage_data(
     if "enzyme" not in df_lineage.columns:
         raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")
 
-    merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
+    # Check if we have campaign information to match on
+    if "campaign_id" in df_lineage.columns and "_extraction_campaign" in df_metrics.columns:
+        # Match on both enzyme and campaign to ensure correct pairing
+        df_metrics_temp = df_metrics.copy()
+        df_metrics_temp['campaign_id'] = df_metrics_temp['_extraction_campaign']
+        df_metrics_temp = df_metrics_temp.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
+    else:
+        # Simple merge on enzyme only
+        if "_extraction_campaign" in df_metrics.columns:
+            df_metrics = df_metrics.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
+
     return merged
 
 ###############################################################################
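
A tiny worked example of why the campaign-aware branch matters: when the same variant was screened in two campaigns, merging on enzyme alone cross-joins the rows, while merging on (enzyme, campaign_id) keeps each campaign's metrics with its own lineage row. The column values here are made up for illustration:

import pandas as pd

df_lineage = pd.DataFrame({"enzyme": ["P1-A10", "P1-A10"],
                           "campaign_id": ["campA", "campB"]})
df_metrics = pd.DataFrame({"enzyme": ["P1-A10", "P1-A10"],
                           "_extraction_campaign": ["campA", "campB"],
                           "yield": [42.0, 88.0]})

# Rename the temporary marker column and merge on both keys.
df_metrics = df_metrics.rename(columns={"_extraction_campaign": "campaign_id"})
merged = df_lineage.merge(df_metrics, on=["enzyme", "campaign_id"], how="left")
print(len(merged))   # 2 rows; a merge on "enzyme" alone would have produced 4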
@@ -2439,25 +2976,103 @@ def main() -> None:
     if args.verbose:
         LOGGER.setLevel(logging.DEBUG)
     cfg = Config()
-    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)
 
-    # Load enzyme data from CSV if provided
+    # Load enzyme data from CSV if provided to detect campaign information
     enzyme_df = None
+    campaign_filter = None
+    all_campaigns = None
+
     if args.lineage_csv and args.lineage_csv.exists():
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
-
-        # Run extraction with enzyme data
-        df_metrics = extractor.run(enzyme_df)
+
+        # Detect campaign information from the enzyme CSV
+        if 'campaign_id' in enzyme_df.columns:
+            all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
+            if len(all_campaigns) == 1:
+                campaign_filter = all_campaigns[0]
+                LOGGER.info("Detected single campaign: %s", campaign_filter)
+
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                              campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+                df_metrics = extractor.run(enzyme_df)
+
+            elif len(all_campaigns) > 1:
+                LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
+                all_results = []
+
+                # Process each campaign separately
+                for campaign in all_campaigns:
+                    LOGGER.info("Processing campaign: %s", campaign)
+
+                    # Filter enzyme_df to this campaign
+                    campaign_df = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                    LOGGER.info("Found %d enzymes for campaign %s", len(campaign_df), campaign)
+
+                    if len(campaign_df) == 0:
+                        LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
+                        continue
+
+                    # Create extractor for this campaign
+                    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                                  campaign_filter=campaign, all_campaigns=all_campaigns)
+
+                    # Run extraction for this campaign
+                    campaign_metrics = extractor.run(campaign_df)
+
+                    if not campaign_metrics.empty:
+                        # Merge with lineage data for this campaign
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if "enzyme_id" in campaign_lineage.columns and "enzyme" not in campaign_lineage.columns:
+                            campaign_lineage = campaign_lineage.rename(columns={"enzyme_id": "enzyme"})
+
+                        # Merge campaign metrics with lineage data
+                        campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+
+                        # Save campaign-specific file immediately
+                        output_dir = args.output.parent
+                        base_name = args.output.stem
+                        campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                        campaign_final.to_csv(campaign_file, index=False)
+                        LOGGER.info("Saved %d rows for campaign %s -> %s", len(campaign_final), campaign, campaign_file)
+
+                        # Add the merged data (not just metrics) to final results
+                        all_results.append(campaign_final)
+                        LOGGER.info("Added %d merged results for campaign %s", len(campaign_final), campaign)
+                    else:
+                        LOGGER.warning("No results extracted for campaign %s", campaign)
+
+                        # Still save an empty campaign file with lineage data
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if not campaign_lineage.empty:
+                            output_dir = args.output.parent
+                            base_name = args.output.stem
+                            campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                            campaign_lineage.to_csv(campaign_file, index=False)
+                            LOGGER.info("Saved %d rows (lineage only) for campaign %s -> %s", len(campaign_lineage), campaign, campaign_file)
+
+                # Combine all campaign results
+                if all_results:
+                    df_metrics = pd.concat(all_results, ignore_index=True)
+                    LOGGER.info("Combined results from %d campaigns: %d total rows", len(all_results), len(df_metrics))
+                else:
+                    LOGGER.warning("No results from any campaign")
+                    df_metrics = pd.DataFrame()
+        else:
+            # No campaign information, process all enzymes together
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                          campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+            df_metrics = extractor.run(enzyme_df)
 
-    if args.lineage_csv and args.lineage_csv.exists() and not df_metrics.empty:
-        LOGGER.info("Merging with lineage CSV…")
-        df_final = merge_with_lineage_data(enzyme_df, df_metrics)
-    else:
-        df_final = df_metrics
+    # Skip final merge since campaign-specific merges already happened during processing
+    # This avoids duplicate entries when same enzyme appears in multiple campaigns
+    df_final = df_metrics
+    LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
 
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
+
+    # Campaign-specific files are already saved during processing above
 
 if __name__ == "__main__":
     main()
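
One consequence of the new main() worth noting: in multi-campaign mode each campaign's merged table is written next to the requested output file, named from its stem. A sketch of that naming rule, with hypothetical paths and campaign IDs:

from pathlib import Path

output = Path("results/reactions.csv")        # value of --output (hypothetical)
for campaign in ["campA", "campB"]:
    campaign_file = output.parent / f"{output.stem}_{campaign}.csv"
    print(campaign_file)                      # results/reactions_campA.csv, ...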