debase 0.1.19__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +40 -8
- debase/enzyme_lineage_extractor.py +153 -9
- debase/reaction_info_extractor.py +1119 -504
- debase/substrate_scope_extractor.py +83 -34
- debase/wrapper.py +75 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/METADATA +1 -1
- debase-0.4.0.dist-info/RECORD +16 -0
- debase/PIPELINE_FLOW.md +0 -100
- debase-0.1.19.dist-info/RECORD +0 -17
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/WHEEL +0 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/entry_points.txt +0 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ Key June 2025 additions
 from __future__ import annotations

 import argparse
+import hashlib
 import json
 import logging
 import os
@@ -31,7 +32,9 @@ import re
 import sys
 import time
 from base64 import b64encode, b64decode
+from collections import OrderedDict
 from dataclasses import dataclass, field
+from functools import lru_cache
 from pathlib import Path
 from textwrap import dedent
 from typing import Any, Dict, List, Optional, Tuple
@@ -40,7 +43,6 @@ import fitz  # PyMuPDF - for image extraction
 import google.generativeai as genai  # type: ignore
 import pandas as pd
 from PyPDF2 import PdfReader
-import PIL.Image
 import io

 ###############################################################################
@@ -51,7 +53,7 @@ import io
 class Config:
     """Centralised tunables so tests can override them easily."""

-    model_name: str = "gemini-
+    model_name: str = "gemini-2.5-flash"
     location_temperature: float = 0.2
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
@@ -117,6 +119,144 @@ def get_model(cfg: Config):
     genai.configure(api_key=api_key)
     return genai.GenerativeModel(cfg.model_name)

+# Bounded LRU caches to store prompt/image content by hash (prevents memory leaks)
+
+class LRUCache:
+    """Simple LRU cache implementation."""
+    def __init__(self, maxsize: int):
+        self.maxsize = maxsize
+        self.cache = OrderedDict()
+
+    def get(self, key: str) -> Optional[str]:
+        if key in self.cache:
+            # Move to end (most recently used)
+            self.cache.move_to_end(key)
+            return self.cache[key]
+        return None
+
+    def put(self, key: str, value: str) -> None:
+        if key in self.cache:
+            # Update existing
+            self.cache.move_to_end(key)
+        else:
+            # Add new, evict oldest if needed
+            if len(self.cache) >= self.maxsize:
+                self.cache.popitem(last=False)
+        self.cache[key] = value
+
+    def __len__(self) -> int:
+        return len(self.cache)
+
+# Global bounded caches
+_PROMPT_CACHE = LRUCache(maxsize=1000)
+_IMAGE_CACHE = LRUCache(maxsize=500)  # Images are larger, so smaller cache
+
+def get_cache_stats() -> Dict[str, Any]:
+    """Get cache statistics for debugging."""
+    return {
+        "gemini_cache_info": _cached_gemini_call.cache_info(),
+        "prompt_cache_size": len(_PROMPT_CACHE),
+        "image_cache_size": len(_IMAGE_CACHE),
+    }
+
+@lru_cache(maxsize=1000)
+def _cached_gemini_call(
+    model_name: str,
+    prompt_hash: str,
+    image_hash: Optional[str],
+    temperature: float,
+    max_retries: int,
+) -> str:
+    """Pure cached function for Gemini API calls using only hash keys.
+
+    Args:
+        model_name: Name of the Gemini model
+        prompt_hash: SHA256 hash of the prompt
+        image_hash: SHA256 hash of the image (if any)
+        temperature: Temperature for generation
+        max_retries: Maximum number of retries
+
+    Returns:
+        Raw response text from Gemini
+    """
+    # Retrieve actual content from LRU cache
+    prompt = _PROMPT_CACHE.get(prompt_hash)
+    image_b64 = _IMAGE_CACHE.get(image_hash) if image_hash else None
+
+    if prompt is None:
+        raise RuntimeError(f"Prompt content not found for hash {prompt_hash}")
+
+    # Configure API key (this is idempotent)
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
+    genai.configure(api_key=api_key)
+
+    # Create model instance (not cached since it's lightweight)
+    model = genai.GenerativeModel(model_name)
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Handle image if provided
+            if image_b64:
+                # Decode base64 string to bytes for Gemini API
+                image_bytes = b64decode(image_b64)
+                parts = [prompt, {"mime_type": "image/png", "data": image_bytes}]
+            else:
+                parts = [prompt]
+
+            resp = model.generate_content(
+                parts,
+                generation_config={
+                    "temperature": temperature,
+                    "max_output_tokens": 8192,
+                }
+            )
+            # Track token usage if available
+            try:
+                if hasattr(resp, 'usage_metadata'):
+                    input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
+            return resp.text.strip()
+        except Exception as exc:
+            if attempt == max_retries:
+                raise
+            time.sleep(2 ** attempt)
+
+    # Should never reach here
+    raise RuntimeError("Max retries exceeded")
+
+def _normalize_prompt_for_caching(prompt: str) -> str:
+    """Normalize prompt for better cache hit rates by removing boilerplate and collapsing whitespace."""
+    # Remove common boilerplate lines that don't affect the core query
+    lines = prompt.split('\n')
+    normalized_lines = []
+
+    for line in lines:
+        # Skip timestamp and debug lines
+        if any(skip in line.lower() for skip in ['timestamp:', 'length:', 'characters', '===', '***']):
+            continue
+        # Skip lines that are just separators
+        if line.strip() and not line.strip().replace('=', '').replace('-', '').replace('*', ''):
+            continue
+        # Collapse whitespace but preserve structure
+        normalized_lines.append(' '.join(line.split()))
+
+    # Join and collapse multiple newlines
+    normalized = '\n'.join(normalized_lines)
+    normalized = re.sub(r'\n\s*\n+', '\n\n', normalized)

+    return normalized.strip()
+
 def generate_json_with_retry(
     model,
     prompt: str,
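The caching layer added above is worth pausing on: `functools.lru_cache` can only key on small hashable arguments, so `_cached_gemini_call` receives SHA-256 digests while the bulky prompt/image payloads are parked in the bounded `LRUCache` side tables. A minimal, self-contained sketch of the same pattern follows; the names `BoundedLRU`, `expensive_call`, and `cached_call` are illustrative stand-ins, not part of the debase API.

```python
import hashlib
from collections import OrderedDict
from functools import lru_cache

class BoundedLRU:
    """Bounded LRU dict: evicts the least-recently-used entry once full."""
    def __init__(self, maxsize: int):
        self.maxsize = maxsize
        self.cache = OrderedDict()

    def get(self, key):
        if key in self.cache:
            self.cache.move_to_end(key)  # mark as most recently used
            return self.cache[key]
        return None

    def put(self, key, value):
        if key in self.cache:
            self.cache.move_to_end(key)
        elif len(self.cache) >= self.maxsize:
            self.cache.popitem(last=False)  # evict the oldest entry
        self.cache[key] = value

_PAYLOADS = BoundedLRU(maxsize=100)

def expensive_call(payload: str) -> str:
    """Stand-in for the real API request."""
    return payload.upper()

@lru_cache(maxsize=100)
def cached_call(payload_hash: str) -> str:
    # lru_cache keys on the small hash string; the bulky payload lives
    # in the side table and is only fetched on a cache miss.
    payload = _PAYLOADS.get(payload_hash)
    if payload is None:
        raise RuntimeError(f"payload missing for hash {payload_hash}")
    return expensive_call(payload)

def call(payload: str) -> str:
    h = hashlib.sha256(payload.encode()).hexdigest()
    _PAYLOADS.put(h, payload)  # stash content before the hash-keyed call
    return cached_call(h)

assert call("extract yield data") == call("extract yield data")  # second call hits the cache
```

One subtlety the real code accepts: if the side table evicts a payload whose hash has never produced an `lru_cache` hit, the next miss raises rather than silently rebuilding the content.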
@@ -129,9 +269,17 @@ def generate_json_with_retry(
     image_b64: Optional[str] = None,
 ):
     """Call Gemini with retries & exponential back-off, returning parsed JSON."""
+    # Generate cache keys based on normalized prompt and image content
+    normalized_prompt = _normalize_prompt_for_caching(prompt)
+    prompt_hash = hashlib.sha256(normalized_prompt.encode()).hexdigest()
+    image_hash = hashlib.sha256(image_b64.encode()).hexdigest() if image_b64 else None
+
     # Log prompt details
     LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
     LOGGER.info("Prompt length: %d characters", len(prompt))
+    LOGGER.info("Prompt hash: %s", prompt_hash[:16])
+    if image_hash:
+        LOGGER.info("Image hash: %s", image_hash[:16])
     LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])

     # Save full prompt to debug directory if provided
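Cache keys are derived from a normalized prompt, so prompts that differ only in volatile boilerplate (timestamps, separators, whitespace runs) hash identically. A trimmed illustration of the idea; `normalize` is a simplified stand-in for `_normalize_prompt_for_caching`, which additionally drops separator-only lines.

```python
import hashlib
import re

def normalize(prompt: str) -> str:
    kept = []
    for line in prompt.split('\n'):
        if 'timestamp:' in line.lower():     # drop volatile debug lines
            continue
        kept.append(' '.join(line.split()))  # collapse runs of whitespace
    return re.sub(r'\n\s*\n+', '\n\n', '\n'.join(kept)).strip()

a = "Timestamp: 2025-06-01 12:00\nExtract   yield data"
b = "Timestamp: 2025-06-02 09:30\nExtract yield data"
assert hashlib.sha256(normalize(a).encode()).hexdigest() == \
       hashlib.sha256(normalize(b).encode()).hexdigest()
```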
@@ -139,107 +287,142 @@ def generate_json_with_retry(
         debug_path = Path(debug_dir)
         debug_path.mkdir(parents=True, exist_ok=True)
         prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
-        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\n{'='*80}\n\n{prompt}",
+        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{prompt}",
              prompt_file)
         LOGGER.info("Full prompt saved to: %s", prompt_file)

-[22 removed lines not preserved in this rendering]
-        LOGGER.info("
-[7 removed lines not preserved in this rendering]
+    try:
+        # Store content in bounded LRU caches for the cached function to retrieve
+        _PROMPT_CACHE.put(prompt_hash, prompt)
+        if image_hash and image_b64:
+            _IMAGE_CACHE.put(image_hash, image_b64)
+
+        # Check if this will be a cache hit
+        cache_info_before = _cached_gemini_call.cache_info()
+
+        # Use cached Gemini call (only with hash keys)
+        LOGGER.info("Calling cached Gemini API...")
+        raw = _cached_gemini_call(
+            model_name=model.model_name,
+            prompt_hash=prompt_hash,
+            image_hash=image_hash,
+            temperature=temperature,
+            max_retries=max_retries,
+        )
+
+        # Log cache performance
+        cache_info_after = _cached_gemini_call.cache_info()
+        if cache_info_after.hits > cache_info_before.hits:
+            LOGGER.info("✓ Cache HIT for prompt hash %s", prompt_hash[:16])
+        else:
+            LOGGER.info("✗ Cache MISS for prompt hash %s", prompt_hash[:16])
+
+        # Log response
+        LOGGER.info("Gemini response length: %d characters", len(raw))
+        LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
+
+        # Save full response to debug directory
+        if debug_dir:
+            response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+            _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{raw}",
+                  response_file)
+            LOGGER.info("Full response saved to: %s", response_file)

-[6 removed lines not preserved in this rendering]
+        # Remove common Markdown fences more carefully
+        if raw.startswith("```json"):
+            raw = raw[7:].strip()  # Remove ```json
+        elif raw.startswith("```"):
+            raw = raw[3:].strip()  # Remove ```
+
+        if raw.endswith("```"):
+            raw = raw[:-3].strip()  # Remove trailing ```
+
+
+        # Simple JSON parsing approach
+        # Try direct parsing first
+        LOGGER.debug(f"Raw JSON length: {len(raw)}")
+        LOGGER.debug(f"Raw JSON first 200 chars: {raw[:200]}")
+        LOGGER.debug(f"Raw JSON last 200 chars: {raw[-200:]}")
+
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError as e:
+            LOGGER.error(f"JSON parsing failed at position {e.pos}: {e}")
+            LOGGER.error(f"Character at error: {repr(raw[e.pos] if e.pos < len(raw) else 'END')}")
+            LOGGER.error(f"Context: {repr(raw[max(0, e.pos-20):e.pos+20])}")
+
+            # Count braces and quotes for debugging
+            open_braces = raw.count('{')
+            close_braces = raw.count('}')
+            quotes = raw.count('"')
+            LOGGER.error(f"Braces: {open_braces} open, {close_braces} close. Quotes: {quotes}")
+
+            # If that fails, try to extract JSON from the response using a simpler method
             try:
-[4 removed lines not preserved in this rendering]
-            json_start = -1
-            json_end = -1
-            bracket_stack = []
-            in_string = False
-            escape_next = False
+                # Look for the JSON object start and end
+                start_idx = raw.find('{')
+                if start_idx == -1:
+                    raise json.JSONDecodeError("No JSON object found", raw, 0)

-[4 removed lines not preserved in this rendering]
+                # Find the matching closing brace by counting
+                brace_count = 0
+                end_idx = -1
+                for i in range(start_idx, len(raw)):
+                    if raw[i] == '{':
+                        brace_count += 1
+                    elif raw[i] == '}':
+                        brace_count -= 1
+                        if brace_count == 0:
+                            end_idx = i + 1
+                            break
+
+                if end_idx == -1:
+                    raise json.JSONDecodeError("No matching closing brace found", raw, 0)
+
+                json_str = raw[start_idx:end_idx]
+                LOGGER.debug(f"Extracted JSON string: {json_str[:200]}...")
+                parsed = json.loads(json_str)
+
+            except json.JSONDecodeError:
+                # Final fallback - try to use eval as a last resort (unsafe but functional)
+                try:
+                    # Replace problematic characters and try to parse as Python dict
+                    safe_raw = raw.replace('null', 'None').replace('true', 'True').replace('false', 'False')
+                    start_idx = safe_raw.find('{')
+                    if start_idx == -1:
+                        raise ValueError("No dict found")

-[7 removed lines not preserved in this rendering]
+                    brace_count = 0
+                    end_idx = -1
+                    for i in range(start_idx, len(safe_raw)):
+                        if safe_raw[i] == '{':
+                            brace_count += 1
+                        elif safe_raw[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                end_idx = i + 1
+                                break

-            if
-
+                    if end_idx == -1:
+                        raise ValueError("No matching closing brace found")

-[7 removed lines not preserved in this rendering]
-                if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
-                    if not bracket_stack:  # Found complete JSON
-                        json_end = i + 1
-                        break
-
-            if json_start >= 0 and json_end > json_start:
-                # Extract the JSON portion
-                json_str = raw[json_start:json_end]
-                parsed = json.loads(json_str)
-            else:
-                # Look for simple [] in the response
+                    dict_str = safe_raw[start_idx:end_idx]
+                    parsed = eval(dict_str)  # This is unsafe but we trust our own generated content
+                    LOGGER.warning("Used eval() fallback for JSON parsing")
+
+                except Exception:
+                    # If all else fails, return empty dict
+                    LOGGER.error("All JSON parsing methods failed")
                    if '[]' in raw:
                        parsed = []
                    else:
-                        # No JSON structure found, re-raise the original error
                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
-[6 removed lines not preserved in this rendering]
-        )
-        if attempt == max_retries:
-            raise
-        time.sleep(2 ** attempt)
+
+        LOGGER.info("Successfully parsed JSON response")
+        return parsed
+    except Exception as exc:
+        LOGGER.error("Cached Gemini call failed: %s", exc)
+        raise


 ###############################################################################
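The fallback parser above recovers a JSON object from chatty model output by scanning from the first `{` and counting braces. Standalone, the technique looks like the sketch below; note that a `}` inside a JSON string would fool the counter, a trade-off the real code accepts before resorting to its final `eval()` fallback.

```python
import json

def extract_json_object(raw: str) -> dict:
    """Return the first balanced {...} object embedded in raw text."""
    start = raw.find('{')
    if start == -1:
        raise json.JSONDecodeError("No JSON object found", raw, 0)
    depth = 0
    for i in range(start, len(raw)):
        if raw[i] == '{':
            depth += 1
        elif raw[i] == '}':
            depth -= 1
            if depth == 0:
                return json.loads(raw[start:i + 1])
    raise json.JSONDecodeError("No matching closing brace", raw, 0)

print(extract_json_object('Sure! Here is the data: {"yield": 82.5} Hope that helps.'))
# {'yield': 82.5}
```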
@@ -248,14 +431,14 @@ def generate_json_with_retry(

 PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
-Given the following article captions and section titles, identify
+Given the following article captions and section titles, identify most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants.
+activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.

 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
 - The caption text to identify which campaign/lineage the data is for
-- Enzyme name prefixes
+- Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions

 Respond with a JSON array where each element contains:
@@ -267,8 +450,10 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)

-Tables are preferred over figures
-
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Do not include too much sources, just return 2 or 3 sources.
+Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
+When returning confidence scores, be more accurate and avoid scores that are too close together.
 Respond ONLY with **minified JSON**. NO markdown fences.

 Example:
@@ -280,7 +465,8 @@ You are given either (a) the PNG image of a figure panel, or (b) the caption /
 text excerpt that contains numeric reaction performance data for an enzyme.

 Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
-
+
+Return a JSON object with the following keys (use **null** only if the value is not mentioned at all):
 * "yield" - yield as percentage with ONE decimal place precision
 * "ttn" - turnover number (total turnovers)
 * "ton" - turnover number if TTN not available
@@ -296,45 +482,66 @@ IMPORTANT:
 - Do NOT extract substrate/product names - these will come from SI
 - Do NOT extract reaction conditions (temperature, pH, time, solvent)
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
+- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)

 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)

 PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
-[3 removed lines not preserved in this rendering]
+STEP 1: First, identify ALL X-axis labels in the figure
+- Read each X-axis label from left to right
+- List exactly what text appears under each bar/data point
+- Note: Labels may be abbreviated or use different naming conventions
+
+STEP 2: Match X-axis labels to target enzyme variants
+- Compare each X-axis label against the target enzyme list below
+- Look for partial matches, abbreviations, or similar naming patterns
+- If an X-axis label doesn't match any target enzyme, still include it for completeness
+
+STEP 3: Identify Y-axis scales and what they measure
+- Look at the Y-axis labels and tick marks to understand what each axis measures
+- If there are multiple Y-axes (left and right), read the axis labels and units
+- Note the minimum and maximum values on each axis scale
+- Identify which visual elements (bars, dots, lines) correspond to which axis
+
+STEP 4: Extract values for each matched variant
+- For each X-axis position, identify which visual elements belong to that position
+- LEFT Y-axis (bars): Measure bar height against the left scale by reading tick marks
+- RIGHT Y-axis (dots): Measure dot position against the right scale by reading tick marks
+- CRITICAL: Read actual scale values from the axis labels and tick marks
+- Verify: taller bars should have higher values, higher dots should have higher values
+
+Target enzymes to find and extract:
 {enzyme_names}

-[28 removed lines (earlier prompt body) not preserved in this rendering; only stubs such as "1.", "2.", and '* "' survive, plus the closing line below]
-Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
+Instructions:
+1. First, list ALL X-axis labels you can see in the figure
+2. Match each X-axis label to the target enzyme variants
+3. For matched variants, extract both bar heights (left Y-axis) and dot positions (right Y-axis)
+4. Return data only for variants that have clear X-axis labels and are matched to targets
+
+Return JSON with the identified enzyme variant names as keys containing:
+* "x_axis_label" - the exact text from the X-axis for this variant
+* "yield" - percentage from left Y-axis bar height measurement
+* "ttn" - turnover number from right Y-axis dot position measurement
+* "ton" - if TTN not available
+* "selectivity" - if shown
+* "conversion" - if different from yield
+* "tof" - if provided
+* "activity" - if provided
+* "other_metrics" - other metrics
+* "notes" - REQUIRED: Describe the X-axis label, bar position, and dot position (e.g., "X-axis shows P411-CIS, leftmost bar is very short, dot is at bottom")
+
+CRITICAL: Return ONLY valid JSON in this exact format:
+{{"enzyme_name": {{"x_axis_label": "label", "yield": number, "ttn": number, "notes": "description"}}}}
+
+Rules:
+- Use double quotes for all strings
+- No markdown, no commentary, no explanations
+- All values must be properly formatted
+- Ensure JSON is complete and valid
+- Do not truncate or cut off the response
+- IMPORTANT: When extracting data, prioritize the most complete source that shows data for ALL variants. If there are conflicting values between different sources (e.g., bar graph vs text values), use the source that provides complete data for all target enzymes and ignore partial or conflicting values from other sources
 """)

 # Removed substrate scope IUPAC extraction - now handled in model reaction only
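The batch prompt above demands minified JSON keyed by enzyme variant name. Assuming the model complies, the response flattens into per-enzyme rows in a couple of lines; the data below is invented purely for illustration.

```python
import json

raw = ('{"P411-CIS": {"x_axis_label": "P411-CIS", "yield": 71.0, "ttn": 2540, '
       '"notes": "tall bar, dot near top of right axis"}}')

rows = [
    {"enzyme": name, **metrics}          # one flat record per variant
    for name, metrics in json.loads(raw).items()
]
print(rows[0]["enzyme"], rows[0]["yield"])  # P411-CIS 71.0
```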
@@ -519,13 +726,25 @@ class ReactionExtractor:
     _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)

     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
-                 campaign_filter: Optional[str] = None):
+                 campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
         self.manuscript = manuscript
         self.si = si
         self.cfg = cfg
         self.model = get_model(cfg)
         self.debug_dir = debug_dir
         self.campaign_filter = campaign_filter  # Filter for specific campaign
+        self.all_campaigns = all_campaigns or []  # List of all campaigns for context
+
+        # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
+        self._figure_cache = LRUCache(maxsize=100)  # Figures are large, so smaller cache
+        self._model_reaction_locations_cache = LRUCache(maxsize=50)
+
+        # Cache for compound mappings to avoid repeated API calls (bounded to prevent memory leaks)
+        self._compound_mapping_cache = LRUCache(maxsize=1000)
+        self._compound_mapping_text_cache = LRUCache(maxsize=500)  # Cache text extractions too
+
+        # Cache for reaction locations to avoid repeated API calls (bounded to prevent memory leaks)
+        self._reaction_locations_cache = LRUCache(maxsize=50)

         # Create debug directory if specified
         if self.debug_dir:
@@ -551,23 +770,40 @@ class ReactionExtractor:
     # ------------------------------------------------------------------

     def _collect_captions_and_titles(self) -> str:
-        #
-        # This catches all variations
-
+        # Pattern to match Table or Figure with optional leading whitespace
+        # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
+        # Also handles cases where there's whitespace before the caption
+        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []

         # Collect from all pages
         all_text = "\n".join(self.all_pages)

-        # Find all figure/table captions
+        # Find all figure/table captions with more context
         for match in cap_pattern.finditer(all_text):
             caption_start = match.start()
-
+
+            # Include some context before the caption (up to 200 chars)
+            context_start = max(0, caption_start - 200)
+            # Find the start of the sentence/paragraph before the caption
+            context_text = all_text[context_start:caption_start]
+            last_period = context_text.rfind('.')
+            if last_period != -1:
+                context_start = context_start + last_period + 1
+
+            # For tables, include much more content after the caption to show actual table data
+            # For figures, keep the original limit
+            is_table = match.group(1).lower() == 'table'
+            max_chars = 5000 if is_table else 3000
+
+            # Get up to max_chars or until double newline
             caption_end = all_text.find("\n\n", caption_start)
-            if caption_end == -1 or caption_end - caption_start >
-                caption_end = caption_start +
-
-
+            if caption_end == -1 or caption_end - caption_start > max_chars:
+                caption_end = caption_start + max_chars
+
+            # Include the context and full caption with table content
+            full_caption = all_text[context_start:caption_end].strip()
+            captions.append(full_caption)

         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
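The caption harvester above now keeps a window of text after each caption, larger for tables so the rows beneath a caption travel with it to the model. A compact, runnable sketch of the same loop:

```python
import re

cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)

def collect_captions(all_text: str) -> list[str]:
    captions = []
    for match in cap_pattern.finditer(all_text):
        start = match.start()
        is_table = match.group(1).lower() == 'table'
        max_chars = 5000 if is_table else 3000   # tables carry their data rows
        end = all_text.find("\n\n", start)
        if end == -1 or end - start > max_chars:
            end = start + max_chars
        captions.append(all_text[start:end].strip())
    return captions

text = "Table 1. Yields for evolved variants\nP411 82%\n\nFigure 2. Lineage map"
print(collect_captions(text))
# ['Table 1. Yields for evolved variants\nP411 82%', 'Figure 2. Lineage map']
```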
@@ -584,15 +820,47 @@ class ReactionExtractor:

     def find_reaction_locations(self) -> List[Dict[str, Any]]:
         """Find all locations containing reaction performance data."""
-        #
+        # Create cache key based on campaign filter
+        cache_key = f"locations_{self.campaign_filter or 'all'}"
+
+        # Check cache first
+        cached_result = self._reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+            return cached_result
+
+        # Add campaign context - always provide context to help model understanding
         campaign_context = ""
         if self.campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+Be extremely careful to only extract data for the {self.campaign_filter} campaign.
+"""
+
             campaign_context = f"""
-IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
-Only return locations that contain data for this specific campaign.
-Ignore locations that contain data for other campaigns.
+IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
+Only return locations that contain data for this specific campaign.
+Ignore locations that contain data for other campaigns.
+{campaigns_warning}

-"""
+"""
+        else:
+            # Even for single campaigns, provide context about what to look for
+            campaign_context = f"""
+IMPORTANT: You are looking for performance data showing enzyme evolution progression.
+Look for locations that contain actual performance metrics (yield, TTN, TON, activity, etc.)
+for multiple enzyme variants, not just mutation lists or method descriptions.
+
+Tables may only contain mutation information without performance data - check the actual
+table content below the caption to verify if performance metrics are present.
+Figures with evolutionary lineage data often contain the actual performance matrix.
+
+"""

         prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
         try:
@@ -604,13 +872,20 @@ Ignore locations that contain data for other campaigns.
                 tag="find_locations"
             )
             # Handle both single dict (backwards compatibility) and list
+            result = []
             if isinstance(data, dict):
-
+                result = [data]
             elif isinstance(data, list):
-
+                result = data
             else:
                 LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
-
+                result = []
+
+            # Cache the result
+            self._reaction_locations_cache.put(cache_key, result)
+            LOGGER.info("Cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+
+            return result
         except Exception as e:
             LOGGER.error("Failed to find reaction locations: %s", e)
             return []
@@ -686,13 +961,27 @@ Ignore locations that contain data for other campaigns.
         }

     def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        """Find the model reaction for a specific lineage group.
+        """Find the model reaction for a specific lineage group.
+        Returns early if no relevant text is found to avoid unnecessary API calls."""
+
         # Gather relevant text near this location
         page_text = self._page_with_reference(location) or ""

+        # Early exit if no text found for this location
+        if not page_text or len(page_text.strip()) < 100:
+            LOGGER.info("No sufficient text found for location %s, skipping lineage-specific extraction", location)
+            return {}
+
         # Also check manuscript introduction for model reaction info
         intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""

+        # Quick relevance check - look for reaction-related keywords
+        reaction_keywords = ["substrate", "product", "reaction", "compound", "synthesis", "procedure", "method"]
+        combined_text = (page_text + intro_text).lower()
+        if not any(keyword in combined_text for keyword in reaction_keywords):
+            LOGGER.info("No reaction-related keywords found for location %s, skipping lineage extraction", location)
+            return {}
+
         # Build the prompt with location and context
         prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
             location=location,
|
|
702
991
|
prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
|
703
992
|
|
704
993
|
# If we have model reaction locations, include text from those locations too
|
994
|
+
text_added = False
|
705
995
|
if model_reaction_locations:
|
706
996
|
# Add text from model reaction location
|
707
997
|
if model_reaction_locations.get("model_reaction_location", {}).get("location"):
|
@@ -709,6 +999,7 @@ Ignore locations that contain data for other campaigns.
|
|
709
999
|
model_text = self._get_text_around_location(model_loc)
|
710
1000
|
if model_text:
|
711
1001
|
prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
|
1002
|
+
text_added = True
|
712
1003
|
|
713
1004
|
# Add text from conditions location (often contains reaction details)
|
714
1005
|
if model_reaction_locations.get("conditions_location", {}).get("location"):
|
@@ -716,8 +1007,15 @@ Ignore locations that contain data for other campaigns.
|
|
716
1007
|
cond_text = self._get_text_around_location(cond_loc)
|
717
1008
|
if cond_text:
|
718
1009
|
prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
|
1010
|
+
text_added = True
|
1011
|
+
|
1012
|
+
# If we didn't find any model reaction locations and the page text is sparse, skip
|
1013
|
+
if not text_added and len(page_text.strip()) < 500:
|
1014
|
+
LOGGER.info("Insufficient context for lineage model reaction extraction at %s", location)
|
1015
|
+
return {}
|
719
1016
|
|
720
1017
|
try:
|
1018
|
+
LOGGER.info("Attempting lineage-specific model reaction extraction for %s", location)
|
721
1019
|
data = generate_json_with_retry(
|
722
1020
|
self.model,
|
723
1021
|
prompt,
|
@@ -725,7 +1023,15 @@ Ignore locations that contain data for other campaigns.
|
|
725
1023
|
debug_dir=self.debug_dir,
|
726
1024
|
tag=f"lineage_model_reaction_{location.replace(' ', '_')}"
|
727
1025
|
)
|
728
|
-
|
1026
|
+
|
1027
|
+
# Validate the response has useful information
|
1028
|
+
if isinstance(data, dict) and (data.get('substrate_ids') or data.get('product_ids')):
|
1029
|
+
LOGGER.info("Lineage model reaction extraction successful for %s", location)
|
1030
|
+
return data
|
1031
|
+
else:
|
1032
|
+
LOGGER.info("Lineage model reaction extraction returned empty results for %s", location)
|
1033
|
+
return {}
|
1034
|
+
|
729
1035
|
except Exception as e:
|
730
1036
|
LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
|
731
1037
|
return {}
|
@@ -777,67 +1083,174 @@ Ignore locations that contain data for other campaigns.
            If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""

+        # Check cache first
+        cache_key = f"{ref}_{extract_figure_only}"
+        cached_result = self._figure_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.debug("Using cached figure for %s", ref)
+            return cached_result
+
         # For table extraction, use multi-page approach
         if not extract_figure_only:
             pages_with_ref = self._find_pages_with_reference(ref)
             if pages_with_ref:
                 LOGGER.debug(f"Found {len(pages_with_ref)} pages containing {ref}")
-                return self._extract_multiple_pages_png(pages_with_ref)
+                return self._extract_multiple_pages_png(pages_with_ref, ref)
             return None

-        # For figure extraction, search both documents
+        # For figure extraction, search both documents for actual figure captions
         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
-
-
-                #
-
-
+
+                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
+                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
+                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                main_figure_num = re.match(r'^(\d+)', figure_num)
+                if main_figure_num:
+                    main_figure_num = main_figure_num.group(1)
+                else:
+                    main_figure_num = figure_num
+
+                caption_patterns = [
+                    rf"^Figure\s+{re.escape(main_figure_num)}\.",  # "Figure 1."
+                    rf"^Figure\s+{re.escape(main_figure_num)}:",  # "Figure 1:"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
+                ]
+
+                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
+                             main_figure_num, ref, caption_patterns)
+
+                caption_found = False
+                cap_rect = None
+
+                for pattern in caption_patterns:
+                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                    if matches:
+                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
+                        # Found actual figure caption, get its position
+                        caption_text = matches.group(0)
+                        text_instances = page.search_for(caption_text, quads=False)
+                        if text_instances:
+                            cap_rect = text_instances[0]
+                            caption_found = True
+                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
+                            break
+
+                if not caption_found:
+                    # Debug: show what figure-related text is actually on this page
+                    figure_mentions = [line.strip() for line in page_text.split('\n')
+                                       if 'figure' in line.lower() and main_figure_num.lower() in line.lower()]
+                    if figure_mentions:
+                        LOGGER.debug("Page %d has figure mentions but no caption match: %s",
+                                     page_number, figure_mentions[:3])
                     continue
-                cap_rect = text_instances[0]  # first match

                 if extract_figure_only:
-                    #
-[14 removed lines not preserved in this rendering]
+                    # Extract only the area above the caption (the actual figure)
+                    # This excludes caption text and focuses on visual elements
+                    LOGGER.info("Extracting figure area above caption for %s", ref)
+
+                    # Get the page dimensions
+                    page_rect = page.rect
+
+                    # Extract the area above the caption
+                    if cap_rect:
+                        # Extract from top of page to top of caption
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
+                        LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
+                    else:
+                        # If no caption found, use top 80% of page
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
+                        LOGGER.debug("No caption found, using top 80% of page: %s", figure_rect)
+
+                    # Extract the figure area only
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat, clip=figure_rect)
+                    pix = self._ensure_rgb_pixmap(pix)
+                    img_bytes = pix.tobytes("png")
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"figure_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved figure page to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
                 else:
                     # Extract the entire page as an image
-                    mat = fitz.Matrix(
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
                     pix = page.get_pixmap(matrix=mat)
                     pix = self._ensure_rgb_pixmap(pix)
                     img_bytes = pix.tobytes("png")
-
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved page image to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
         return None

     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
         """Find all pages containing the reference across documents.
+        Prioritizes pages with actual captions over just references.
         Returns list of (document, page_number) tuples."""
         pages_found = []
+        caption_pages = []

         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+
+                # Check for actual figure caption first
+                if ref.lower().startswith('figure'):
+                    figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                    # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                    main_figure_num = re.match(r'^(\d+)', figure_num)
+                    if main_figure_num:
+                        main_figure_num = main_figure_num.group(1)
+                    else:
+                        main_figure_num = figure_num
+
+                    caption_patterns = [
+                        rf"^Figure\s+{re.escape(main_figure_num)}\.",
+                        rf"^Figure\s+{re.escape(main_figure_num)}:",
+                        rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]"
+                    ]
+
+                    for pattern in caption_patterns:
+                        if re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE):
+                            caption_pages.append((doc, page_number))
+                            break
+
+                # Fallback to any mention of the reference
                 if ref.lower() in page_text.lower():
                     pages_found.append((doc, page_number))
-
-
+
+        # Return caption pages first, then other pages
+        return caption_pages + [p for p in pages_found if p not in caption_pages]

-    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
+    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]], ref: str = "unknown") -> Optional[str]:
         """Extract multiple pages as a combined PNG image."""
         if not pages:
             return None
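The figure-cropping logic above is the heart of the new extraction path: find the caption's bounding box with PyMuPDF, then rasterize only the region above it, where the artwork sits. A condensed standalone sketch; the file name and caption string are placeholders.

```python
import fitz  # PyMuPDF

doc = fitz.open("manuscript.pdf")        # placeholder path
page = doc.load_page(0)
hits = page.search_for("Figure 1.")      # rectangles where the caption text occurs
# Crop from the top of the page down to the caption; fall back to the top 80%.
clip = (fitz.Rect(0, 0, page.rect.width, hits[0].y0)
        if hits else
        fitz.Rect(0, 0, page.rect.width, page.rect.height * 0.8))
pix = page.get_pixmap(matrix=fitz.Matrix(5.0, 5.0), clip=clip)  # 5x zoom
pix.save("figure1.png")
```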
@@ -854,12 +1267,7 @@ Ignore locations that contain data for other campaigns.
             pix = self._ensure_rgb_pixmap(pix)
             all_images.append(pix)

-            #
-            if i == len(pages) - 1 and page_num + 1 < doc.page_count:
-                next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
-                next_pix = self._ensure_rgb_pixmap(next_pix)
-                all_images.append(next_pix)
-                LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
+            # Only extract the page containing the reference (removed next page logic)

         if not all_images:
             return None
@@ -867,7 +1275,17 @@ Ignore locations that contain data for other campaigns.
         # If only one page, return it directly
         if len(all_images) == 1:
             pix = self._ensure_rgb_pixmap(all_images[0])
-
+            img_bytes = pix.tobytes("png")
+
+            # Save debug file if available
+            if self.debug_dir:
+                timestamp = int(time.time())
+                png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                with open(png_file, 'wb') as f:
+                    f.write(img_bytes)
+                LOGGER.info("Saved multi-page image to: %s", png_file)
+
+            return b64encode(img_bytes).decode()

         # Combine multiple pages vertically
         if not all_images:
@@ -914,7 +1332,7 @@ Ignore locations that contain data for other campaigns.
             y_offset += pix.height * scale

         # Convert the page to a pixmap
-        mat = fitz.Matrix(
+        mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for quality
         combined_pix = page.get_pixmap(matrix=mat)
         combined_pix = self._ensure_rgb_pixmap(combined_pix)

@@ -922,6 +1340,14 @@ Ignore locations that contain data for other campaigns.
         img_bytes = combined_pix.tobytes("png")
         output_doc.close()

+        # Save debug file if available
+        if self.debug_dir:
+            timestamp = int(time.time())
+            png_file = self.debug_dir / f"combined_pages_{ref.replace(' ', '_')}_{timestamp}.png"
+            with open(png_file, 'wb') as f:
+                f.write(img_bytes)
+            LOGGER.info("Saved combined multi-page image to: %s", png_file)
+
         return b64encode(img_bytes).decode()

     # ------------------------------------------------------------------
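For multi-page tables, the surrounding code stitches page renders vertically before encoding. A simplified sketch of that stitching with PyMuPDF, assuming modest page counts (it ignores the scaling and page-size limits the real code has to handle):

```python
import fitz  # PyMuPDF

def stack_pages_png(doc: fitz.Document, page_numbers: list[int]) -> bytes:
    """Render the given pages and stamp them top-to-bottom onto one tall page."""
    mat = fitz.Matrix(2.0, 2.0)  # zoom kept modest here
    pixmaps = [doc.load_page(n).get_pixmap(matrix=mat) for n in page_numbers]
    width = max(p.width for p in pixmaps)
    height = sum(p.height for p in pixmaps)
    out = fitz.open()                               # new empty PDF
    page = out.new_page(width=width, height=height)
    y = 0.0
    for p in pixmaps:
        page.insert_image(fitz.Rect(0, y, p.width, y + p.height), pixmap=p)
        y += p.height
    png = page.get_pixmap().tobytes("png")
    out.close()
    return png
```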
@@ -930,14 +1356,19 @@ Ignore locations that contain data for other campaigns.

     def _validate_location_exists(self, ref: str) -> bool:
         """Verify that the referenced location actually exists in the document."""
-        # Search for the actual reference in
-[7 removed lines not preserved in this rendering]
+        # Search for the actual reference in both manuscript and SI documents
+        docs_to_check = [self.ms_doc]
+        if self.si_doc:
+            docs_to_check.append(self.si_doc)
+
+        for doc in docs_to_check:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+
+                # Look for table references like "Table 1", "Table S1", etc.
+                if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
+                    return True

         return False

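The validation above reduces to a word-boundary regex search, which is what keeps "Table 1" from matching "Table 12":

```python
import re

def location_exists(ref: str, text: str) -> bool:
    return re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE) is not None

print(location_exists("Table 1", "as shown in Table 1."))   # True
print(location_exists("Table 1", "as shown in Table 12."))  # False
```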
@@ -1010,23 +1441,55 @@ Ignore locations that contain data for other campaigns.
             LOGGER.debug("No page image found for %s - using full page text", ref)
             snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
-            # For figures, extract just the figure image
+            # For figures, extract just the figure image (same logic as compound mapping)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
                 LOGGER.debug("No figure image found for %s - using caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+            else:
+                # If figure is found, ignore text information - use image only
+                snippet = ""
         else:
             snippet = self._page_with_reference(ref) or ""

-        #
-        if
+        # For figures with images, skip text validation and proceed with image extraction
+        if image_b64 and self._FIG_RE.search(ref_lc):
+            LOGGER.info("Using figure image for %s - ignoring text context", ref)
+        elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
             return []

-
+        # Create enhanced enzyme descriptions with parent/mutation context
+        if hasattr(self, 'enzyme_df') and self.enzyme_df is not None:
+            enzyme_descriptions = []
+            for enzyme in enzyme_list:
+                # Find this enzyme in the dataframe
+                enzyme_row = None
+                if 'enzyme_id' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme_id'] == enzyme]
+                elif 'enzyme' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme'] == enzyme]
+
+                if enzyme_row is not None and len(enzyme_row) > 0:
+                    row = enzyme_row.iloc[0]
+                    parent = row.get('parent_enzyme_id', '')
+                    mutations = row.get('mutations', '')
+
+                    desc = f"- {enzyme}"
+                    if parent and str(parent).strip() and str(parent) != 'nan':
+                        desc += f" (parent: {parent})"
+                    if mutations and str(mutations).strip() and str(mutations) != 'nan':
+                        desc += f" (mutations: {mutations})"
+                    enzyme_descriptions.append(desc)
+                else:
+                    enzyme_descriptions.append(f"- {enzyme}")
+            enzyme_names = "\n".join(enzyme_descriptions)
+        else:
+            enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])

         if image_b64:
             # Use batch extraction prompt for image analysis
-
+            location_context = f"\n\nIMPORTANT: You are extracting data from {ref}, which has been identified as the PRIMARY LOCATION containing the most reliable performance data for these enzymes.\n"
+            prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
         else:
|
|
1048
1511
|
|
1049
1512
|
# Validate response has meaningful data
|
1050
1513
|
if not self._validate_response(data, enzyme_list, ref):
|
1051
|
-
|
1514
|
+
# If figure extraction failed and we have a figure, try falling back to text
|
1515
|
+
if image_b64 and self._FIG_RE.search(ref_lc):
|
1516
|
+
LOGGER.warning("Figure extraction from %s returned empty results - falling back to text", ref)
|
1517
|
+
snippet = self._extract_figure_caption(ref)
|
1518
|
+
if self._validate_context(snippet, enzyme_list, ref):
|
1519
|
+
# Retry with text extraction
|
1520
|
+
format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
|
1521
|
+
prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
|
1522
|
+
LOGGER.info("Gemini: retrying with text extraction for %d enzymes from %s…", len(enzyme_list), ref)
|
1523
|
+
|
1524
|
+
data = generate_json_with_retry(
|
1525
|
+
self.model,
|
1526
|
+
prompt,
|
1527
|
+
temperature=self.cfg.extract_temperature,
|
1528
|
+
debug_dir=self.debug_dir,
|
1529
|
+
tag=f"extract_metrics_batch_text_fallback",
|
1530
|
+
image_b64=None
|
1531
|
+
)
|
1532
|
+
|
1533
|
+
# Validate the text extraction response
|
1534
|
+
if not self._validate_response(data, enzyme_list, ref):
|
1535
|
+
return []
|
1536
|
+
else:
|
1537
|
+
return []
|
1538
|
+
else:
|
1539
|
+
return []
|
1052
1540
|
|
1053
1541
|
# Handle the response format - expecting a dict with enzyme names as keys
|
1054
1542
|
results = []
|
@@ -1100,6 +1588,15 @@ Ignore locations that contain data for other campaigns.

     def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
         """Find locations for model reaction scheme, conditions, and IUPAC names."""
+        # Create cache key based on campaign filter and enzyme variants
+        cache_key = f"{self.campaign_filter}_{hash(tuple(sorted(enzyme_variants)) if enzyme_variants else ())}"
+
+        # Check cache first
+        cached_result = self._model_reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached model reaction locations for campaign: %s", self.campaign_filter)
+            return cached_result
+
         # Collect all text including section titles, captions, and schemes
         all_text = self._collect_captions_and_titles()

@@ -1110,13 +1607,25 @@ Ignore locations that contain data for other campaigns.
         # Add enzyme context if provided
         enzyme_context = ""
         if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
             enzyme_context = f"""
 IMPORTANT CONTEXT:
 You are looking for the model reaction used specifically for these enzyme variants:
 {', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}

 These variants belong to campaign: {self.campaign_filter}
-
+{campaigns_context}
 Focus on finding the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions.
 """
|
|
1134
1643
|
if not isinstance(data, dict):
|
1135
1644
|
LOGGER.error("Expected dict from Gemini, got: %s", type(data))
|
1136
1645
|
return None
|
1646
|
+
|
1647
|
+
# Cache the result
|
1648
|
+
self._model_reaction_locations_cache.put(cache_key, data)
|
1649
|
+
LOGGER.info("Cached model reaction locations for campaign: %s", self.campaign_filter)
|
1650
|
+
|
1137
1651
|
return data
|
1138
1652
|
except Exception as e:
|
1139
1653
|
LOGGER.error("Failed to find model reaction locations: %s", e)
|
@@ -1232,9 +1746,12 @@ Different campaigns may use different model reactions.
         extraction_text: str,
         compound_ids: List[str] = None,
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Helper function to extract compound mappings from provided text."""
         prompt = PROMPT_COMPOUND_MAPPING
+        if campaign_filter:
+            prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
         if compound_ids:
             prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
         prompt += "\n\nTEXT:\n" + extraction_text
@@ -1282,6 +1799,7 @@ Different campaigns may use different model reactions.
         compound_ids: List[str],
         figure_images: Dict[str, str],
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound mappings using multimodal approach with figures."""
         # Enhanced prompt for figure-based extraction
@@ -1293,8 +1811,26 @@ Use your best knowledge, Look carefully in:
 1. The chemical structures shown in figures - infer IUPAC names from drawn structures
 2. Figure captions that may define compounds
 3. Text that refers to these compound numbers
-4. Reaction schemes showing transformations
+4. Reaction schemes showing transformations"""
+
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            prompt += f"""
+
+IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system.
+{campaigns_warning}
+Different campaigns may use different numbering systems for compounds.
+Do NOT include compound information from other campaigns."""
 
+        prompt += """
 
 IMPORTANT:
 - Only provide IUPAC names you can determine from the figures or text
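The added lines splice an optional campaign warning into the compound-mapping prompt before the general instructions resume. A sketch of the same assembly pattern, with hypothetical helper and argument names:

```python
from typing import List, Optional

def build_compound_prompt(base: str,
                          campaign_filter: Optional[str] = None,
                          all_campaigns: Optional[List[str]] = None) -> str:
    prompt = base
    if campaign_filter:
        warning = ""
        if all_campaigns:
            bullets = "\n".join(f"- {c}" for c in all_campaigns)
            warning = f"\nALL CAMPAIGNS IN THIS PAPER:\n{bullets}\n"
        prompt += (
            f"\n\nIMPORTANT CAMPAIGN CONTEXT: focus on the {campaign_filter} campaign."
            + warning
            + "\nDo NOT include compound information from other campaigns."
        )
    return prompt

print(build_compound_prompt("Map compound IDs to IUPAC names.",
                            "campaign_A", ["campaign_A", "campaign_B"]))
```

The diff builds its bullet list with `{chr(10).join(...)}` because f-string expressions could not contain backslashes before Python 3.12; joining outside the f-string, as here, avoids that restriction entirely.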
@@ -1324,9 +1860,10 @@ TEXT FROM MANUSCRIPT:
             for fig_ref, fig_base64 in figure_images.items():
                 try:
                     img_bytes = b64decode(fig_base64)
-                    image
+                    # Format image for Gemini API
+                    image_part = {"mime_type": "image/png", "data": img_bytes}
                     content_parts.append(f"\n[Figure: {fig_ref}]")
-                    content_parts.append(
+                    content_parts.append(image_part)
                     LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
                 except Exception as e:
                     LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)
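The fix above replaces a PIL-based image object with the inline-data dict that google-generativeai accepts directly in a content list. A sketch of the interleaved text-and-image parts, assuming PNG figures arrive base64-encoded:

```python
from base64 import b64decode

def build_content_parts(prompt: str, figure_images: dict) -> list:
    """Interleave the prompt, figure labels, and inline image parts."""
    parts = [prompt]
    for fig_ref, fig_base64 in figure_images.items():
        try:
            img_bytes = b64decode(fig_base64)
        except Exception:
            continue  # skip figures that fail to decode
        parts.append(f"\n[Figure: {fig_ref}]")  # label so the model can cite it
        parts.append({"mime_type": "image/png", "data": img_bytes})
    return parts

parts = build_content_parts("Identify compound 1a.", {"Figure 1": "iVBORw0KGgo="})
```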
@@ -1356,6 +1893,21 @@ TEXT FROM MANUSCRIPT:
 
             # Make multimodal API call
             response = self.model.generate_content(content_parts)
+
+            # Track token usage if available
+            try:
+                if hasattr(response, 'usage_metadata'):
+                    input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
             raw_text = response.text.strip()
 
             # Log response
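The token-tracking block is deliberately double-wrapped so that neither a missing `usage_metadata` attribute nor an absent `wrapper` module can abort an extraction. Roughly the same best-effort pattern as a standalone helper (the module path is assumed from the `from .wrapper import` line in the diff):

```python
def record_usage(response, component: str = "reaction_info_extractor") -> None:
    """Best-effort token accounting; must never raise into the caller."""
    try:
        meta = getattr(response, "usage_metadata", None)
        if meta is None:
            return
        input_tokens = getattr(meta, "prompt_token_count", 0)
        output_tokens = getattr(meta, "candidates_token_count", 0)
        if input_tokens or output_tokens:
            try:
                from debase.wrapper import add_token_usage  # optional dependency
            except ImportError:
                return  # wrapper not installed; skip accounting
            add_token_usage(component, input_tokens, output_tokens)
    except Exception:
        pass  # accounting is strictly best-effort
```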
@@ -1402,18 +1954,37 @@ TEXT FROM MANUSCRIPT:
         self,
         compound_ids: List[str],
         initial_sections: List[str] = None,
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
-        """Extract compound ID to IUPAC name mappings using
+        """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
-        1. First attempts extraction from
-        2.
-        3. Uses multimodal figure analysis as final fallback
+        1. First attempts extraction from specific SI sections + 10 manuscript pages
+        2. If compounds missing, uses full manuscript + SI with multimodal figure analysis
         """
         if not compound_ids:
             return {}
 
-
-
+        # Check cache first - return cached results for compounds we've already processed
+        cached_mappings = {}
+        uncached_compound_ids = []
+
+        for cid in compound_ids:
+            # Include campaign filter in cache key to prevent cross-campaign contamination
+            cache_key = f"{campaign_filter}_{cid.lower().strip()}" if campaign_filter else cid.lower().strip()
+            cached_mapping = self._compound_mapping_cache.get(cache_key)
+            if cached_mapping is not None:
+                cached_mappings[cid.lower().strip()] = cached_mapping
+                LOGGER.info("Using cached compound mapping for: %s (campaign: %s)", cid, campaign_filter)
+            else:
+                uncached_compound_ids.append(cid)
+
+        # If all compounds are cached, return immediately
+        if not uncached_compound_ids:
+            LOGGER.info("All %d compounds found in cache, skipping API calls", len(compound_ids))
+            return cached_mappings
+
+        LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
+                    len(uncached_compound_ids), sorted(uncached_compound_ids))
 
         # Tier 1: Standard sections (manuscript + initial SI sections)
         initial_sections = initial_sections or [
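The cache check added here normalises each compound ID and scopes the key by campaign before deciding which IDs still need an API call. A self-contained sketch of that partitioning step:

```python
from typing import Dict, List, Optional, Tuple

def partition_by_cache(cache: Dict[str, object],
                       compound_ids: List[str],
                       campaign_filter: Optional[str] = None) -> Tuple[Dict[str, object], List[str]]:
    """Split IDs into (already-cached mappings, IDs still needing extraction)."""
    cached: Dict[str, object] = {}
    uncached: List[str] = []
    for cid in compound_ids:
        norm = cid.lower().strip()
        key = f"{campaign_filter}_{norm}" if campaign_filter else norm
        hit = cache.get(key)
        if hit is not None:
            cached[norm] = hit
        else:
            uncached.append(cid)
    return cached, uncached

# Illustrative values only:
cache = {"campaign_A_1a": "ethyl 2-oxopropanoate"}
cached, uncached = partition_by_cache(cache, ["1a", "2b"], "campaign_A")
assert list(cached) == ["1a"] and uncached == ["2b"]
```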
@@ -1424,118 +1995,118 @@ TEXT FROM MANUSCRIPT:
         # Include manuscript pages (first 10) for model reaction context
         manuscript_text = "\n\n".join(self.ms_pages[:10])
 
+        # Add campaign context if provided
+        campaign_context = ""
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            campaign_context = f"""
+
+IMPORTANT CAMPAIGN CONTEXT:
+You are extracting compound information specifically for the {campaign_filter} campaign.
+{campaigns_warning}
+Focus ONLY on compound information relevant to the {campaign_filter} campaign/reaction system.
+Do NOT include compound information from other campaigns.
+
+"""
+
         # Extract from initial sections
         extraction_text = self._extract_sections_by_title(initial_sections)
         if extraction_text:
-            extraction_text = manuscript_text + "\n\n" + extraction_text
+            extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
-            extraction_text = manuscript_text
+            extraction_text = manuscript_text + campaign_context
 
-        # First extraction attempt
+        # First extraction attempt - only for uncached compounds
         mappings = self._extract_compound_mappings_from_text(
-            extraction_text[:50000],
+            extraction_text[:50000], uncached_compound_ids, tag_suffix="initial", campaign_filter=campaign_filter
         )
         LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))
 
         # Check for missing compounds
         missing_compounds = []
-        for cid in
+        for cid in uncached_compound_ids:
             mapping = mappings.get(cid.lower().strip())
             if not mapping or not mapping.iupac_name:
                 missing_compounds.append(cid)
 
-        # Tier 2
+        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
                         len(missing_compounds), sorted(missing_compounds))
 
-            #
-            additional_sections = [
-                "Engineering strategy", "Evolution campaign",
-                "Screening", "Optimization", "Substrate synthesis",
-                "Supporting Information", "Supplementary Methods"
-            ]
-
-            # Extract from additional sections
-            additional_text = self._extract_sections_by_title(additional_sections)
-
-            # Also extract any figures that might contain compound structures
+            # Get all available figures for compound structure analysis
             figure_images = {}
-
+
+            # Extract main manuscript figures
+            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
             for ref in figure_refs:
                 img_b64 = self._extract_page_png(ref, extract_figure_only=True)
                 if img_b64:
                     figure_images[ref] = img_b64
-                    LOGGER.info("
-
-            # Try multimodal approach with figures and expanded text
-            if figure_images or additional_text:
-                combined_text = additional_text[:30000] if additional_text else ""
-                expanded_mappings = self._extract_compound_mappings_with_figures(
-                    combined_text, missing_compounds, figure_images, tag_suffix="tier2"
-                )
-
-                # Merge new mappings
-                new_found = 0
-                for key, mapping in expanded_mappings.items():
-                    if key not in mappings or not mappings[key].iupac_name:
-                        if mapping.iupac_name:
-                            mappings[key] = mapping
-                            new_found += 1
-                            LOGGER.info("Found IUPAC name for '%s': %s",
-                                        key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
-
-                LOGGER.info("Tier 2: Found %d additional compound mappings", new_found)
-
-            # Check again for still missing compounds
-            still_missing = []
-            for cid in missing_compounds:
-                mapping = mappings.get(cid.lower().strip())
-                if not mapping or not mapping.iupac_name:
-                    still_missing.append(cid)
-
-            # Tier 3: Full manuscript search with all available figures
-            if still_missing:
-                LOGGER.info("Tier 3: %d compounds still missing, trying full manuscript search",
-                            len(still_missing))
-
-                # Get all SI figures
-                si_figure_refs = []
-                for page in self.si_pages[:5]:  # Check first 5 SI pages
-                    matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
-                    si_figure_refs.extend(matches[:5])  # Limit to 5 figures
+                    LOGGER.info("Retrieved %s for compound mapping", ref)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Get SI figures
+            si_figure_refs = []
+            for page in self.si_pages[:10]:  # Check first 10 SI pages
+                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
+                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
+
+            # Extract SI figures
+            for ref in set(si_figure_refs):
+                if ref not in figure_images:
+                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
+                    if img_b64:
+                        figure_images[ref] = img_b64
+                        LOGGER.info("Extracted %s for compound mapping", ref)
+
+            # Full text search including all pages
+            full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+
+            final_mappings = self._extract_compound_mappings_with_figures(
+                full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            )
+
+            # Merge final mappings with better compound ID matching
+            final_found = 0
+            for key, mapping in final_mappings.items():
+                if key not in mappings or not mappings[key].iupac_name:
+                    if mapping.iupac_name:
+                        mappings[key] = mapping
+                        final_found += 1
+                        iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                        LOGGER.info("Found IUPAC name for '%s' in full search: %s", key, iupac_display)
+
+            LOGGER.info("Tier 2: Found %d additional compound mappings", final_found)
+
+        # Cache all newly found mappings using campaign-aware cache key
+        for key, mapping in mappings.items():
+            cache_key = f"{campaign_filter}_{key}" if campaign_filter else key
+            if self._compound_mapping_cache.get(cache_key) is None:
+                self._compound_mapping_cache.put(cache_key, mapping)
+                iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                LOGGER.info("Cached compound mapping for: %s -> %s (campaign: %s)", key, iupac_display, campaign_filter)
 
-
+            # Also cache without campaign prefix for backward compatibility during integration
+            if campaign_filter:
+                self._compound_mapping_cache.put(key, mapping)
+
+        # Combine cached and new mappings
+        final_mappings = cached_mappings.copy()
+        final_mappings.update(mappings)
 
-        LOGGER.info("Adaptive compound mapping complete: %d total mappings",
-
+        LOGGER.info("Adaptive compound mapping complete: %d total mappings (%d cached, %d new)",
+                    len(final_mappings), len(cached_mappings), len(mappings))
+        return final_mappings
 
-    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
+    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None, lineage_compound_ids: Optional[Dict[str, List[str]]] = None) -> Dict[str, Any]:
         """Extract model reaction information using identified locations and 3-tier compound mapping."""
         # First find the best locations
         locations = self.find_model_reaction_locations(enzyme_variants)
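The hunk collapses the old three-tier cascade into two tiers: targeted sections first, then a single full-text-plus-figures pass for whatever is still missing. A control-flow sketch with the two extraction calls abstracted into callables (all names hypothetical):

```python
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional

@dataclass
class Mapping:
    iupac_name: Optional[str] = None

def map_compounds(ids: List[str],
                  tier1: Callable[[List[str]], Dict[str, Mapping]],
                  tier2: Callable[[List[str]], Dict[str, Mapping]]) -> Dict[str, Mapping]:
    mappings = tier1(ids)  # Tier 1: targeted sections
    missing = [i for i in ids
               if not (mappings.get(i.lower().strip())
                       and mappings[i.lower().strip()].iupac_name)]
    if missing:  # Tier 2: one full-text + figures pass
        for key, m in tier2(missing).items():
            if m.iupac_name and not (mappings.get(key) and mappings[key].iupac_name):
                mappings[key] = m
    return mappings

demo = map_compounds(["1a"], lambda ids: {}, lambda ids: {"1a": Mapping("propan-2-ol")})
assert demo["1a"].iupac_name == "propan-2-ol"
```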
@@ -1658,9 +2229,15 @@ TEXT FROM MANUSCRIPT:
                     figure_images[fig_ref] = img_b64
                     LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)
 
-        # Extract compound IDs from locations
+        # Extract compound IDs from locations or use lineage-specific ones
         compound_ids = []
-        if
+        if lineage_compound_ids:
+            # Use lineage-specific compound IDs if provided
+            substrate_ids = lineage_compound_ids.get("substrate_ids", [])
+            product_ids = lineage_compound_ids.get("product_ids", [])
+            compound_ids = substrate_ids + product_ids
+            LOGGER.info("Using lineage-specific compound IDs: %s", compound_ids)
+        elif locations and locations.get("model_reaction_location", {}).get("compound_ids"):
             compound_ids = locations["model_reaction_location"]["compound_ids"]
             LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)
 
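The new `lineage_compound_ids` parameter is a plain dict of ID lists; when present it short-circuits the location-based compound discovery. Its expected shape, with illustrative IDs:

```python
lineage_ids = {
    "substrate_ids": ["1a", "2b"],  # illustrative compound IDs
    "product_ids": ["3a"],
}
compound_ids = lineage_ids.get("substrate_ids", []) + lineage_ids.get("product_ids", [])
```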
@@ -1668,7 +2245,7 @@ TEXT FROM MANUSCRIPT:
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)
+            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
@@ -1679,8 +2256,35 @@ TEXT FROM MANUSCRIPT:
                 mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
             text_context += mapping_text
 
+        # Add campaign context if available
+        campaign_context = ""
+        if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
+            campaign_context = f"""
+IMPORTANT CONTEXT:
+You are extracting the model reaction used specifically for these enzyme variants:
+{', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
+
+These variants belong to campaign: {self.campaign_filter}
+{campaigns_context}
+Focus on extracting the model reaction that was used to evaluate THESE specific variants.
+Different campaigns may use different model reactions and substrates.
+
+"""
+
         # Include both manuscript and SI text for better coverage
-        prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
+        prompt = campaign_context + PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
 
         try:
             # Use multimodal extraction if we have figure images
@@ -1693,9 +2297,10 @@ TEXT FROM MANUSCRIPT:
                 for fig_ref, fig_base64 in figure_images.items():
                     try:
                         img_bytes = b64decode(fig_base64)
-                        image
+                        # Format image for Gemini API
+                        image_part = {"mime_type": "image/png", "data": img_bytes}
                         content_parts.append(f"\n[Figure: {fig_ref}]")
-                        content_parts.append(
+                        content_parts.append(image_part)
                     except Exception as e:
                         LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)
 
@@ -1714,6 +2319,20 @@ TEXT FROM MANUSCRIPT:
 
             response = model.generate_content(content_parts)
 
+            # Track token usage if available
+            try:
+                if hasattr(response, 'usage_metadata'):
+                    input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
             # Parse JSON from response
             if response and response.text:
                 # Save debug output
@@ -1754,32 +2373,59 @@ TEXT FROM MANUSCRIPT:
                 if isinstance(data, dict):
                     # If we have compound mappings, enhance the IUPAC names
                     if compound_ids and compound_mappings:
+                        LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
+                                    list(compound_mappings.keys()))
+
                         # Try to map substrate/product lists through compound IDs
-                        substrate_list = data.get("substrate_iupac_list", [])
+                        substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                         if isinstance(substrate_list, list):
                             enhanced_substrates = []
                             for item in substrate_list:
+                                item_str = str(item).lower().strip()
                                 # Check if it's a compound ID that we can map
-                                mapping = compound_mappings.get(
+                                mapping = compound_mappings.get(item_str)
                                 if mapping and mapping.iupac_name:
                                     enhanced_substrates.append(mapping.iupac_name)
+                                    LOGGER.info("Mapped substrate '%s' -> '%s'", item, mapping.iupac_name)
                                 elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                    # Keep valid IUPAC names
+                                    # Keep valid IUPAC names that aren't compound IDs
                                     enhanced_substrates.append(str(item))
+                                    LOGGER.info("Kept substrate IUPAC name: '%s'", item)
+                                else:
+                                    LOGGER.warning("Could not map substrate compound ID '%s'", item)
                             data["substrate_iupac_list"] = enhanced_substrates
 
-                        product_list = data.get("product_iupac_list", [])
+                        product_list = data.get("product_iupac_list", []) or data.get("product_list", [])
                         if isinstance(product_list, list):
                             enhanced_products = []
                             for item in product_list:
+                                item_str = str(item).lower().strip()
                                 # Check if it's a compound ID that we can map
-                                mapping = compound_mappings.get(
+                                mapping = compound_mappings.get(item_str)
                                 if mapping and mapping.iupac_name:
                                     enhanced_products.append(mapping.iupac_name)
+                                    LOGGER.info("Mapped product '%s' -> '%s'", item, mapping.iupac_name)
                                 elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                    # Keep valid IUPAC names
+                                    # Keep valid IUPAC names that aren't compound IDs
                                     enhanced_products.append(str(item))
+                                    LOGGER.info("Kept product IUPAC name: '%s'", item)
+                                else:
+                                    LOGGER.warning("Could not map product compound ID '%s'", item)
                             data["product_iupac_list"] = enhanced_products
+
+                        # Also try to enhance using both substrate_list and product_list if they contain compound IDs
+                        for list_key, target_key in [("substrate_list", "substrate_iupac_list"), ("product_list", "product_iupac_list")]:
+                            if list_key in data and isinstance(data[list_key], list):
+                                if target_key not in data or not data[target_key]:
+                                    enhanced_list = []
+                                    for item in data[list_key]:
+                                        item_str = str(item).lower().strip()
+                                        mapping = compound_mappings.get(item_str)
+                                        if mapping and mapping.iupac_name:
+                                            enhanced_list.append(mapping.iupac_name)
+                                            LOGGER.info("Enhanced %s: mapped '%s' -> '%s'", target_key, item, mapping.iupac_name)
+                                    if enhanced_list:
+                                        data[target_key] = enhanced_list
 
                     # Validate and convert arrays to semicolon-separated strings for CSV compatibility
                     if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
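The enhancement loops apply one rule three times: resolve known compound IDs to IUPAC names, keep strings that already look like names, and drop bare IDs such as `1a` or `S3` that cannot be resolved. The rule in isolation, with mapping objects and names purely illustrative:

```python
import re
from types import SimpleNamespace

COMPOUND_ID_RE = re.compile(r'^[0-9]+[a-z]?$|^S\d+$')

def enhance(items, compound_mappings):
    out = []
    for item in items:
        mapping = compound_mappings.get(str(item).lower().strip())
        if mapping and mapping.iupac_name:
            out.append(mapping.iupac_name)        # resolved compound ID
        elif item and not COMPOUND_ID_RE.match(str(item)):
            out.append(str(item))                 # already an IUPAC-like name
        # bare unresolvable IDs ("1a", "S3") are dropped; the diff logs a warning
    return out

mappings = {"1a": SimpleNamespace(iupac_name="(R)-1-phenylethan-1-ol")}
print(enhance(["1a", "S3", "benzaldehyde"], mappings))
# -> ['(R)-1-phenylethan-1-ol', 'benzaldehyde']
```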
@@ -1826,8 +2472,8 @@ TEXT FROM MANUSCRIPT:
         return data
 
     def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
-        """Process a single lineage case -
-        #
+        """Process a single lineage case - use confidence-based processing."""
+        # Create lineage analysis for single location
         lineage_analysis = {
             'has_multiple_lineages': False,
             'lineage_groups': [{
@@ -1839,7 +2485,7 @@ TEXT FROM MANUSCRIPT:
             }]
         }
 
-        return self.
+        return self._process_multiple_lineages_by_confidence([location], enzyme_df, lineage_analysis)
 
     def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
                                                  enzyme_df: pd.DataFrame,
@@ -1854,186 +2500,69 @@ TEXT FROM MANUSCRIPT:
         # If enzyme_df has campaign_id column, we can use it to filter
         has_campaign_info = 'campaign_id' in enzyme_df.columns
 
-        #
-
-
-
-
-
-
-
+        # Select the most confident source only
+        best_location = None
+        if locations:
+            # Sort by confidence only
+            locations_sorted = sorted(locations, key=lambda x: -x.get('confidence', 0))
+            best_location = locations_sorted[0]
+
+            LOGGER.info("Selected primary location: %s (type: %s, confidence: %d%%)",
+                        best_location['location'],
+                        best_location.get('type', 'unknown'),
+                        best_location.get('confidence', 0))
 
-            # Extract metrics from
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids,
+            # Extract metrics from the most confident source only
+            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
 
             # Filter to valid metrics
             valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
 
             if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s",
-
+                LOGGER.warning("No valid metrics found in primary location %s", best_location['location'])
+                return pd.DataFrame()
 
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics),
+            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), best_location['location'])
 
-            # Create DataFrame for
+            # Create DataFrame for the single best location
             df_location = pd.DataFrame(valid_metrics)
 
-            #
-
-
-            variants_with_data.update(new_variants)
+            # Add metadata about the location
+            df_location['data_location'] = best_location['location']
+            df_location['confidence'] = best_location.get('confidence', 0)
 
-
-            # by checking which variants are present
-            location_variants = set(df_location['enzyme'].tolist())
+            LOGGER.info("Successfully extracted data for %d enzymes from primary location", len(df_location))
 
-            #
-            if
-
-
-
-
-            else:
-                variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
-                if len(variant_campaigns) == 1:
-                    campaign_id = variant_campaigns[0]
-                    LOGGER.info("Location %s contains variants from campaign: %s",
-                                location['location'], campaign_id)
-                elif len(variant_campaigns) > 1:
-                    LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
-                                   location['location'], variant_campaigns)
-
-            # Extract model reaction specific to this location/campaign
-            location_context = f"Location: {location['location']}"
-            if location.get('caption'):
-                location_context += f"\nCaption: {location['caption']}"
-
-            # First find model reaction locations for this campaign/enzyme group
+            # Extract model reaction info once for this location
+            location_context = f"Location: {best_location['location']}"
+            if best_location.get('caption'):
+                location_context += f"\nCaption: {best_location['caption']}"
+
+            # Get enzyme list for model reaction
             location_enzymes = df_location['enzyme'].unique().tolist()
+            # Get model reaction locations for this campaign
             model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
 
-            #
-
-                location['location'],
-                location_context,
-                model_reaction_locations
-            )
-
-            # Get full model reaction info with IUPAC names
-            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-                model_info = self._extract_lineage_model_info(location_model_reaction)
-            else:
-                # Fall back to general model reaction extraction
-                # Pass the enzyme variants from this location
-                model_info = self.gather_model_reaction_info(location_enzymes)
-
-            # Add model reaction info to all enzymes from this location
-            for key, value in model_info.items():
-                if isinstance(value, list):
-                    value = "; ".join(str(v) for v in value) if value else None
-                df_location[key] = value
-
-            # Add location and campaign info
-            df_location['data_location'] = location['location']
-            df_location['location_type'] = location.get('type', 'unknown')
-            df_location['location_confidence'] = location.get('confidence', 0)
-            # Remove lineage_group column - not working properly
-            # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')
-
-            all_results.append(df_location)
+            # Extract model reaction for this location - use unified approach
+            LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
 
-            #
-            LOGGER.info("Progress: %d/%d variants have data",
-                        len(variants_with_data), len(all_variants))
-
-        if all_results:
-            # Combine all results
-            df_combined = pd.concat(all_results, ignore_index=True)
-
-            # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
-            if df_combined.duplicated(subset=['enzyme']).any():
-                LOGGER.info("Removing duplicates, keeping highest confidence data")
-                df_combined = df_combined.sort_values(
-                    ['enzyme', 'location_confidence'],
-                    ascending=[True, False]
-                ).drop_duplicates(subset=['enzyme'], keep='first')
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d unique variants from %d locations",
-                        len(df_combined), len(all_results))
-
-            if 'data_location' in df_combined.columns:
-                for location in df_combined['data_location'].unique():
-                    location_enzymes = df_combined[df_combined['data_location'] == location]
-                    LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))
-
-            return df_combined
-        else:
-            LOGGER.warning("No metrics extracted from any location")
-            return pd.DataFrame()
-
-    def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
-                                   enzyme_df: pd.DataFrame,
-                                   lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
-        """Process multiple lineages where each location represents a different model reaction."""
-        all_metrics = []
-        lineage_groups = lineage_analysis.get('lineage_groups', [])
-
-        # Get all enzyme IDs for extraction attempts
-        all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []
-
-        for group in lineage_groups:
-            group_location = group.get('data_location')
-            group_id = group.get('group_id')
-
-            # Find the location info
-            location_info = next((loc for loc in locations if loc['location'] == group_location), None)
-            if not location_info:
-                LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
-                continue
-
-            LOGGER.info("Processing location %s (%s)", group_location, group_id)
-
-            # Extract metrics from this location for ALL enzymes
-            # The extractor will return only those that actually have data
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)
-
-            # Filter to enzymes that actually had data in this location
-            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
-
-            if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s", group_location)
-                continue
-
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)
-
-            # Create DataFrame for this location
-            df_location = pd.DataFrame(valid_metrics)
-
-            # Extract model reaction specific to this location
-            # Different locations = different model reactions
-            location_context = f"Location: {group_location}"
-            if group.get('caption'):
-                location_context += f"\nCaption: {group['caption']}"
-
-            # First find model reaction locations for this enzyme group
-            location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
-            model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
-
-            # Try to find model reaction for this specific lineage, passing the locations
+            # Try lineage-specific extraction first
             location_model_reaction = self.find_lineage_model_reaction(
-
+                best_location['location'],
                 location_context,
                 model_reaction_locations
             )
 
-            #
+            # Check if lineage extraction was successful
             if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-
+                LOGGER.info("Using lineage-specific model reaction data")
+                model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
             else:
-
-                #
+                LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
+                # Use the comprehensive multimodal approach as fallback
                 model_info = self.gather_model_reaction_info(location_enzymes)
+
+            LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
 
             # Add model reaction info to all enzymes from this location
             for key, value in model_info.items():
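Instead of iterating every candidate location, the rewritten method extracts from the single highest-confidence source. The selection itself is one sort, shown here with illustrative locations:

```python
locations = [
    {"location": "Table S2", "type": "table", "confidence": 70},
    {"location": "Figure 2", "type": "figure", "confidence": 95},
]
best_location = sorted(locations, key=lambda x: -x.get("confidence", 0))[0]
assert best_location["location"] == "Figure 2"
```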
@@ -2041,30 +2570,18 @@ TEXT FROM MANUSCRIPT:
                     value = "; ".join(str(v) for v in value) if value else None
                 df_location[key] = value
 
-            # Add location
-            df_location['
-
-            # df_location['lineage_group'] = group.get('lineage_hint', group_id)
-
-            all_metrics.append(df_location)
-
-        if all_metrics:
-            # Combine all metrics
-            df_combined = pd.concat(all_metrics, ignore_index=True)
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d total enzymes from %d locations",
-                        len(df_combined), len(all_metrics))
+            # Add additional location metadata (data_location already set above)
+            df_location['location_type'] = best_location.get('type', 'unknown')
+            df_location['location_confidence'] = best_location.get('confidence', 0)
 
-
-
-            location_enzymes = df_combined[df_combined['data_location'] == location]
-            LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))
+            LOGGER.info("Extraction complete: %d variants from primary location %s",
+                        len(df_location), best_location['location'])
 
-        return
-
-
-
+            return df_location
+
+        # No locations found
+        LOGGER.warning("No valid locations found for extraction")
+        return pd.DataFrame()
 
     def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
         """Check if a metrics row contains any valid performance data."""
@@ -2151,14 +2668,18 @@ TEXT FROM MANUSCRIPT:
 
         return filtered
 
-    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
+    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any], enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
         """Extract full model reaction info including IUPAC names for a lineage."""
         # Get substrate/product IDs from lineage-specific extraction
         substrate_ids = lineage_reaction.get('substrate_ids', [])
         product_ids = lineage_reaction.get('product_ids', [])
 
-        # Get general model reaction info for conditions
-
+        # Get general model reaction info for conditions, using lineage-specific compound IDs
+        lineage_ids = {
+            "substrate_ids": substrate_ids,
+            "product_ids": product_ids
+        }
+        general_info = self.gather_model_reaction_info(enzyme_variants, lineage_compound_ids=lineage_ids)
 
         # Override substrate/product lists with lineage-specific ones only if they contain actual compound IDs
         model_info = general_info.copy()
@@ -2304,6 +2825,9 @@ TEXT FROM MANUSCRIPT:
             LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
             return pd.DataFrame()
 
+        # Store enzyme_df for use in extract_metrics_batch
+        self.enzyme_df = enzyme_df
+
         # Check if we have campaign_id column - if so, process each campaign separately
         if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
             campaigns = enzyme_df['campaign_id'].unique()
@@ -2322,15 +2846,16 @@ TEXT FROM MANUSCRIPT:
                     si=self.si,
                     cfg=self.cfg,
                     debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
-                    campaign_filter=campaign_id
+                    campaign_filter=campaign_id,
+                    all_campaigns=campaigns.tolist()
                 )
 
                 # Run extraction for this campaign
                 campaign_df = campaign_extractor.run(enzyme_df)
 
                 if not campaign_df.empty:
-                    # Add campaign identifier
-                    campaign_df['
+                    # Add a temporary campaign identifier for merging
+                    campaign_df['_extraction_campaign'] = campaign_id
                     all_campaign_results.append(campaign_df)
                     LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)
 
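The `_extraction_campaign` column is a deliberately underscore-prefixed temporary marker: it tags each campaign's metrics so the later merge can pair rows by enzyme and campaign, after which it is dropped. For example:

```python
import pandas as pd

campaign_df = pd.DataFrame({"enzyme": ["v1", "v2"], "yield": [81.0, 64.0]})
# Marker consumed (and removed) later by merge_with_lineage_data
campaign_df["_extraction_campaign"] = "campaign_A"
```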
@@ -2401,7 +2926,7 @@ TEXT FROM MANUSCRIPT:
 def merge_with_lineage_data(
     df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
 ) -> pd.DataFrame:
-    """
+    """Merge lineage and metrics data ensuring one-to-one mapping per campaign."""
 
     # Handle both 'enzyme' and 'enzyme_id' column names
     if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
@@ -2410,7 +2935,19 @@ def merge_with_lineage_data(
     if "enzyme" not in df_lineage.columns:
         raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")
 
-
+    # Check if we have campaign information to match on
+    if "campaign_id" in df_lineage.columns and "_extraction_campaign" in df_metrics.columns:
+        # Match on both enzyme and campaign to ensure correct pairing
+        df_metrics_temp = df_metrics.copy()
+        df_metrics_temp['campaign_id'] = df_metrics_temp['_extraction_campaign']
+        df_metrics_temp = df_metrics_temp.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
+    else:
+        # Simple merge on enzyme only
+        if "_extraction_campaign" in df_metrics.columns:
+            df_metrics = df_metrics.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
+
     return merged
 
 ###############################################################################
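A worked example of the campaign-aware branch, showing why matching on the pair (enzyme, campaign_id) matters when the same variant name appears in two campaigns:

```python
import pandas as pd

df_lineage = pd.DataFrame({"enzyme": ["v1", "v1"],
                           "campaign_id": ["campaign_A", "campaign_B"]})
df_metrics = pd.DataFrame({"enzyme": ["v1"], "yield": [81.0],
                           "_extraction_campaign": ["campaign_A"]})

df_metrics_temp = df_metrics.copy()
df_metrics_temp["campaign_id"] = df_metrics_temp["_extraction_campaign"]
df_metrics_temp = df_metrics_temp.drop("_extraction_campaign", axis=1)
merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
print(merged)
#   enzyme campaign_id  yield
# 0     v1  campaign_A   81.0
# 1     v1  campaign_B    NaN   <- no cross-campaign contamination
```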
@@ -2439,25 +2976,103 @@ def main() -> None:
     if args.verbose:
         LOGGER.setLevel(logging.DEBUG)
     cfg = Config()
-    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)
 
-    # Load enzyme data from CSV if provided
+    # Load enzyme data from CSV if provided to detect campaign information
     enzyme_df = None
+    campaign_filter = None
+    all_campaigns = None
+
     if args.lineage_csv and args.lineage_csv.exists():
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
-
-
-
+
+        # Detect campaign information from the enzyme CSV
+        if 'campaign_id' in enzyme_df.columns:
+            all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
+            if len(all_campaigns) == 1:
+                campaign_filter = all_campaigns[0]
+                LOGGER.info("Detected single campaign: %s", campaign_filter)
+
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                              campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+                df_metrics = extractor.run(enzyme_df)
+
+            elif len(all_campaigns) > 1:
+                LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
+                all_results = []
+
+                # Process each campaign separately
+                for campaign in all_campaigns:
+                    LOGGER.info("Processing campaign: %s", campaign)
+
+                    # Filter enzyme_df to this campaign
+                    campaign_df = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                    LOGGER.info("Found %d enzymes for campaign %s", len(campaign_df), campaign)
+
+                    if len(campaign_df) == 0:
+                        LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
+                        continue
+
+                    # Create extractor for this campaign
+                    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                                  campaign_filter=campaign, all_campaigns=all_campaigns)
+
+                    # Run extraction for this campaign
+                    campaign_metrics = extractor.run(campaign_df)
+
+                    if not campaign_metrics.empty:
+                        # Merge with lineage data for this campaign
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if "enzyme_id" in campaign_lineage.columns and "enzyme" not in campaign_lineage.columns:
+                            campaign_lineage = campaign_lineage.rename(columns={"enzyme_id": "enzyme"})
+
+                        # Merge campaign metrics with lineage data
+                        campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+
+                        # Save campaign-specific file immediately
+                        output_dir = args.output.parent
+                        base_name = args.output.stem
+                        campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                        campaign_final.to_csv(campaign_file, index=False)
+                        LOGGER.info("Saved %d rows for campaign %s -> %s", len(campaign_final), campaign, campaign_file)
+
+                        # Add the merged data (not just metrics) to final results
+                        all_results.append(campaign_final)
+                        LOGGER.info("Added %d merged results for campaign %s", len(campaign_final), campaign)
+                    else:
+                        LOGGER.warning("No results extracted for campaign %s", campaign)
+
+                        # Still save an empty campaign file with lineage data
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if not campaign_lineage.empty:
+                            output_dir = args.output.parent
+                            base_name = args.output.stem
+                            campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                            campaign_lineage.to_csv(campaign_file, index=False)
+                            LOGGER.info("Saved %d rows (lineage only) for campaign %s -> %s", len(campaign_lineage), campaign, campaign_file)
+
+                # Combine all campaign results
+                if all_results:
+                    df_metrics = pd.concat(all_results, ignore_index=True)
+                    LOGGER.info("Combined results from %d campaigns: %d total rows", len(all_results), len(df_metrics))
+                else:
+                    LOGGER.warning("No results from any campaign")
+                    df_metrics = pd.DataFrame()
+        else:
+            # No campaign information, process all enzymes together
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                          campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+            df_metrics = extractor.run(enzyme_df)
 
-
-
-
-    else
-        df_final = df_metrics
+    # Skip final merge since campaign-specific merges already happened during processing
+    # This avoids duplicate entries when same enzyme appears in multiple campaigns
+    df_final = df_metrics
+    LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
 
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
+
+    # Campaign-specific files are already saved during processing above
 
 if __name__ == "__main__":
     main()