PyPI - debase - Versions diffs - 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl - Mend

debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

debase/_version.py +1 -1
debase/campaign_utils.py +146 -0
debase/caption_pattern.py +44 -0
debase/cleanup_sequence.py +34 -6
debase/enzyme_lineage_extractor.py +481 -106
debase/lineage_format.py +44 -1
debase/reaction_info_extractor.py +479 -135
debase/substrate_scope_extractor.py +207 -80
debase/wrapper.py +3 -3
{debase-0.6.0.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
debase-0.6.2.dist-info/RECORD +18 -0
debase-0.6.0.dist-info/RECORD +0 -16
{debase-0.6.0.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
{debase-0.6.0.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
{debase-0.6.0.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
{debase-0.6.0.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0

debase/reaction_info_extractor.py CHANGED Viewed

@@ -37,7 +37,16 @@ from dataclasses import dataclass, field
 from functools import lru_cache
 from pathlib import Path
 from textwrap import dedent
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
+# Import universal caption pattern
+try:
+    from .caption_pattern import get_universal_caption_pattern
+    from .campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
+except ImportError:
+    # Fallback if running as standalone script
+    from caption_pattern import get_universal_caption_pattern
+    from campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
 import fitz  # PyMuPDF - for image extraction
 import google.generativeai as genai  # type: ignore
@@ -433,7 +442,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
 Given the following article captions and section titles, identify most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
+activity, etc.) for enzyme variants.
+CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
+- Look for locations showing data for ALL enzyme variants in the evolution lineage
+- Prioritize sources that show the complete evolutionary progression (parent → child variants)
+- Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
+- Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
@@ -441,8 +456,13 @@ performance data locations. Pay careful attention to:
 - Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions
+IMPORTANT FIGURE REFERENCE RULES:
+- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
+- The extraction system will handle retrieving the entire figure including all sub-panels
+- For tables, return the complete reference as it appears
 Respond with a JSON array where each element contains:
-- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
+- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
 - "type": one of "table", "figure"
 - "confidence": your confidence score (0-100)
 - "caption": the exact caption text for this location
@@ -450,14 +470,29 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
+PRIORITIZATION RULES:
+- HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
+- MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
+- LOWEST PRIORITY: Sources showing data for individual variants only
+Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
+IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
 Do not include too much sources, just return 2 or 3 sources.
 Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
 When returning confidence scores, be more accurate and avoid scores that are too close together.
+CRITICAL:
+- Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+- Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+- Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+- Include "document" field to specify which PDF contains this location: "manuscript" or "supplementary"
 Respond ONLY with **minified JSON**. NO markdown fences.
-Example:
-[{"location": "Table S1", "type": "table", "confidence": 95, "caption": "Table S1. Detailed information...", "reason": "Complete performance metrics", "lineage_hint": "first enzyme family", "campaign_clues": "PYS lineage, pyrrolidine synthesis"}]
+Format:
+[{"location": "", "type": "", "document": "", "confidence": 0, "caption": "", "reason": "", "lineage_hint": "", "campaign_clues": ""}]
 """)
 PROMPT_EXTRACT_METRICS = dedent("""
@@ -484,6 +519,13 @@ IMPORTANT:
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
 - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
+CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
+- Yield (%) measures how much product was formed (0-100%)
+- Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
+- TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
+- These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
+- Be extremely careful when extracting from tables/figures with multiple columns or data series
 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)
@@ -511,6 +553,17 @@ STEP 4: Extract values for each matched variant
 - CRITICAL: Read actual scale values from the axis labels and tick marks
 - Verify: taller bars should have higher values, higher dots should have higher values
+CRITICAL DATA ACCURACY REQUIREMENTS:
+- DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
+- Yield is typically shown as percentage (0-100%)
+- Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
+- TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
+- Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
+- Carefully match each bar/dot to its corresponding enzyme label on the X-axis
+- If looking at grouped bars, ensure you're reading the correct bar for each metric
+- Double-check that variant A's yield is not confused with variant B's yield
+- If values are unclear or ambiguous, return null rather than guessing
 Target enzymes to find and extract:
 {enzyme_names}
@@ -734,7 +787,8 @@ class ReactionExtractor:
     _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
-                 campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
+                 campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None,
+                 campaign_info: Optional[Dict[str, Any]] = None):
         self.manuscript = manuscript
         self.si = si
         self.cfg = cfg
@@ -742,6 +796,7 @@ class ReactionExtractor:
         self.debug_dir = debug_dir
         self.campaign_filter = campaign_filter  # Filter for specific campaign
         self.all_campaigns = all_campaigns or []  # List of all campaigns for context
+        self.campaign_info = campaign_info  # Detailed campaign information from campaigns.json
         # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
         self._figure_cache = LRUCache(maxsize=100)  # Figures are large, so smaller cache
@@ -778,10 +833,8 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
     def _collect_captions_and_titles(self) -> str:
-        # Pattern to match Table or Figure with optional leading whitespace and page numbers
-        # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
-        # Also handles cases where there's whitespace or page numbers before the caption
-        cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
+        # Use universal caption pattern that handles all common formats
+        cap_pattern = get_universal_caption_pattern()
         captions: List[str] = []
         # Process each page individually to avoid TOC entries
@@ -861,7 +914,34 @@ class ReactionExtractor:
         # Add campaign context - always provide context to help model understanding
         campaign_context = ""
-        if self.campaign_filter:
+        # If we have detailed campaign info, use it to provide specific guidance
+        if self.campaign_info:
+            location_hints = get_location_hints_for_campaign(self.campaign_info)
+            campaign_context = f"""
+            IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
+            CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
+            - Campaign ID: {self.campaign_info.get('campaign_id', '')}
+            - Name: {self.campaign_info.get('campaign_name', '')}
+            - Description: {self.campaign_info.get('description', '')}
+            - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
+            - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
+            - Notes: {self.campaign_info.get('notes', '')}
+            KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
+            These locations are known to contain relevant data - prioritize them highly.
+            CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
+            - Look for data showing the entire evolutionary progression of enzyme variants
+            - Prioritize locations that show performance data for ALL variants in the lineage
+            - The campaign description and notes above provide context about the evolution strategy used
+            {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
+            CRITICAL: Only return locations that contain data for this specific campaign.
+            """
+        elif self.campaign_filter:
             campaigns_warning = ""
             if self.all_campaigns:
                 campaigns_warning = f"""
@@ -1103,11 +1183,114 @@ class ReactionExtractor:
         return toc_score >= 2
+    def _build_caption_index(self) -> Dict[str, Dict[str, Any]]:
+        """Build an index of all captions for quick lookup."""
+        if hasattr(self, '_caption_index'):
+            return self._caption_index
+        cap_pattern = get_universal_caption_pattern()
+        caption_index = {}
+        for idx, page in enumerate(self.all_pages):
+            source = "manuscript" if idx < len(self.ms_pages) else "supplementary"
+            page_num = idx + 1 if idx < len(self.ms_pages) else idx - len(self.ms_pages) + 1
+            for match in cap_pattern.finditer(page):
+                caption_text = match.group(0).strip()
+                # Extract a normalized key (e.g., "table 5", "figure 3")
+                caption_lower = caption_text.lower()
+                # Store multiple access patterns for the same caption
+                caption_info = {
+                    'full_caption': caption_text,
+                    'page_content': page,
+                    'page_idx': idx,
+                    'source': source,
+                    'page_num': page_num,
+                    'match_start': match.start()
+                }
+                # Create multiple keys for flexible matching
+                # Key 1: Full caption text (first 100 chars)
+                key1 = caption_text[:100].lower().strip()
+                caption_index[key1] = caption_info
+                # Key 2: Simplified reference (e.g., "table 5", "figure s3")
+                ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
+                if ref_match:
+                    key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
+                    caption_index[key2] = caption_info
+                    # Also store with 's' prefix if in SI
+                    if source == "supplementary" and 's' not in key2:
+                        key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
+                        caption_index[key3] = caption_info
+        self._caption_index = caption_index
+        return caption_index
     def _page_with_reference(self, ref_id: str) -> Optional[str]:
-        for page in self.all_pages:
-            if ref_id.lower() in page.lower():
-                return page
-        return None
+        """Find page(s) containing a reference using flexible matching."""
+        caption_index = self._build_caption_index()
+        ref_lower = ref_id.lower().strip()
+        # Try multiple matching strategies
+        matches = []
+        # Strategy 1: Direct key lookup
+        if ref_lower in caption_index:
+            matches.append(caption_index[ref_lower])
+        # Strategy 2: Normalized reference lookup (e.g., "table 5", "figure s3")
+        ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
+        if ref_match:
+            ref_type, ref_num = ref_match.groups()
+            if ref_type == 'fig':
+                ref_type = 'figure'
+            # Try different key formats
+            keys_to_try = [
+                f"{ref_type} {ref_num}",
+                f"{ref_type} s{ref_num}",
+                f"table {ref_num}",  # Sometimes figures are mislabeled
+                f"fig {ref_num}",
+                f"figure {ref_num}"
+            ]
+            for key in keys_to_try:
+                if key in caption_index and caption_index[key] not in matches:
+                    matches.append(caption_index[key])
+        # Strategy 3: Fuzzy matching on caption text
+        if not matches:
+            # Look for any caption containing the reference number
+            for key, info in caption_index.items():
+                if ref_match and ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
+                    if info not in matches:
+                        matches.append(info)
+        # Return results
+        if not matches:
+            LOGGER.warning(f"No matches found for reference '{ref_id}'")
+            # Last resort: simple text search
+            for page in self.all_pages:
+                if ref_lower in page.lower():
+                    return page
+            return None
+        # If single match, return it
+        if len(matches) == 1:
+            return matches[0]['page_content']
+        # Multiple matches: combine them with source annotations
+        LOGGER.info(f"Found {len(matches)} potential matches for '{ref_id}'")
+        combined_pages = []
+        for match in matches:
+            header = f"\n\n=== {match['source'].upper()} PAGE {match['page_num']} ===\n"
+            header += f"Caption: {match['full_caption'][:200]}...\n"
+            combined_pages.append(header + match['page_content'])
+        return "\n".join(combined_pages)
     # ---- Table text helper - now returns full page ----
     def _extract_table_context(self, ref: str) -> str:
@@ -1140,19 +1323,29 @@ class ReactionExtractor:
         return pix
     # ---- NEW: Page image helper for both figures and tables ----
-    def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
+    def _extract_page_png(self, ref: str, extract_figure_only: bool = True, caption_hint: str = "", document_hint: str = "") -> Optional[str]:
         """Export the page containing the reference as PNG.
         If extract_figure_only=True, extracts just the figure above the caption.
         If False, extracts the entire page (useful for tables).
-        Returns a base64-encoded PNG or None."""
-        LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
+        Returns a base64-encoded PNG or None.
-        # Check cache first
-        cache_key = f"{ref}_{extract_figure_only}"
+        Args:
+            ref: The reference string (e.g., "Fig. 3")
+            extract_figure_only: Whether to extract just the figure or the entire page
+            caption_hint: Optional caption text from location data to help find the exact figure
+            document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
+        """
+        LOGGER.info("_extract_page_png called with ref='%s', extract_figure_only=%s, caption_hint='%s', document_hint='%s'",
+                    ref, extract_figure_only, caption_hint[:50] + "..." if caption_hint else "EMPTY", document_hint)
+        # Check cache first - include document hint in key to avoid cross-document contamination
+        cache_key = f"{ref}_{extract_figure_only}_{document_hint}" if document_hint else f"{ref}_{extract_figure_only}"
         cached_result = self._figure_cache.get(cache_key)
         if cached_result is not None:
-            LOGGER.debug("Using cached figure for %s", ref)
+            LOGGER.info("Using cached figure for %s (cache key: %s)", ref, cache_key)
             return cached_result
+        else:
+            LOGGER.info("Cache miss for %s (cache key: %s)", ref, cache_key)
         # For table extraction, use multi-page approach
         if not extract_figure_only:
@@ -1162,13 +1355,28 @@ class ReactionExtractor:
                 return self._extract_multiple_pages_png(pages_with_ref, ref)
             return None
-        # For figure extraction, search both documents for actual figure captions
-        docs = list(filter(None, [self.ms_doc, self.si_doc]))
-        LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
+        # For figure extraction, prioritize based on document hint
+        if document_hint == "manuscript" and self.ms_doc:
+            # Search manuscript first, then SI as fallback
+            docs = list(filter(None, [self.ms_doc, self.si_doc]))
+            LOGGER.info("Prioritizing manuscript document for '%s' (hint: %s)", ref, document_hint)
+            LOGGER.info("Search order: 1) Manuscript, 2) SI (fallback)")
+        elif document_hint == "supplementary" and self.si_doc:
+            # Search SI first, then manuscript as fallback
+            docs = list(filter(None, [self.si_doc, self.ms_doc]))
+            LOGGER.info("Prioritizing supplementary document for '%s' (hint: %s)", ref, document_hint)
+            LOGGER.info("Search order: 1) SI, 2) Manuscript (fallback)")
+        else:
+            # Default behavior - search both in order
+            docs = list(filter(None, [self.ms_doc, self.si_doc]))
+            LOGGER.info("Searching for '%s' in %d documents (no document hint)", ref, len(docs))
+            LOGGER.info("Search order: 1) Manuscript, 2) SI (default order)")
         for doc_idx, doc in enumerate(docs):
-            doc_name = "MS" if doc_idx == 0 else "SI"
-            LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
+            # Determine document name based on actual document, not position
+            doc_name = "MS" if doc == self.ms_doc else "SI"
+            LOGGER.info("Searching document %d/%d: %s (has %d pages)",
+                       doc_idx + 1, len(docs), doc_name, doc.page_count)
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
@@ -1181,9 +1389,90 @@ class ReactionExtractor:
                     LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
                     continue
+                # If we have a caption hint, try to find it using fuzzy matching
+                if caption_hint:
+                    LOGGER.info("=== CAPTION HINT SEARCH ===")
+                    LOGGER.info("Caption hint provided: %s", caption_hint[:100])
+                    LOGGER.info("Searching in %s document, page %d", doc_name, page_number + 1)
+                    LOGGER.info("Page text length: %d chars", len(page_text))
+                    # Check if caption exists in raw form
+                    if caption_hint[:50] in page_text:
+                        LOGGER.info("✓ Caption hint found in raw page text!")
+                    else:
+                        LOGGER.info("✗ Caption hint NOT found in raw page text")
+                    # Normalize texts for better matching
+                    def normalize_for_matching(text):
+                        # Remove extra whitespace, normalize spaces around punctuation
+                        text = ' '.join(text.split())
+                        # Normalize different dash types
+                        text = text.replace('–', '-').replace('—', '-')
+                        return text
+                    normalized_hint = normalize_for_matching(caption_hint[:100])  # Use first 100 chars
+                    normalized_page = normalize_for_matching(page_text)
+                    # Try to find the caption using fuzzy matching
+                    best_match_pos = -1
+                    best_match_score = 0
+                    match_found = False
+                    # Slide through the page text looking for best match
+                    hint_len = len(normalized_hint)
+                    for i in range(len(normalized_page) - hint_len + 1):
+                        snippet = normalized_page[i:i + hint_len]
+                        # Simple character-based similarity
+                        matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+                        score = matches / hint_len
+                        if score > best_match_score and score > 0.8:  # 80% similarity threshold
+                            best_match_score = score
+                            best_match_pos = i
+                            match_found = True
+                    if match_found and best_match_pos >= 0:
+                        LOGGER.info("Found caption match in %s document on page %d with %.1f%% similarity",
+                                   doc_name, page_number + 1, best_match_score * 100)
+                        # Instead of complex position mapping, just search for the beginning of the caption
+                        # Use the first 30 chars which should be unique enough
+                        search_text = caption_hint[:30].strip()
+                        LOGGER.info("Searching for caption text: '%s'", search_text)
+                        caption_instances = page.search_for(search_text)
+                        LOGGER.info("Found %d caption instances", len(caption_instances) if caption_instances else 0)
+                        if caption_instances:
+                            cap_rect = caption_instances[0]
+                            caption_found = True
+                            # Extract figure above this caption
+                            if extract_figure_only:
+                                LOGGER.info("Extracting figure area including caption for %s from %s document", ref, doc_name)
+                                LOGGER.info("Caption found at rect: %s on page %d", cap_rect, page_number + 1)
+                                page_rect = page.rect
+                                # Include the caption in the extraction
+                                # Add some padding below the caption to ensure we get the full text
+                                caption_padding = 30  # pixels below caption
+                                figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
+                                LOGGER.info("Page rect: %s, Figure rect including caption: %s", page_rect, figure_rect)
+                                mat = fitz.Matrix(5.0, 5.0)
+                                pix = page.get_pixmap(matrix=mat, clip=figure_rect)
+                                pix = self._ensure_rgb_pixmap(pix)
+                                img_bytes = pix.tobytes("png")
+                                img_b64 = b64encode(img_bytes).decode('utf-8')
+                                self._figure_cache.put(cache_key, img_b64)
+                                LOGGER.info("Successfully extracted figure using caption hint for %s from %s document, page %d",
+                                           ref, doc_name, page_number + 1)
+                                return img_b64
+                    else:
+                        LOGGER.info("No fuzzy match found for caption hint on page %d (best score: %.1f%%)",
+                                   page_number + 1, best_match_score * 100)
+                # If caption hint didn't work or wasn't provided, fall back to pattern matching
                 # Look for figure caption pattern more flexibly
                 # Normalize the reference to handle variations
-                figure_num = ref.replace('Figure', '').replace('figure', '').strip()
+                figure_num = ref.replace('Figure', '').replace('figure', '').replace('Fig.', '').replace('Fig', '').strip()
                 # Extract main figure number from subfigure (e.g., "1C" -> "1")
                 main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1193,10 +1482,10 @@ class ReactionExtractor:
                     main_figure_num = figure_num
                 # Create a flexible pattern that handles various spacing and formatting
-                # This pattern looks for "Figure" (case insensitive) followed by optional spaces
-                # then the figure number, then any of: period, colon, space+capital letter, or end of line
+                # This pattern looks for "Figure" or "Fig" (case insensitive) followed by optional spaces
+                # then the figure number, then any of: period, colon, pipe, space+capital letter, or end of line
                 # Also match at the beginning of a line to catch captions
-                flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
+                flexible_pattern = rf"(?i)(?:^|\n)\s*(?:figure|fig\.?)\s*{re.escape(main_figure_num)}(?:\.|:|\||\s+\||(?=\s+[A-Z])|\s*$)"
                 LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
                            main_figure_num, flexible_pattern)
@@ -1252,8 +1541,8 @@ class ReactionExtractor:
                     if text_instances:
                         cap_rect = text_instances[0]
                         caption_found = True
-                        LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
-                                  ref, caption_text, remaining_text[:50])
+                        LOGGER.info("Found actual caption for %s in %s document on page %d: '%s' with following text: '%s...'",
+                                  ref, doc_name, page_number + 1, caption_text, remaining_text[:50])
                         break
                 if not caption_found:
@@ -1274,18 +1563,18 @@ class ReactionExtractor:
                     continue
                 if extract_figure_only:
-                    # Extract only the area above the caption (the actual figure)
-                    # This excludes caption text and focuses on visual elements
-                    LOGGER.info("Extracting figure area above caption for %s", ref)
+                    # Extract the figure area including the caption
+                    LOGGER.info("Extracting figure area including caption for %s", ref)
                     # Get the page dimensions
                     page_rect = page.rect
-                    # Extract the area above the caption
+                    # Extract the area including the caption
                     if cap_rect:
-                        # Extract from top of page to top of caption
-                        figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
-                        LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
+                        # Extract from top of page to bottom of caption plus padding
+                        caption_padding = 30  # pixels below caption
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
+                        LOGGER.debug("Extracting figure area with caption: %s (caption ends at y=%f)", figure_rect, cap_rect.y1)
                     else:
                         # If no caption found, use top 80% of page
                         figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
@@ -1419,7 +1708,7 @@ class ReactionExtractor:
         # Sort pages by document and page number
         pages.sort(key=lambda x: (id(x[0]), x[1]))
-        # Extract the range of pages including one page after
+        # Extract the range of pages including one page after for tables
         all_images = []
         for i, (doc, page_num) in enumerate(pages):
             # Add the current page
@@ -1428,7 +1717,16 @@ class ReactionExtractor:
             pix = self._ensure_rgb_pixmap(pix)
             all_images.append(pix)
-            # Only extract the page containing the reference (removed next page logic)
+            # Add the next page as well for tables (in case data continues)
+            next_page_num = page_num + 1
+            if next_page_num < doc.page_count:
+                try:
+                    next_pix = doc.load_page(next_page_num).get_pixmap(matrix=mat)
+                    next_pix = self._ensure_rgb_pixmap(next_pix)
+                    all_images.append(next_pix)
+                    LOGGER.info("Including next page (%d) for table %s", next_page_num + 1, ref)
+                except Exception as e:
+                    LOGGER.warning("Failed to extract next page %d for %s: %s", next_page_num + 1, ref, e)
         if not all_images:
             return None
@@ -1533,21 +1831,9 @@ class ReactionExtractor:
     def _validate_location_exists(self, ref: str) -> bool:
         """Verify that the referenced location actually exists in the document."""
-        # Search for the actual reference in both manuscript and SI documents
-        docs_to_check = [self.ms_doc]
-        if self.si_doc:
-            docs_to_check.append(self.si_doc)
-        for doc in docs_to_check:
-            for page_num in range(len(doc)):
-                page = doc[page_num]
-                text = page.get_text()
-                # Look for table references like "Table 1", "Table S1", etc.
-                if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
-                    return True
-        return False
+        # Use the caption index to check if location exists
+        result = self._page_with_reference(ref)
+        return result is not None
     def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
         """Validate that the context contains meaningful content for extraction."""
@@ -1596,15 +1882,31 @@ class ReactionExtractor:
         LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
         return True
-    def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
-        """Extract performance metrics for multiple enzymes from the identified location in batch."""
-        LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
-        ref_lc = ref.lower()
+    def extract_metrics_batch(self, enzyme_list: List[str], ref: Union[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Extract performance metrics for multiple enzymes from the identified location in batch.
+        Args:
+            enzyme_list: List of enzyme names to extract metrics for
+            ref: Either a string reference (e.g., "Fig. 3") or a location dict with 'location' and optionally 'caption'
+        """
+        # Handle both string and dict inputs
+        if isinstance(ref, dict):
+            location_str = ref['location']
+            caption_hint = ref.get('caption', '')
+            document_hint = ref.get('document', '')
+            LOGGER.info("extract_metrics_batch called with location='%s' (with caption hint, document=%s) for %d enzymes",
+                       location_str, document_hint, len(enzyme_list))
+        else:
+            location_str = ref
+            caption_hint = ''
+            document_hint = ''
+            LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", location_str, len(enzyme_list))
+        ref_lc = location_str.lower()
         image_b64: Optional[str] = None
         # First, validate that the location actually exists in the document
-        if not self._validate_location_exists(ref):
-            LOGGER.warning("Location %s not found in document - skipping", ref)
+        if not self._validate_location_exists(location_str):
+            LOGGER.warning("Location %s not found in document - skipping", location_str)
             return []
         # Add campaign context if available
@@ -1614,30 +1916,30 @@ class ReactionExtractor:
         if self._TAB_RE.search(ref_lc):
             # For tables, try to extract the page as an image first
-            image_b64 = self._extract_page_png(ref, extract_figure_only=False)
+            image_b64 = self._extract_page_png(location_str, extract_figure_only=False, document_hint=document_hint)
             if not image_b64:
-                LOGGER.debug("No page image found for %s - using full page text", ref)
-                snippet = self._extract_table_context(ref)
+                LOGGER.debug("No page image found for %s - using full page text", location_str)
+                snippet = self._extract_table_context(location_str)
         elif self._FIG_RE.search(ref_lc):
             # For figures, extract just the figure image (same logic as compound mapping)
-            LOGGER.debug("Attempting to extract figure image for '%s'", ref)
-            image_b64 = self._extract_page_png(ref, extract_figure_only=True)
+            LOGGER.info("Attempting to extract figure image for '%s'", location_str)
+            image_b64 = self._extract_page_png(location_str, extract_figure_only=True, caption_hint=caption_hint, document_hint=document_hint)
             if not image_b64:
-                LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
-                snippet = self._extract_figure_caption(ref)
+                LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", location_str)
+                snippet = self._extract_figure_caption(location_str)
                 LOGGER.debug("Caption extraction result: %s",
                            f"'{snippet[:100]}...'" if snippet else "empty")
             else:
-                LOGGER.info("Successfully extracted figure image for '%s'", ref)
+                LOGGER.info("Successfully extracted figure image for '%s'", location_str)
                 # If figure is found, ignore text information - use image only
                 snippet = ""
         else:
-            snippet = self._page_with_reference(ref) or ""
+            snippet = self._page_with_reference(location_str) or ""
         # For figures with images, skip text validation and proceed with image extraction
         if image_b64 and self._FIG_RE.search(ref_lc):
-            LOGGER.info("Using figure image for %s - ignoring text context", ref)
-        elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
+            LOGGER.info("Using figure image for %s - ignoring text context", location_str)
+        elif not image_b64 and not self._validate_context(snippet, enzyme_list, location_str):
             return []
         # Create enhanced enzyme descriptions with parent/mutation context
@@ -1934,6 +2236,19 @@ Different campaigns may use different model reactions.
         prompt = PROMPT_COMPOUND_MAPPING
         if campaign_filter:
             prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
+        # Add campaign info as hints
+        if self.campaign_info:
+            substrate_id = self.campaign_info.get('substrate_id', '')
+            product_id = self.campaign_info.get('product_id', '')
+            model_substrate = self.campaign_info.get('model_substrate', '')
+            model_product = self.campaign_info.get('model_product', '')
+            if substrate_id and model_substrate:
+                prompt += f"\n\nHINT: The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})"
+            if product_id and model_product:
+                prompt += f"\nHINT: The model product for this campaign is likely '{model_product}' (ID: {product_id})"
         if compound_ids:
             prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
         prompt += "\n\nTEXT:\n" + extraction_text
@@ -2011,6 +2326,22 @@ IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campa
 {campaigns_warning}
 Different campaigns may use different numbering systems for compounds.
 Do NOT include compound information from other campaigns."""
+        # Add campaign info as hints
+        if self.campaign_info:
+            substrate_id = self.campaign_info.get('substrate_id', '')
+            product_id = self.campaign_info.get('product_id', '')
+            model_substrate = self.campaign_info.get('model_substrate', '')
+            model_product = self.campaign_info.get('model_product', '')
+            hints = []
+            if substrate_id and model_substrate:
+                hints.append(f"The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})")
+            if product_id and model_product:
+                hints.append(f"The model product for this campaign is likely '{model_product}' (ID: {product_id})")
+            if hints:
+                prompt += "\n\nHINTS FROM CAMPAIGN INFO:\n" + "\n".join(hints)
         prompt += """
@@ -2268,41 +2599,17 @@ Do NOT include compound information from other campaigns.
             if not mapping or not mapping.iupac_name:
                 missing_compounds.append(cid)
-        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
+        # Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
                        len(missing_compounds), sorted(missing_compounds))
-            # Get all available figures for compound structure analysis
-            figure_images = {}
-            # Extract main manuscript figures
-            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
-            for ref in figure_refs:
-                img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                if img_b64:
-                    figure_images[ref] = img_b64
-                    LOGGER.info("Retrieved %s for compound mapping", ref)
-            # Get SI figures
-            si_figure_refs = []
-            for page in self.si_pages[:10]:  # Check first 10 SI pages
-                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
-                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
-            # Extract SI figures
-            for ref in set(si_figure_refs):
-                if ref not in figure_images:
-                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                    if img_b64:
-                        figure_images[ref] = img_b64
-                        LOGGER.info("Extracted %s for compound mapping", ref)
             # Full text search including ALL pages (manuscript + SI)
             full_text = "\n\n".join(self.all_pages)  # Send everything
-            final_mappings = self._extract_compound_mappings_with_figures(
-                full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            # Use text-only extraction for Tier 2 (no images)
+            final_mappings = self._extract_compound_mappings_from_text(
+                full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
             )
             # Merge final mappings with better compound ID matching
@@ -2476,6 +2783,7 @@ Do NOT include compound information from other campaigns.
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
             # Pass the IUPAC location hint if we have it
             iupac_hint = locations.get("iupac_location") if locations else None
             compound_mappings = self._extract_compound_mappings_adaptive(
@@ -2506,6 +2814,22 @@ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely differ
 - Different campaigns may use similar enzyme names but different substrates
 - Be extremely careful to only extract data for the {self.campaign_filter} campaign
 - Ignore data from other campaigns even if they seem similar
+"""
+            # Add specific campaign info if available
+            campaign_info_context = ""
+            if self.campaign_info:
+                campaign_info_context = f"""
+KNOWN CAMPAIGN INFORMATION:
+- Campaign: {self.campaign_info.get('campaign_name', '')}
+- Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
+- Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
+- Known Data Locations: {', '.join(self.campaign_info.get('data_locations', []))}
+IMPORTANT: Use this information to guide your extraction. The model reaction should involve:
+- Substrate ID: {self.campaign_info.get('substrate_id', '')}
+- Product ID: {self.campaign_info.get('product_id', '')}
 """
             campaign_context = f"""
@@ -2515,6 +2839,7 @@ You are extracting the model reaction used specifically for these enzyme variant
 These variants belong to campaign: {self.campaign_filter}
 {campaigns_context}
+{campaign_info_context}
 Focus on extracting the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions and substrates.
@@ -2625,34 +2950,6 @@ Different campaigns may use different model reactions and substrates.
                     LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                                list(compound_mappings.keys()))
-                    # First, populate IUPAC lists directly from compound mappings based on compound_type
-                    substrate_iupacs_from_mappings = []
-                    product_iupacs_from_mappings = []
-                    for mapping in compound_mappings.values():
-                        if mapping.iupac_name and mapping.compound_type:
-                            if mapping.compound_type.lower() == "substrate":
-                                substrate_iupacs_from_mappings.append(mapping.iupac_name)
-                                LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
-                            elif mapping.compound_type.lower() == "product":
-                                product_iupacs_from_mappings.append(mapping.iupac_name)
-                                LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
-                    # Initialize or update the IUPAC lists with mapped compounds
-                    if substrate_iupacs_from_mappings:
-                        existing_substrates = data.get("substrate_iupac_list", []) or []
-                        if isinstance(existing_substrates, list):
-                            data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
-                        else:
-                            data["substrate_iupac_list"] = substrate_iupacs_from_mappings
-                    if product_iupacs_from_mappings:
-                        existing_products = data.get("product_iupac_list", []) or []
-                        if isinstance(existing_products, list):
-                            data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
-                        else:
-                            data["product_iupac_list"] = product_iupacs_from_mappings
                     # Try to map substrate/product lists through compound IDs
                     substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                     if isinstance(substrate_list, list):
@@ -2790,7 +3087,7 @@ Different campaigns may use different model reactions and substrates.
                        best_location.get('confidence', 0))
             # Extract metrics from the most confident source only
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
+            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location)
             # Filter to valid metrics
             valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
@@ -3034,7 +3331,7 @@ Different campaigns may use different model reactions and substrates.
                        location['location'], location.get('confidence', 0))
             # Extract metrics from this location
-            metrics_rows = self.extract_metrics_batch(list(all_variants), location['location'])
+            metrics_rows = self.extract_metrics_batch(list(all_variants), location)
             # Filter to valid metrics
             valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
@@ -3263,6 +3560,11 @@ def main() -> None:
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
+        # Rename enzyme_id to enzyme if needed
+        if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
+            enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
+            LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
         # Detect campaign information from the enzyme CSV
         if 'campaign_id' in enzyme_df.columns:
             all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
@@ -3277,10 +3579,27 @@ def main() -> None:
                     campaign_debug_dir.mkdir(parents=True, exist_ok=True)
                     LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+                # Load campaign info from campaigns.json if available
+                campaign_info = None
+                if args.debug_dir:
+                    from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
+                    campaigns_file = Path(args.debug_dir) / "campaigns.json"
+                    if campaigns_file.exists():
+                        campaigns = load_campaigns_from_file(campaigns_file)
+                        campaign_info = find_campaign_by_id(campaigns, campaign_filter)
+                        if campaign_info:
+                            LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign_filter)
                 extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
-                                            campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+                                            campaign_filter=campaign_filter, all_campaigns=all_campaigns,
+                                            campaign_info=campaign_info)
                 df_metrics = extractor.run(enzyme_df)
+                # For single campaign, also merge with lineage data
+                if not df_metrics.empty:
+                    df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
+                    LOGGER.info("Merged metrics with lineage data for single campaign")
             elif len(all_campaigns) > 1:
                 LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
                 all_results = []
@@ -3304,8 +3623,20 @@ def main() -> None:
                         campaign_debug_dir.mkdir(parents=True, exist_ok=True)
                         LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+                    # Load campaign info from campaigns.json if available
+                    campaign_info = None
+                    if args.debug_dir:
+                        from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
+                        campaigns_file = Path(args.debug_dir) / "campaigns.json"
+                        if campaigns_file.exists():
+                            campaigns = load_campaigns_from_file(campaigns_file)
+                            campaign_info = find_campaign_by_id(campaigns, campaign)
+                            if campaign_info:
+                                LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign)
                     extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
-                                                campaign_filter=campaign, all_campaigns=all_campaigns)
+                                                campaign_filter=campaign, all_campaigns=all_campaigns,
+                                                campaign_info=campaign_info)
                     # Run extraction for this campaign
                     campaign_metrics = extractor.run(campaign_df)
@@ -3319,6 +3650,10 @@ def main() -> None:
                         # Merge campaign metrics with lineage data
                         campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+                        # Rename aa_seq to protein_sequence for consistency
+                        if 'aa_seq' in campaign_final.columns:
+                            campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
                         # Save campaign-specific file immediately
                         output_dir = args.output.parent
                         base_name = args.output.stem
@@ -3335,6 +3670,10 @@ def main() -> None:
                         # Still save an empty campaign file with lineage data
                         campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
                         if not campaign_lineage.empty:
+                            # Rename aa_seq to protein_sequence for consistency
+                            if 'aa_seq' in campaign_lineage.columns:
+                                campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
                             output_dir = args.output.parent
                             base_name = args.output.stem
                             campaign_file = output_dir / f"{base_name}_{campaign}.csv"
@@ -3365,6 +3704,11 @@ def main() -> None:
     df_final = df_metrics
     LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
+    # Rename aa_seq to protein_sequence for consistency
+    if df_final is not None and 'aa_seq' in df_final.columns:
+        df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
+        LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)

debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl