PyPI - debase - Versions diffs - 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

debase 0.6.1 (py3-none-any.whl) → 0.7.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

debase/_version.py +1 -1
debase/caption_pattern.py +7 -2
debase/cleanup_sequence.py +34 -6
debase/enzyme_lineage_extractor.py +673 -221
debase/lineage_format.py +55 -6
debase/reaction_info_extractor.py +282 -97
debase/substrate_scope_extractor.py +218 -65
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
debase-0.7.0.dist-info/RECORD +18 -0
debase-0.6.1.dist-info/RECORD +0 -18
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0

debase/reaction_info_extractor.py CHANGED Viewed

@@ -29,6 +29,7 @@ import json
 import logging
 import os
 import re
+import subprocess
 import sys
 import time
 from base64 import b64encode, b64decode
@@ -90,6 +91,40 @@ handler.setFormatter(logging.Formatter("%(levelname)s [%(name)s] %(message)s"))
 LOGGER.addHandler(handler)
 LOGGER.setLevel(logging.INFO)
+# === OPSIN VALIDATION === -------------------------------------------------
+def is_valid_iupac_name_with_opsin(name: str) -> bool:
+    """Check if a name is a valid IUPAC name using the local OPSIN command.
+
+    The stripped name is piped on stdin to ``opsin -o smi``; the name is
+    accepted only when that command exits with status 0 and prints a
+    non-empty line that contains at least one SMILES-like character.
+
+    Args:
+        name: Candidate chemical name to validate.
+
+    Returns:
+        True when OPSIN produced SMILES-looking output for the name.
+        False when the name is empty or shorter than 3 characters after
+        stripping, looks like a compound ID (e.g. "1a", "S1"), OPSIN returns
+        a non-zero status or empty output, or any exception is raised while
+        running the command (the exception is logged at DEBUG level and
+        swallowed, so this function never raises).
+    """
+    # Reject empty / very short strings before paying for a subprocess call.
+    if not name or len(name.strip()) < 3:
+        return False
+    # Skip if it looks like a compound ID (e.g., "1a", "S1", etc.)
+    # Pattern: digits plus an optional single lowercase letter, or "S" + digits.
+    if re.match(r'^[0-9]+[a-z]?$|^S\d+$', name.strip()):
+        return False
+    try:
+        # Use local OPSIN command to check if name can be converted to SMILES
+        # The name goes in via stdin (text mode); the call is abandoned after 30 s.
+        process = subprocess.run(
+            ['opsin', '-o', 'smi'],
+            input=name.strip(),
+            text=True,
+            capture_output=True,
+            timeout=30
+        )
+        # If OPSIN successfully converts to SMILES, the name is valid IUPAC
+        if process.returncode == 0 and process.stdout.strip():
+            output = process.stdout.strip()
+            # Check if output looks like a valid SMILES (contains common SMILES characters)
+            # NOTE: this is a loose heuristic - a single matching character is enough.
+            if any(char in output for char in 'CNOS()=[]#+-'):
+                return True
+        return False
+    except Exception as e:
+        # Deliberately best-effort: a missing `opsin` executable or a timeout
+        # surfaces here as an exception and is treated as "not valid".
+        LOGGER.debug(f"OPSIN check failed for '{name}': {e}")
+        return False
 # --- Debug dump helper ----------------------------------------------------
 def _dump(text: str | bytes, path: Path | str) -> None:
     """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
@@ -442,7 +477,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
 Given the following article captions and section titles, identify most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
+activity, etc.) for enzyme variants.
+CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
+- Look for locations showing data for ALL enzyme variants in the evolution lineage
+- Prioritize sources that show the complete evolutionary progression (parent → child variants)
+- Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
+- Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
@@ -450,8 +491,13 @@ performance data locations. Pay careful attention to:
 - Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions
+IMPORTANT FIGURE REFERENCE RULES:
+- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
+- The extraction system will handle retrieving the entire figure including all sub-panels
+- For tables, return the complete reference as it appears
 Respond with a JSON array where each element contains:
-- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
+- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
 - "type": one of "table", "figure"
 - "confidence": your confidence score (0-100)
 - "caption": the exact caption text for this location
@@ -459,7 +505,12 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
+PRIORITIZATION RULES:
+- HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
+- MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
+- LOWEST PRIORITY: Sources showing data for individual variants only
+Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
 IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
@@ -503,6 +554,13 @@ IMPORTANT:
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
 - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
+CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
+- Yield (%) measures how much product was formed (0-100%)
+- Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
+- TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
+- These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
+- Be extremely careful when extracting from tables/figures with multiple columns or data series
 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)
@@ -530,6 +588,17 @@ STEP 4: Extract values for each matched variant
 - CRITICAL: Read actual scale values from the axis labels and tick marks
 - Verify: taller bars should have higher values, higher dots should have higher values
+CRITICAL DATA ACCURACY REQUIREMENTS:
+- DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
+- Yield is typically shown as percentage (0-100%)
+- Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
+- TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
+- Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
+- Carefully match each bar/dot to its corresponding enzyme label on the X-axis
+- If looking at grouped bars, ensure you're reading the correct bar for each metric
+- Double-check that variant A's yield is not confused with variant B's yield
+- If values are unclear or ambiguous, return null rather than guessing
 Target enzymes to find and extract:
 {enzyme_names}
@@ -572,23 +641,29 @@ Given the following text sections, identify where the MODEL REACTION information
 The model reaction is the STANDARD reaction used to evaluate all enzyme variants
 (not the substrate scope). Look for:
-- Sections titled "Model Reaction", "Standard Reaction", "General Procedure"
-- Text describing the reaction conditions used for enzyme evolution/screening
-- Sections describing which substrates were used as the benchmark
-- Compound numbers (e.g., "6a", "7a") used in the model reaction
+- SPECIFIC compound numbers (e.g., "1a", "2a", "3a") used in the model reaction
+- Reaction SCHEMES or FIGURES showing the model reaction with numbered compounds
+- Tables showing reaction conditions with specific compound IDs
+- Sections titled "Model Reaction", "Standard Reaction", "General Procedure" WITH compound numbers
+CRITICAL REQUIREMENTS:
+1. The location MUST reference SPECIFIC numbered compounds (not generic descriptions)
+2. DO NOT use generic locations like "main text" or "introduction"
+3. MUST be a Figure, Scheme, Table, or specific SI section
+4. Look for actual compound IDs like "1a + 2a → 3a" or "substrate 1a"
 Also identify where the IUPAC names for these specific compounds are listed.
 Respond with a JSON object containing:
 {
   "model_reaction_location": {
-    "location": "section name or description",
+    "location": "SPECIFIC Figure/Scheme/Table number (e.g., 'Figure 2a', 'Scheme 1', 'Table S1')",
     "confidence": 0-100,
-    "reason": "why this contains the model reaction",
-    "compound_ids": ["list", "of", "compound", "IDs", "if", "found"]
+    "reason": "why this contains the model reaction WITH specific compound IDs",
+    "compound_ids": ["list", "of", "SPECIFIC", "compound", "IDs", "found", "e.g.", "1a", "2a", "3a"]
   },
   "conditions_location": {
-    "location": "where reaction conditions are described",
+    "location": "SPECIFIC location where reaction conditions are described",
     "confidence": 0-100
   },
   "iupac_location": {
@@ -598,6 +673,11 @@ Respond with a JSON object containing:
   }
 }
+IMPORTANT:
+- If no SPECIFIC compound IDs are found, set compound_ids to []
+- The model_reaction_location MUST be a Figure, Scheme, Table, or SI section, NOT "main text"
+- Look for numbered compounds like "1a", "2a", not generic terms like "enol acetates"
 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)
@@ -608,11 +688,20 @@ This is the reaction used for directed evolution screening, NOT the substrate sc
 Look for terms like "model reaction", "standard substrate", "benchmark reaction",
 or the specific reaction mentioned in enzyme screening/evolution sections.
+CRITICAL STEPS FOR COMPOUND IDENTIFICATION:
+1. ALWAYS look for specific compound IDs/numbers in the model reaction (e.g., "1a", "2a", "3a", "6a", "7a")
+2. If the text mentions generic terms like "enol acetates" or "silyl enol ethers", search for the SPECIFIC numbered compounds used
+3. Look in reaction schemes, figures, and experimental sections for numbered compounds
+4. Common patterns:
+   - "compound 1a" or "substrate 1a"
+   - Numbers in bold or italics (1a, 2a, etc.)
+   - References like "using 1a as substrate"
 CRITICAL STEPS FOR IUPAC NAMES:
-1. First identify the compound IDs used in the model reaction (e.g., "6a", "7a")
-2. Then search the provided context for these compound IDs to find their IUPAC names
-3. Look for sections with "Compound 6a", "Product 7a", or similar patterns
-4. The IUPAC names are usually given after the compound ID in parentheses or after a colon
+1. After finding compound IDs, search the context for these IDs to find their IUPAC names
+2. Look for sections with "Compound 1a:", "Product 3a:", or similar patterns
+3. The IUPAC names are usually given after the compound ID in parentheses or after a colon
+4. If no IUPAC name is found for a compound ID, still include the ID in substrate_list/product_list
 CRITICAL FOR SUBSTRATE CONCENTRATION:
 - Look carefully in FIGURES and figure captions for substrate concentration information
@@ -623,10 +712,10 @@ CRITICAL FOR SUBSTRATE CONCENTRATION:
 - The substrate is the molecule being chemically transformed by the enzyme
 Return a JSON object with:
-  * "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["5", "6a"])
-  * "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents
-  * "product_list" - Array of product identifiers as used in the paper (e.g., ["7a"])
-  * "product_iupac_list" - Array of IUPAC names for ALL products formed
+  * "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["1a", "2a"]) - NEVER generic descriptions
+  * "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents (null if not found)
+  * "product_list" - Array of product identifiers as used in the paper (e.g., ["3a"]) - NEVER generic descriptions
+  * "product_iupac_list" - Array of IUPAC names for ALL products formed (null if not found)
   * "reaction_substrate_concentration" - Concentration of actual substrate(s) being transformed, NOT reducing agents like dithionite
   * "cofactor" - Any cofactors used (e.g., "NADH", "NADPH", "FAD", "heme", etc.) or null if none
   * "reaction_temperature" - reaction temperature (e.g., "25°C", "room temperature")
@@ -635,7 +724,8 @@ Return a JSON object with:
   * "reaction_other_conditions" - other important conditions (enzyme loading, reducing agents like dithionite, time, anaerobic, etc.)
 IMPORTANT:
-- Extract the reaction used for ENZYME EVOLUTION/SCREENING (not substrate scope)
+- ALWAYS use specific compound IDs (like "1a", "2a") in substrate_list and product_list, NEVER generic descriptions
+- If you can't find specific compound IDs, look harder in figures, schemes, and experimental sections
 - Substrate concentration = concentration of chemicals being transformed, NOT reducing agents (dithionite, NADH, etc.)
 - Maintain correspondence: substrate_list[i] should map to substrate_iupac_list[i], same for products
 - If a compound ID has no IUPAC name found, still include it in the list with null in the IUPAC list
@@ -749,7 +839,7 @@ Return as JSON:
 ###############################################################################
 class ReactionExtractor:
-    _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
+    _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\.?\s+s?\d+[a-z]?", re.I)
     _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
@@ -887,15 +977,22 @@ class ReactionExtractor:
             campaign_context = f"""
             IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
-            Campaign Details:
+            CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
+            - Campaign ID: {self.campaign_info.get('campaign_id', '')}
             - Name: {self.campaign_info.get('campaign_name', '')}
             - Description: {self.campaign_info.get('description', '')}
             - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
             - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
+            - Notes: {self.campaign_info.get('notes', '')}
             KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
             These locations are known to contain relevant data - prioritize them highly.
+            CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
+            - Look for data showing the entire evolutionary progression of enzyme variants
+            - Prioritize locations that show performance data for ALL variants in the lineage
+            - The campaign description and notes above provide context about the evolution strategy used
             {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
             CRITICAL: Only return locations that contain data for this specific campaign.
@@ -1367,6 +1464,10 @@ class ReactionExtractor:
                         text = ' '.join(text.split())
                         # Normalize different dash types
                         text = text.replace('–', '-').replace('—', '-')
+                        # Normalize pipe character and other special chars
+                        text = text.replace('|', ' ').replace('│', ' ')
+                        # Remove multiple spaces
+                        text = ' '.join(text.split())
                         return text
                     normalized_hint = normalize_for_matching(caption_hint[:100])  # Use first 100 chars
@@ -1863,15 +1964,17 @@ class ReactionExtractor:
         ref_lc = location_str.lower()
         image_b64: Optional[str] = None
-        # First, validate that the location actually exists in the document
-        if not self._validate_location_exists(location_str):
+        # Skip validation entirely when we have a caption hint - trust the vision model
+        if caption_hint:
+            LOGGER.info("Skipping validation - using caption hint for %s", location_str)
+        elif not self._validate_location_exists(location_str):
             LOGGER.warning("Location %s not found in document - skipping", location_str)
             return []
         # Add campaign context if available
         campaign_context = ""
         if self.campaign_filter:
-            campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\n"
+            campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\nEXCLUDE reference variants from other publications - only include variants created/tested in THIS study.\n"
         if self._TAB_RE.search(ref_lc):
             # For tables, try to extract the page as an image first
@@ -1935,6 +2038,24 @@ class ReactionExtractor:
             prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
+            # Save the figure image to debug directory
+            if self.debug_dir and isinstance(ref, dict):
+                location_str = ref.get('location', str(ref))
+            else:
+                location_str = str(ref)
+            if self.debug_dir:
+                timestamp = int(time.time())
+                img_file = self.debug_dir / f"metrics_extraction_{location_str.replace(' ', '_').replace('.', '')}_{timestamp}.png"
+                try:
+                    import base64
+                    img_bytes = base64.b64decode(image_b64)
+                    with open(img_file, 'wb') as f:
+                        f.write(img_bytes)
+                    LOGGER.info("Saved metrics extraction figure to: %s", img_file)
+                except Exception as e:
+                    LOGGER.warning("Failed to save metrics extraction figure: %s", e)
         else:
             # Add enzyme names to prompt for batch extraction with explicit format requirement
             format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
@@ -2071,6 +2192,10 @@ These variants belong to campaign: {self.campaign_filter}
 {campaigns_context}
 Focus on finding the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions.
+CRITICAL: These variants should be from THIS study only!
+- EXCLUDE any reference variants cited from other publications
+- Only include variants that were created/engineered in this manuscript
 """
         prompt = enzyme_context + PROMPT_FIND_MODEL_REACTION_LOCATION + "\n\n=== CAPTIONS AND SECTIONS ===\n" + all_text + "\n\n=== MANUSCRIPT TEXT PREVIEW ===\n" + ms_preview + "\n\n=== SI TEXT PREVIEW ===\n" + si_preview
@@ -2558,41 +2683,17 @@ Do NOT include compound information from other campaigns.
             if not mapping or not mapping.iupac_name:
                 missing_compounds.append(cid)
-        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
+        # Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
                        len(missing_compounds), sorted(missing_compounds))
-            # Get all available figures for compound structure analysis
-            figure_images = {}
-            # Extract main manuscript figures
-            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
-            for ref in figure_refs:
-                img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                if img_b64:
-                    figure_images[ref] = img_b64
-                    LOGGER.info("Retrieved %s for compound mapping", ref)
-            # Get SI figures
-            si_figure_refs = []
-            for page in self.si_pages[:10]:  # Check first 10 SI pages
-                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
-                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
-            # Extract SI figures
-            for ref in set(si_figure_refs):
-                if ref not in figure_images:
-                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
-                    if img_b64:
-                        figure_images[ref] = img_b64
-                        LOGGER.info("Extracted %s for compound mapping", ref)
             # Full text search including ALL pages (manuscript + SI)
             full_text = "\n\n".join(self.all_pages)  # Send everything
-            final_mappings = self._extract_compound_mappings_with_figures(
-                full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            # Use text-only extraction for Tier 2 (no images)
+            final_mappings = self._extract_compound_mappings_from_text(
+                full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
             )
             # Merge final mappings with better compound ID matching
@@ -2826,6 +2927,12 @@ These variants belong to campaign: {self.campaign_filter}
 Focus on extracting the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions and substrates.
+CRITICAL: EXCLUDE reference variants from other publications!
+- Only extract data for variants that were actually tested/created in THIS study
+- Do NOT include data for reference enzymes cited from other papers
+- Look for phrases like "from reference", "previously reported", "from [Author] et al." to identify reference variants
+- Focus ONLY on the variants that were engineered/tested in this manuscript
 """
         # Include both manuscript and SI text for better coverage
@@ -2933,34 +3040,6 @@ Different campaigns may use different model reactions and substrates.
                     LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                                list(compound_mappings.keys()))
-                    # First, populate IUPAC lists directly from compound mappings based on compound_type
-                    substrate_iupacs_from_mappings = []
-                    product_iupacs_from_mappings = []
-                    for mapping in compound_mappings.values():
-                        if mapping.iupac_name and mapping.compound_type:
-                            if mapping.compound_type.lower() == "substrate":
-                                substrate_iupacs_from_mappings.append(mapping.iupac_name)
-                                LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
-                            elif mapping.compound_type.lower() == "product":
-                                product_iupacs_from_mappings.append(mapping.iupac_name)
-                                LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
-                    # Initialize or update the IUPAC lists with mapped compounds
-                    if substrate_iupacs_from_mappings:
-                        existing_substrates = data.get("substrate_iupac_list", []) or []
-                        if isinstance(existing_substrates, list):
-                            data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
-                        else:
-                            data["substrate_iupac_list"] = substrate_iupacs_from_mappings
-                    if product_iupacs_from_mappings:
-                        existing_products = data.get("product_iupac_list", []) or []
-                        if isinstance(existing_products, list):
-                            data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
-                        else:
-                            data["product_iupac_list"] = product_iupacs_from_mappings
                     # Try to map substrate/product lists through compound IDs
                     substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                     if isinstance(substrate_list, list):
@@ -3053,6 +3132,100 @@ Different campaigns may use different model reactions and substrates.
         ]
         for key in expected_keys:
             data.setdefault(key, None)
+        # === OPSIN VALIDATION AND COMPOUND MAPPING FALLBACK ===
+        # Check if the IUPAC names are actually valid using OPSIN
+        needs_compound_mapping = False
+        # Check substrate IUPAC names
+        substrate_has_invalid = False
+        if data.get("substrate_list") and isinstance(data["substrate_list"], list):
+            # Check if we have substrate IDs but missing or invalid IUPAC names
+            if not data.get("substrate_iupac_list"):
+                LOGGER.warning("Substrate list exists but no IUPAC names provided")
+                substrate_has_invalid = True
+            else:
+                substrate_names = data["substrate_iupac_list"].split("; ") if isinstance(data["substrate_iupac_list"], str) else []
+                # Check each substrate ID has a valid IUPAC name
+                for i, substrate_id in enumerate(data["substrate_list"]):
+                    if i >= len(substrate_names) or not substrate_names[i]:
+                        LOGGER.warning(f"No IUPAC name for substrate '{substrate_id}'")
+                        substrate_has_invalid = True
+                    elif not is_valid_iupac_name_with_opsin(substrate_names[i]):
+                        LOGGER.warning(f"Invalid IUPAC name detected for substrate '{substrate_id}': '{substrate_names[i]}'")
+                        substrate_has_invalid = True
+            if substrate_has_invalid:
+                needs_compound_mapping = True
+                LOGGER.info("Found missing or invalid substrate IUPAC names, will attempt compound mapping")
+        # Check product IUPAC names
+        product_has_invalid = False
+        if data.get("product_list") and isinstance(data["product_list"], list):
+            # Check if we have product IDs but missing or invalid IUPAC names
+            if not data.get("product_iupac_list"):
+                LOGGER.warning("Product list exists but no IUPAC names provided")
+                product_has_invalid = True
+            else:
+                product_names = data["product_iupac_list"].split("; ") if isinstance(data["product_iupac_list"], str) else []
+                # Check each product ID has a valid IUPAC name
+                for i, product_id in enumerate(data["product_list"]):
+                    if i >= len(product_names) or not product_names[i]:
+                        LOGGER.warning(f"No IUPAC name for product '{product_id}'")
+                        product_has_invalid = True
+                    elif not is_valid_iupac_name_with_opsin(product_names[i]):
+                        LOGGER.warning(f"Invalid IUPAC name detected for product '{product_id}': '{product_names[i]}'")
+                        product_has_invalid = True
+            if product_has_invalid:
+                needs_compound_mapping = True
+                LOGGER.info("Found missing or invalid product IUPAC names, will attempt compound mapping")
+        # If we need compound mapping and have substrate/product lists, attempt it
+        if needs_compound_mapping and (data.get("substrate_list") or data.get("product_list")):
+            LOGGER.info("Attempting compound mapping due to invalid IUPAC names")
+            # Collect all compound IDs that need mapping
+            compound_ids_to_map = []
+            if data.get("substrate_list") and isinstance(data["substrate_list"], list):
+                compound_ids_to_map.extend(data["substrate_list"])
+            if data.get("product_list") and isinstance(data["product_list"], list):
+                compound_ids_to_map.extend(data["product_list"])
+            if compound_ids_to_map:
+                LOGGER.info(f"Attempting to map compound IDs: {compound_ids_to_map}")
+                # Use the adaptive compound mapping
+                compound_mappings = self._extract_compound_mappings_adaptive(
+                    compound_ids_to_map,
+                    campaign_filter=self.campaign_filter
+                )
+                # Re-map substrate IUPAC names
+                if data.get("substrate_list") and isinstance(data["substrate_list"], list):
+                    mapped_substrates = []
+                    for substrate_id in data["substrate_list"]:
+                        mapping = compound_mappings.get(substrate_id.lower().strip())
+                        if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
+                            mapped_substrates.append(mapping.iupac_name)
+                            LOGGER.info(f"Successfully mapped substrate '{substrate_id}' to IUPAC: {mapping.iupac_name}")
+                    if mapped_substrates:
+                        data["substrate_iupac_list"] = "; ".join(mapped_substrates)
+                        LOGGER.info(f"Updated substrate IUPAC list with {len(mapped_substrates)} valid names")
+                # Re-map product IUPAC names
+                if data.get("product_list") and isinstance(data["product_list"], list):
+                    mapped_products = []
+                    for product_id in data["product_list"]:
+                        mapping = compound_mappings.get(product_id.lower().strip())
+                        if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
+                            mapped_products.append(mapping.iupac_name)
+                            LOGGER.info(f"Successfully mapped product '{product_id}' to IUPAC: {mapping.iupac_name}")
+                    if mapped_products:
+                        data["product_iupac_list"] = "; ".join(mapped_products)
+                        LOGGER.info(f"Updated product IUPAC list with {len(mapped_products)} valid names")
         return data
@@ -3131,21 +3304,10 @@ Different campaigns may use different model reactions and substrates.
             # Extract model reaction for this location - use unified approach
             LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
-            # Try lineage-specific extraction first
-            location_model_reaction = self.find_lineage_model_reaction(
-                best_location['location'],
-                location_context,
-                model_reaction_locations
-            )
-            # Check if lineage extraction was successful
-            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-                LOGGER.info("Using lineage-specific model reaction data")
-                model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
-            else:
-                LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
-                # Use the comprehensive multimodal approach as fallback
-                model_info = self.gather_model_reaction_info(location_enzymes)
+            # Skip lineage-specific extraction and use comprehensive multimodal extraction directly
+            # The lineage-specific extraction often returns generic substrate classes instead of specific compounds
+            LOGGER.info("Using comprehensive multimodal extraction for model reaction")
+            model_info = self.gather_model_reaction_info(location_enzymes)
             LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
@@ -3571,6 +3733,11 @@ def main() -> None:
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
+        # Rename enzyme_id to enzyme if needed
+        if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
+            enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
+            LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
         # Detect campaign information from the enzyme CSV
         if 'campaign_id' in enzyme_df.columns:
             all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
@@ -3601,6 +3768,11 @@ def main() -> None:
                                             campaign_info=campaign_info)
                 df_metrics = extractor.run(enzyme_df)
+                # For single campaign, also merge with lineage data
+                if not df_metrics.empty:
+                    df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
+                    LOGGER.info("Merged metrics with lineage data for single campaign")
             elif len(all_campaigns) > 1:
                 LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
                 all_results = []
@@ -3651,6 +3823,10 @@ def main() -> None:
                         # Merge campaign metrics with lineage data
                         campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+                        # Rename aa_seq to protein_sequence for consistency
+                        if 'aa_seq' in campaign_final.columns:
+                            campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
                         # Save campaign-specific file immediately
                         output_dir = args.output.parent
                         base_name = args.output.stem
@@ -3667,6 +3843,10 @@ def main() -> None:
                         # Still save an empty campaign file with lineage data
                         campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
                         if not campaign_lineage.empty:
+                            # Rename aa_seq to protein_sequence for consistency
+                            if 'aa_seq' in campaign_lineage.columns:
+                                campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
                             output_dir = args.output.parent
                             base_name = args.output.stem
                             campaign_file = output_dir / f"{base_name}_{campaign}.csv"
@@ -3697,6 +3877,11 @@ def main() -> None:
     df_final = df_metrics
     LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
+    # Rename aa_seq to protein_sequence for consistency
+    if df_final is not None and 'aa_seq' in df_final.columns:
+        df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
+        LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)

debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl