PyPI - debase - Versions diffs - 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

debase/_version.py +1 -1
debase/enzyme_lineage_extractor.py +623 -234
debase/lineage_format.py +113 -11
debase/reaction_info_extractor.py +21 -7
debase/substrate_scope_extractor.py +516 -67
debase/wrapper.py +301 -67
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA +1 -1
debase-0.1.17.dist-info/RECORD +17 -0
debase-0.1.11.dist-info/RECORD +0 -17
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL +0 -0
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt +0 -0
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE +0 -0
{debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt +0 -0

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -346,6 +346,103 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
     log.warning("Could not find figure caption for '%s'", figure_ref)
     return None
+def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str]:
+    """Extract scheme as a page region, similar to figures.
+    Every page of every PDF is searched for ``scheme_ref``. A hit only counts as
+    the scheme title when the text in a box around it contains one of a few
+    keywords. The first page with such a hit is rendered from just below the
+    title down to the next long text block (or the page bottom) and returned
+    immediately; later pages and PDFs are not examined.
+    Args:
+        pdf_paths: List of PDF paths to search
+        scheme_ref: Scheme reference to search for (e.g., "Scheme 2" or "Scheme S2")
+    Returns:
+        Base64-encoded PNG string or None if not found
+    """
+    if not pdf_paths:
+        return None
+    for pdf_path in pdf_paths:
+        doc = _open_doc(pdf_path)
+        # try/finally so the document is closed on every exit path, including
+        # the early return once a scheme has been rendered.
+        try:
+            for page_num in range(doc.page_count):
+                page = doc.load_page(page_num)
+                # NOTE(review): page_text is not read again in this function.
+                page_text = page.get_text()
+                # Check if this page contains the scheme
+                found = False
+                scheme_instances = None
+                # Look for scheme reference with various patterns
+                # Ordered from most title-like (trailing punctuation) to the bare
+                # reference; the first variation that yields an accepted hit wins.
+                variations = [
+                    f"{scheme_ref}.",  # "Scheme 2."
+                    f"{scheme_ref}:",  # "Scheme 2:"
+                    f"{scheme_ref} ",  # "Scheme 2 "
+                    scheme_ref,
+                ]
+                for variation in variations:
+                    scheme_instances = page.search_for(variation, quads=False)
+                    if scheme_instances:
+                        # Check if this is likely a scheme title (not a reference in text)
+                        for rect in scheme_instances:
+                            # Get text around this location
+                            # The box extends 50 units left, 400 right, 5 above and
+                            # 20 below the hit so the rest of the caption line is included.
+                            x0, y0, x1, y1 = rect
+                            text_around = page.get_textbox(fitz.Rect(x0-50, y0-5, x1+400, y1+20))
+                            # Check if it looks like a scheme title
+                            # Heuristic: a plain substring test against the lowercased
+                            # surrounding text; a hit without any keyword is skipped.
+                            if any(keyword in text_around.lower() for keyword in
+                                   ['substrate scope', 'reaction', 'synthesis', 'procedure', 'explored']):
+                                found = True
+                                scheme_rect = rect
+                                break
+                        if found:
+                            break
+                if not found:
+                    continue
+                # scheme_rect is always bound here: it is assigned together with found = True.
+                log.info("Found scheme on page %d at y=%.0f", page_num + 1, scheme_rect.y0)
+                # For schemes, we often want to capture more of the page
+                # since they can be large and include multiple reactions
+                page_rect = page.rect
+                # Define the region to extract
+                # For schemes, we want to capture everything below the title
+                # until we hit significant text (which would be the next section)
+                top_margin = max(0, scheme_rect.y1 + 5)  # Start just below the scheme title
+                # Look for the next major text block that might indicate end of scheme
+                # This is a simple heuristic - look for blocks of text below the scheme
+                text_blocks = page.get_text("blocks")
+                bottom_y = page_rect.height  # Default to full page
+                # NOTE(review): the first qualifying block in iteration order ends the
+                # region; this is the nearest block only if blocks arrive top-to-bottom
+                # - confirm against the PyMuPDF block ordering.
+                for block in text_blocks:
+                    block_y = block[1]  # y-coordinate of block
+                    block_text = block[4]  # text content
+                    # If we find a substantial text block below the scheme title
+                    # "Substantial" = more than 100 characters, starting more than
+                    # 50 units below the bottom of the title rectangle.
+                    if block_y > scheme_rect.y1 + 50 and len(block_text) > 100:
+                        # This might be the next section
+                        bottom_y = block_y - 10
+                        break
+                # Create the clip rectangle
+                # Full page width is kept; only the vertical extent is trimmed.
+                clip_rect = fitz.Rect(0, top_margin, page_rect.width, bottom_y)
+                # Extract the region as an image
+                mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
+                pix = page.get_pixmap(clip=clip_rect, matrix=mat)
+                # Convert to PNG
+                img_bytes = pix.tobytes("png")
+                # The "* 2" in the log mirrors the 2x zoom matrix used above.
+                log.info("Extracted scheme region: %.0fx%.0f pixels from page %d",
+                         clip_rect.width * 2, clip_rect.height * 2, page_num + 1)
+                return b64encode(img_bytes).decode()
+        finally:
+            doc.close()
+    # Reached only when no page in any PDF produced an accepted title hit.
+    log.warning("Could not find scheme '%s'", scheme_ref)
+    return None
 def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
     """Extract text around a specific reference (e.g., 'Figure 3')."""
@@ -765,21 +862,28 @@ Return JSON:
 """.strip()
 _COMPOUND_MAPPING_PROMPT = """
-Extract compound identifiers and their chemical names EXACTLY as they appear in the text.
+Extract compound identifiers and their chemical names from the provided text and any scheme images.
+TASK:
+1. First, extract all compound IDs and names that are explicitly written in the text
+2. Then, analyze any provided scheme images to identify compound labeling patterns
+3. Look for relationships between compounds (e.g., when multiple variants share a base structure)
+4. Note any systematic naming conventions used in the schemes
-STRICT RULES:
-1. ONLY extract what is explicitly written in the text
-2. Look for patterns where compound IDs are paired with chemical names
-3. DO NOT infer, generate, or guess any chemical names
-4. If a compound ID appears without a chemical name, return null for iupac_name
-5. If a product was "not detected" or "not formed", return null for iupac_name
+ANALYSIS GUIDELINES:
+- Some papers define a base compound and use letter suffixes for variants
+- Schemes often show relationships that aren't explicitly stated in text
+- Pay attention to how compounds are grouped or connected in schemes
+- Identify any patterns in how compounds are numbered/lettered
 For each compound:
-- identifier: The exact compound ID as written (e.g., "1", "2a", "SM-1")
-- iupac_name: The chemical name if explicitly provided, otherwise null
-- common_names: Any alternative names mentioned
+- identifier: The exact compound ID as written
+- iupac_name: The chemical name if found in text
+- common_names: Any alternative names
 - compound_type: substrate/product/reagent/catalyst/other
-- source_location: The exact text excerpt where this information was found
+- source_location: Where found (text excerpt or "Scheme X")
+- related_compounds: List of related compound IDs if a pattern is detected
+- pattern_notes: Description of any labeling pattern observed
 Return as JSON:
 {
@@ -789,12 +893,12 @@ Return as JSON:
       "iupac_name": "string or null",
       "common_names": ["array of strings"],
       "compound_type": "string",
-      "source_location": "string"
+      "source_location": "string",
+      "related_compounds": ["array of related IDs"],
+      "pattern_notes": "string or null"
     }
   ]
 }
-Note: It is better to return null than to hallucinate or infer chemical structures.
 """.strip()
 _SUBSTRATE_SCOPE_PROMPT = """
@@ -803,13 +907,17 @@ Extract ALL substrate scope data from the primary sources in one complete extrac
 For EACH reaction, extract:
 1. Enzyme variant ID
-2. Substrate identifiers (e.g., "6a", "5")
+2. Substrate identifiers (e.g., "6a", "5") - ONLY if explicitly shown
 3. Product identifiers (e.g., "7a", "7b", "7d", "7e") - ALWAYS include even if no yield
 4. Performance metrics (yield%, ee%, dr, TTN)
 5. Reaction conditions (temperature, pH, buffer, substrate concentrations - NOT dithionite/reducing agents)
 6. Data location (which figure/table this comes from)
-CRITICAL - NO HALLUCINATION OR MODIFICATION:
+CRITICAL - NO HALLUCINATION OR INFERENCE OF IDENTIFIERS:
+- SUBSTRATE IDS: Only extract substrate identifiers that are EXPLICITLY WRITTEN in the source
+- DO NOT INFER substrate IDs from patterns (e.g., if you see product "4a", DO NOT assume substrate is "3a")
+- If substrate ID is not explicitly shown, use null for substrate_ids
+- Product IDs should be extracted as shown (since they are usually labeled in schemes)
 - Extract values EXACTLY as written in the primary source - NO CHANGES WHATSOEVER
 - DO NOT round, estimate, convert, or modify any numbers
 - If the text shows "53%", report 53.0, not 53 or 53.00
@@ -821,19 +929,20 @@ CRITICAL - NO HALLUCINATION OR MODIFICATION:
 - If no value is shown, return null, not 0 or empty string
 - Extract ALL reactions from ALL identified locations
 - Use compound identifiers EXACTLY as shown (not IUPAC names)
-- For every entry, there needs to be identifier for both substrates and products, even if yield is null or activity is 0.
 - Extract reaction conditions EXACTLY as written - NO PARAPHRASING
 - IMPORTANT: Substrate concentration refers to the concentration of the actual chemical substrates being transformed in the reaction, NOT reducing agents (e.g., dithionite, NADH) or other additives
-IMPORTANT: Each substrate should have a corresponding product identifier. Even when there is no yield, return
-the exact identifier as seen in the reaction.
+IMPORTANT:
+- Substrate IDs must be EXPLICITLY visible in the source - DO NOT INFER FROM PATTERNS
+- Product IDs should be extracted as labeled in the scheme/figure
+- If only product ID is shown with yields/ee data, substrate_ids should be null
 Return as JSON:
 {{
   "substrate_scope_data": [
     {{
       "enzyme_id": "enzyme variant name",
-      "substrate_ids": ["list of substrate identifiers"],
+      "substrate_ids": null or ["list of EXPLICITLY shown substrate identifiers"],
       "product_ids": ["list of product identifiers"],
       "yield_percent": null or number,
       "ee_percent": null or number,
@@ -959,6 +1068,10 @@ def _extract_compound_mappings_from_text(
                 source_location=item.get("source_location")
             )
+            # Store pattern information for post-processing
+            mapping._related_compounds = item.get("related_compounds", [])
+            mapping._pattern_notes = item.get("pattern_notes", "")
             # Create lookup entries for all identifiers and common names
             for identifier in mapping.identifiers + mapping.common_names:
                 if identifier:
@@ -970,6 +1083,180 @@ def _extract_compound_mappings_from_text(
         log.error("Failed to extract compound mappings: %s", exc)
         return {}
+def _extract_json(text: str) -> str:
+    """Extract JSON content from raw LLM response text.
+    Strips a leading ```json / ``` fence and a trailing ``` fence, then returns
+    the substring from the first '[' or '{' through its balancing closer.
+    Brackets inside double-quoted strings are ignored. If no opener is found,
+    or the structure never balances, the stripped text is returned unchanged.
+    The result is not validated as JSON; parsing is left to the caller.
+    """
+    # Remove common markdown code block markers
+    text = text.strip()
+    if text.startswith('```json'):
+        text = text[7:]
+    elif text.startswith('```'):
+        text = text[3:]
+    if text.endswith('```'):
+        text = text[:-3]
+    # Find JSON structure
+    text = text.strip()
+    # Look for JSON object or array
+    json_start = -1
+    json_end = -1
+    # Locate the first opening bracket; any prose before it is discarded.
+    for i, char in enumerate(text):
+        if char in '[{' and json_start == -1:
+            json_start = i
+            break
+    if json_start >= 0:
+        # Find the matching closing bracket
+        bracket_stack = []
+        in_string = False
+        escape_next = False
+        for i in range(json_start, len(text)):
+            char = text[i]
+            # The character after a backslash is skipped so an escaped quote
+            # (\") does not toggle the in-string state.
+            if escape_next:
+                escape_next = False
+                continue
+            if char == '\\':
+                escape_next = True
+                continue
+            # NOTE(review): escape_next is always False here (the True case hit
+            # "continue" above), so the second half of this test is redundant.
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+            if in_string:
+                continue
+            if char in '[{':
+                bracket_stack.append(char)
+            elif char in ']}':
+                if bracket_stack:
+                    # NOTE(review): a mismatched closer (e.g. '[' closed by '}')
+                    # still pops the opener and is otherwise ignored.
+                    opening = bracket_stack.pop()
+                    if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                        if not bracket_stack:  # Found complete JSON
+                            json_end = i + 1
+                            break
+        if json_end > json_start:
+            return text[json_start:json_end]
+    # If no JSON found, return the original text
+    # ("original" here means the text after fence removal and stripping.)
+    return text
+def _resolve_missing_compounds_with_gemini(
+    model,
+    known_compounds: Dict[str, str],
+    missing_compounds: List[str],
+    figure_images: Dict[str, str] = None,
+    primary_location_text: str = None,
+    debug_dir: str | Path | None = None,
+) -> Dict[str, str]:
+    """Use Gemini to resolve missing compound names based on patterns.
+    Builds a prompt listing the known compound-ID -> IUPAC-name pairs and the IDs
+    that still lack a name, optionally appends text from the primary location and
+    scheme images, and asks the model to propose names for the missing IDs.
+    Args:
+        model: Model handle; used via ``generate_content`` or ``generate_json_with_retry``.
+        known_compounds: Compound ID -> IUPAC name for compounds already resolved.
+        missing_compounds: Compound IDs that still need an IUPAC name.
+        figure_images: Reference label -> base64-encoded image; only entries whose
+            label contains "scheme" (case-insensitive) are attached.
+        primary_location_text: Optional source text; at most the first 10000
+            characters are put into the prompt.
+        debug_dir: Forwarded to ``generate_json_with_retry`` on the text-only path.
+    Returns:
+        Compound ID -> proposed IUPAC name, limited to IDs in ``missing_compounds``
+        with a non-empty name. Returns {} if the call or the parsing raises.
+    """
+    prompt = """You are an expert chemist analyzing compound naming patterns in a chemistry paper.
+KNOWN COMPOUNDS WITH IUPAC NAMES:
+"""
+    # Add known compounds
+    # Sorted so the prompt text is stable for a given input.
+    for cid, name in sorted(known_compounds.items()):
+        prompt += f"- Compound {cid}: {name}\n"
+    # This segment is an f-string, so the doubled braces in the JSON example
+    # render as single braces in the prompt.
+    prompt += f"""
+MISSING COMPOUNDS (need IUPAC names):
+{', '.join(sorted(missing_compounds))}
+TASK:
+1. Analyze the numbering/lettering pattern used in this paper
+2. Look for relationships between compounds (e.g., 3 → 3a, 3b as enantiomers)
+3. Determine the IUPAC names for the missing compounds
+IMPORTANT PATTERNS TO CONSIDER:
+- If compound "X" has a known structure and "Xa", "Xb" are missing, they might be stereoisomers
+- Common patterns: 'a' = (S)-enantiomer, 'b' = (R)-enantiomer (but verify from context)
+- Some papers use 'a/b' for different stereogenic centers or regioisomers
+- Look at the scheme images AND the text to understand relationships
+For each missing compound, provide the most likely IUPAC name based on:
+- The pattern analysis from text and schemes
+- Standard chemical nomenclature rules
+- The relationship to known compounds
+Return ONLY compounds where you have high confidence in the IUPAC name.
+If unsure, return null for that compound.
+Return as JSON:
+{{
+  "resolved_compounds": {{
+    "compound_id": "IUPAC name or null"
+  }}
+}}
+"""
+    # Add primary location text if available
+    # NOTE(review): the "# Limit to prevent token overflow" text below sits inside
+    # the f-string, so it is sent to the model as part of the prompt rather than
+    # acting as a code comment.
+    if primary_location_text:
+        prompt += f"""
+PRIMARY SUBSTRATE SCOPE TEXT (from scheme/table):
+{primary_location_text[:10000]}  # Limit to prevent token overflow
+"""
+    # Add figure images if available
+    content_parts = [prompt]
+    if figure_images:
+        # NOTE(review): this header is appended whenever figure_images is non-empty,
+        # so the multimodal branch below is taken even if no entry is a scheme or
+        # every image fails to decode.
+        content_parts.append("\n\nANALYZE THE FOLLOWING SCHEME IMAGES TO UNDERSTAND THE COMPOUND RELATIONSHIPS:")
+        import PIL.Image
+        import io
+        import base64
+        for fig_ref, fig_base64 in figure_images.items():
+            if "scheme" in fig_ref.lower():
+                # Best effort: an image that cannot be decoded is logged and skipped.
+                try:
+                    img_bytes = base64.b64decode(fig_base64)
+                    image = PIL.Image.open(io.BytesIO(img_bytes))
+                    content_parts.append(f"\n[{fig_ref}]")
+                    content_parts.append(image)
+                except Exception as e:
+                    log.warning("Failed to add scheme image %s: %s", fig_ref, e)
+    try:
+        # Use multimodal if we have images
+        if len(content_parts) > 1:
+            log.info("Using multimodal API with scheme images for compound resolution")
+            # Direct call: this path does not go through generate_json_with_retry,
+            # so debug_dir and the tag are not passed here.
+            response = model.generate_content(content_parts)
+            raw_text = _extract_text(response).strip()
+        else:
+            # Text-only
+            raw_text = generate_json_with_retry(
+                model,
+                prompt,
+                debug_dir=debug_dir,
+                tag="resolve_compounds",
+                raw_response=True
+            )
+        # Parse response
+        data = json.loads(_extract_json(raw_text))
+        resolved = data.get("resolved_compounds", {})
+        # Filter to only return non-null values
+        # IDs the model returns that were not asked for are dropped as well.
+        result = {}
+        for cid, name in resolved.items():
+            if name and cid in missing_compounds:
+                result[cid] = name
+                # Names longer than 60 characters are truncated in the log only.
+                log.info("Resolved compound %s: %s", cid, name[:60] + "..." if len(name) > 60 else name)
+        return result
+    except Exception as exc:
+        # Any failure (API call, JSON parsing, unexpected shape) degrades to
+        # "nothing resolved" instead of propagating.
+        log.error("Failed to resolve compounds: %s", exc)
+        return {}
 def _extract_compound_mappings_with_figures(
     text: str,
     model,
@@ -1207,6 +1494,7 @@ def extract_compound_mappings(
     pdf_paths: List[Path] = None,
     iupac_sections: List[dict] = None,
     compound_ids: List[str] = None,
+    primary_locations: List[dict] = None,
     debug_dir: str | Path | None = None,
 ) -> Dict[str, CompoundMapping]:
     """Extract compound ID to IUPAC name mappings from identified sections.
@@ -1284,6 +1572,63 @@ def extract_compound_mappings(
             if not mapping or not mapping.iupac_name:
                 still_missing.append(cid)
+        # Step 5.5: Use Gemini to resolve compound relationships and missing names
+        if still_missing and len(mappings) > 0:
+            log.info("Attempting to resolve %d missing compounds using pattern analysis...", len(still_missing))
+            # Prepare data about known compounds and missing ones
+            known_compounds = {}
+            for key, mapping in mappings.items():
+                if mapping.iupac_name:
+                    # Get the primary identifier
+                    primary_id = mapping.identifiers[0] if mapping.identifiers else key
+                    known_compounds[primary_id] = mapping.iupac_name
+            # Extract primary location text if available
+            primary_location_text = None
+            if primary_locations and pdf_paths:
+                # Get text from the first primary location (usually the main scheme)
+                for loc in primary_locations[:1]:  # Just the first one
+                    loc_str = loc.get('location', '')
+                    if loc_str:
+                        primary_text = _extract_text_around_reference(pdf_paths, loc_str, context_chars=10000)
+                        if primary_text:
+                            primary_location_text = primary_text
+                            log.info("Extracted %d chars from primary location %s for pattern analysis",
+                                    len(primary_text), loc_str)
+                            break
+            # Ask Gemini to analyze patterns and resolve missing compounds
+            resolved_mappings = _resolve_missing_compounds_with_gemini(
+                model, known_compounds, still_missing,
+                figure_images=getattr(extract_compound_mappings, '_figure_images_cache', {}),
+                primary_location_text=primary_location_text,
+                debug_dir=debug_dir
+            )
+            # Merge resolved mappings
+            resolved_count = 0
+            for cid, iupac_name in resolved_mappings.items():
+                key = cid.lower().strip()
+                if key in mappings:
+                    if not mappings[key].iupac_name and iupac_name:
+                        mappings[key].iupac_name = iupac_name
+                        resolved_count += 1
+                else:
+                    # Create new mapping
+                    new_mapping = CompoundMapping(
+                        identifiers=[cid],
+                        iupac_name=iupac_name,
+                        common_names=[],
+                        compound_type="product",
+                        source_location="Resolved from pattern analysis"
+                    )
+                    mappings[key] = new_mapping
+                    resolved_count += 1
+            if resolved_count > 0:
+                log.info("Resolved %d compounds using pattern analysis", resolved_count)
         # Step 6: Final fallback - use figures and full manuscript if compounds are still missing
         # COMMENTED OUT: Figure-based IUPAC extraction is unreliable
         # Generating IUPAC names from visual structures leads to errors
@@ -1525,24 +1870,30 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
         try:
             # Parse substrate IDs
             substrates = []
-            substrate_ids = item.get("substrate_ids") or []
-            # Also handle old format
-            if not substrate_ids and item.get("substrates"):
-                substrates_data = item.get("substrates") or []
-                for s in substrates_data:
-                    if isinstance(s, dict):
-                        substrate_ids.append(s.get("identifier") or s.get("name", ""))
-                    else:
-                        substrate_ids.append(str(s))
+            substrate_ids = item.get("substrate_ids")
-            for sid in substrate_ids:
-                # Look up IUPAC name
-                iupac_name = None
-                mapping = compound_mappings.get(str(sid).lower())
-                if mapping:
-                    iupac_name = mapping.iupac_name
+            # Handle null substrate_ids
+            if substrate_ids is None:
+                # Leave substrates empty if substrate_ids is explicitly null
+                pass
+            else:
+                # Also handle old format
+                if not substrate_ids and item.get("substrates"):
+                    substrates_data = item.get("substrates") or []
+                    for s in substrates_data:
+                        if isinstance(s, dict):
+                            substrate_ids.append(s.get("identifier") or s.get("name", ""))
+                        else:
+                            substrate_ids.append(str(s))
-                substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
+                for sid in substrate_ids:
+                    # Look up IUPAC name
+                    iupac_name = None
+                    mapping = compound_mappings.get(str(sid).lower())
+                    if mapping:
+                        iupac_name = mapping.iupac_name
+                    substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
             # Parse product IDs
             products = []
@@ -1669,31 +2020,46 @@ def get_substrate_scope(
     time.sleep(2)  # Rate limiting
     log.info("Extracting all substrate scope data from all identified sources...")
-    # Extract images for all figure locations
+    # Extract images for all figure and scheme locations
     figure_images = {}
     for loc in locations:
         location_str = loc.get('location', '')
-        # Extract if it's marked as figure type OR if location contains "Figure" or "Fig"
-        if pdf_paths and ('figure' in location_str.lower() or 'fig' in location_str.lower() or loc.get('type') == 'figure'):
+        location_type = loc.get('type', 'unknown')
+        # Extract if it's a figure, scheme, or contains those keywords
+        should_extract = False
+        if pdf_paths:
+            if location_type in ['figure', 'scheme']:
+                should_extract = True
+            elif any(keyword in location_str.lower() for keyword in ['figure', 'fig', 'scheme']):
+                should_extract = True
+        if should_extract:
             figure_ref = location_str
             confidence = loc.get('confidence', 0)
-            log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, loc.get('type', 'unknown'))
-            figure_image = extract_figure_image(pdf_paths, figure_ref)
+            log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
+            # Use appropriate extraction function based on type
+            if 'scheme' in location_str.lower() or location_type == 'scheme':
+                figure_image = extract_scheme_image(pdf_paths, figure_ref)
+            else:
+                figure_image = extract_figure_image(pdf_paths, figure_ref)
             if figure_image:
-                log.info("Successfully extracted figure image for %s (%d bytes)",
-                         figure_ref, len(figure_image))
+                log.info("Successfully extracted %s image for %s (%d bytes)",
+                         location_type, figure_ref, len(figure_image))
                 figure_images[figure_ref] = figure_image
                 # Save figure image if debug_dir is enabled
                 if debug_dir:
                     import base64
                     debug_path = Path(debug_dir)
-                    image_path = debug_path / f"figure_image_{figure_ref.replace(' ', '_')}.png"
+                    image_path = debug_path / f"{location_type}_image_{figure_ref.replace(' ', '_')}.png"
                     with open(image_path, 'wb') as f:
                         f.write(base64.b64decode(figure_image))
-                    log.info("Saved figure image to %s", image_path)
+                    log.info("Saved %s image to %s", location_type, image_path)
             else:
-                log.warning("Failed to extract figure image for %s", figure_ref)
+                log.warning("Failed to extract %s image for %s", location_type, figure_ref)
     # Extract all substrate scope data in one call
     raw_entries = extract_all_substrate_scope_data(
@@ -1734,6 +2100,7 @@ def get_substrate_scope(
                                                  pdf_paths=pdf_paths,
                                                  iupac_sections=iupac_sections,
                                                  compound_ids=list(all_compound_ids),
+                                                 primary_locations=locations,
                                                  debug_dir=debug_dir)
     # Step 5: Parse all entries with compound mappings
@@ -1793,11 +2160,65 @@ def validate_scope_entries(entries: List[ScopeEntry]) -> List[str]:
     return warnings
+def _match_enzymes_with_gemini(
+    scope_enzymes: List[str],
+    lineage_enzymes: List[str],
+    model,
+    debug_dir: Optional[Path] = None
+) -> Dict[str, str]:
+    """Use Gemini to match enzyme names between substrate scope and lineage data.
+    Args:
+        scope_enzymes: Enzyme names from the substrate scope entries.
+        lineage_enzymes: Enzyme names from the lineage table.
+        model: Model handle passed to ``generate_json_with_retry``.
+        debug_dir: Optional debug directory forwarded to ``generate_json_with_retry``.
+    Returns:
+        The "enzyme_matches" mapping from the model response (scope name ->
+        lineage name; values may be empty when the model reports no match),
+        or {} if the request raises.
+    """
+    # Both name lists are de-duplicated and sorted before being listed in the prompt.
+    # NOTE(review): these are plain strings joined with "+", not f-strings, so the
+    # doubled braces in the JSON example reach the model as "{{" and "}}".
+    # NOTE(review): the prompt hard-codes the "[L/D]-ApPgb-αEsA-G[number]" naming
+    # pattern - presumably specific to one paper; confirm it suits other inputs.
+    prompt = """You are an expert at matching enzyme variant names that may have Unicode or formatting differences.
+ENZYME NAMES FROM SUBSTRATE SCOPE DATA:
+""" + "\n".join(f"- {e}" for e in sorted(set(scope_enzymes))) + """
+ENZYME NAMES FROM LINEAGE DATA:
+""" + "\n".join(f"- {e}" for e in sorted(set(lineage_enzymes))) + """
+TASK:
+Match each substrate scope enzyme name to its corresponding lineage enzyme name.
+These are the SAME enzymes but may have different formatting:
+- Unicode vs ASCII characters (e.g., "ʟ" vs "L", "ᴅ" vs "D")
+- Different capitalization
+- Minor formatting differences
+IMPORTANT:
+- Only match enzymes that are clearly the same variant
+- Look for matching generation numbers (G0, G1, G2, etc.)
+- Consider the pattern: [L/D]-ApPgb-αEsA-G[number]
+- If no clear match exists, return null for that enzyme
+Return as JSON:
+{{
+  "enzyme_matches": {{
+    "substrate_scope_enzyme_name": "matching_lineage_enzyme_name_or_null"
+  }}
+}}
+"""
+    try:
+        # The response is used as a mapping below (.get); presumably
+        # generate_json_with_retry returns parsed JSON here - verify.
+        response = generate_json_with_retry(
+            model,
+            prompt,
+            debug_dir=debug_dir,
+            tag="enzyme_matching"
+        )
+        matches = response.get("enzyme_matches", {})
+        # Only truthy values count as matches in the log; the returned dict is unfiltered.
+        log.info("Gemini matched %d enzyme names", len([v for v in matches.values() if v]))
+        return matches
+    except Exception as exc:
+        # Failure is logged and reported as "no matches" rather than raised.
+        log.error("Failed to match enzymes with Gemini: %s", exc)
+        return {}
 def merge_with_lineage(
     entries: List[ScopeEntry],
-    lineage_csv: Optional[Path]
+    lineage_csv: Optional[Path],
+    model=None
 ) -> List[ScopeEntry]:
-    """Merge substrate scope entries with enzyme lineage data."""
+    """Merge substrate scope entries with enzyme lineage data using Gemini for matching."""
     if not lineage_csv or not lineage_csv.exists():
         return entries
@@ -1806,32 +2227,60 @@ def merge_with_lineage(
         lineage_df = pd.read_csv(lineage_csv)
         log.info("Loading lineage data from %s (%d enzymes)", lineage_csv, len(lineage_df))
-        # Create lookup map (case-insensitive)
+        # Get unique enzyme names from both sources
+        scope_enzymes = list(set(entry.enzyme_id for entry in entries if entry.enzyme_id))
+        lineage_enzymes = list(lineage_df['enzyme_id'].dropna().unique())
+        log.info("Found %d unique enzymes in substrate scope data", len(scope_enzymes))
+        log.info("Found %d unique enzymes in lineage data", len(lineage_enzymes))
+        # Use Gemini to match enzyme names if model is provided
+        if model and scope_enzymes and lineage_enzymes:
+            log.info("Using Gemini to match enzyme names between datasets...")
+            enzyme_matches = _match_enzymes_with_gemini(
+                scope_enzymes,
+                lineage_enzymes,
+                model,
+                debug_dir=Path("examples/amino_esters_test/substrate_scope_debug_v4") if Path("examples/amino_esters_test/substrate_scope_debug_v4").exists() else None
+            )
+        else:
+            # Fallback to simple case-insensitive matching
+            log.info("Using simple case-insensitive matching (no model provided)")
+            enzyme_matches = {}
+            for scope_enzyme in scope_enzymes:
+                for lineage_enzyme in lineage_enzymes:
+                    if scope_enzyme.lower() == lineage_enzyme.lower():
+                        enzyme_matches[scope_enzyme] = lineage_enzyme
+                        break
+        # Create lookup map with matched names
         lineage_map = {}
         for _, row in lineage_df.iterrows():
             enzyme_id = str(row.get('enzyme_id', ''))
-            lineage_map[enzyme_id.lower()] = {
-                'parent_id': row.get('parent_id'),
-                'mutations': row.get('mutations'),
+            lineage_map[enzyme_id] = {
+                'parent_id': row.get('parent_enzyme_id', ''),  # Note: might be 'parent_enzyme_id' not 'parent_id'
+                'mutations': row.get('mutations', ''),
                 'generation': row.get('generation'),
-                'aa_seq': row.get('aa_seq'),
-                'dna_seq': row.get('dna_seq'),
-                'confidence': row.get('confidence')
+                'aa_seq': row.get('protein_sequence', '') or row.get('aa_seq', ''),  # Try both column names
+                'dna_seq': row.get('dna_seq', ''),
+                'confidence': row.get('seq_confidence', '') or row.get('confidence', '')
             }
-        # Merge
+        # Merge using matched names
         merged_count = 0
         for entry in entries:
-            key = entry.enzyme_id.lower()
-            if key in lineage_map:
-                data = lineage_map[key]
-                entry.parent_id = data['parent_id']
-                entry.mutations = data['mutations']
-                entry.generation = data['generation']
-                entry.aa_seq = data['aa_seq']
-                entry.dna_seq = data['dna_seq']
-                entry.confidence = data['confidence']
-                merged_count += 1
+            if entry.enzyme_id in enzyme_matches:
+                matched_name = enzyme_matches[entry.enzyme_id]
+                if matched_name and matched_name in lineage_map:
+                    data = lineage_map[matched_name]
+                    entry.parent_id = data['parent_id']
+                    entry.mutations = data['mutations']
+                    entry.generation = data['generation']
+                    entry.aa_seq = data['aa_seq']
+                    entry.dna_seq = data['dna_seq']
+                    entry.confidence = data['confidence']
+                    merged_count += 1
+                    log.debug("Merged %s -> %s", entry.enzyme_id, matched_name)
         log.info("Merged lineage data for %d/%d entries", merged_count, len(entries))
@@ -1957,7 +2406,7 @@ def run_pipeline(
     # 4. Merge with lineage if available ---------------------------------------
     if lineage_csv:
-        entries = merge_with_lineage(entries, Path(lineage_csv))
+        entries = merge_with_lineage(entries, Path(lineage_csv), model)
     # 5. Validate entries ------------------------------------------------------
     warnings = validate_scope_entries(entries)

debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl