debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,13 @@ import fitz
 import re
 import json
 import time
+
+# Import universal caption pattern
+try:
+    from .caption_pattern import get_universal_caption_pattern
+except ImportError:
+    # Fallback if running as standalone script
+    from caption_pattern import get_universal_caption_pattern
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
@@ -113,17 +120,8 @@ _DOI_REGEX = re.compile(r"10\.[0-9]{4,9}/[-._;()/:A-Z0-9]+", re.I)
 # PDB ID regex - matches 4-character PDB codes
 _PDB_REGEX = re.compile(r"\b[1-9][A-Z0-9]{3}\b")
 
-# Improved caption prefix regex - captures most journal variants
-_CAPTION_PREFIX_RE = re.compile(
-    r"""
-    ^\s*
-    (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
-       Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table))  # label part
-    \s*(?:S?\d+[A-Za-z]?|[IVX]+)                               # figure number
-    [.:]?\s*                                                   # trailing punctuation/space
-    """,
-    re.I | re.X,
-)
+# Use universal caption pattern
+_CAPTION_PREFIX_RE = get_universal_caption_pattern()
 
 
 def _open_doc(pdf_path: str | Path | bytes):
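
The caption_pattern module itself is not included in this diff, so its exact contents are unknown. As a hedged sketch, a universal pattern consolidating the journal-specific regex removed above might look like the following (the function name comes from the import; the pattern body is illustrative, not the shipped code):

# caption_pattern.py - hypothetical sketch, not the module shipped in the wheel
import re

def get_universal_caption_pattern() -> re.Pattern:
    """One compiled, case-insensitive regex for figure/table/scheme caption prefixes."""
    return re.compile(
        r"""
        ^\s*
        (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
           Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table))   # label
        \s*(?:S?\d+[A-Za-z]?|[IVX]+)                                # number
        [.:|]?\s*                                                   # trailing punctuation
        """,
        re.I | re.X,
    )

Centralizing the pattern keeps location identification and the new caption fuzzy matching (below) agreeing on what counts as a caption prefix.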
@@ -467,7 +465,7 @@ def get_model():
        "temperature": 0.0,  # Deterministic: always pick the most likely token
        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
        "top_k": 1,  # Only consider the single most likely token
-       "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+       "max_output_tokens": 65536,  # Doubled to handle larger lineage tables and sequences
    }
 
    # For Gemini 2.5 Flash, disable thinking tokens to save costs
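
For reference, a config like this plugs into the google-generativeai SDK roughly as below (a minimal sketch; the model name and API-key handling are placeholders, not taken from this diff):

import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# temperature=0 plus top_k=1 makes reruns as deterministic as the API allows,
# and the doubled max_output_tokens leaves room for long lineage tables.
model = genai.GenerativeModel(
    "gemini-2.5-flash",  # placeholder model name
    generation_config={
        "temperature": 0.0,
        "top_p": 1.0,
        "top_k": 1,
        "max_output_tokens": 65536,
    },
)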
@@ -760,13 +758,24 @@ mutations were introduced){campaign_specific}. Pay attention to the provided con
 ensure the locations you return are actually lineage locations with variants and mutations.
 
 Respond with a JSON array of objects, each containing:
-- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
- "type": one of "table", "figure", "section"
- "confidence": your confidence score (0-100) that this location contains lineage data
- "reason": brief explanation of why this location likely contains lineage data
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or the supplementary information
+- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
 {campaign_field}
-IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
-NOT page numbers. Focus on the actual figure/table titles and numbers.
+CRITICAL INSTRUCTIONS:
+1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+   - Copy the exact characters, including all punctuation (periods, colons, pipes, etc.), up to the first space after the identifier
+   - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+   - This should be the complete caption as it appears in the document
+   - Include at least 200-300 characters to ensure unique matching
+3. For each location, specify whether it is in the main manuscript or the supplementary information (SI):
+   - Items like "Table S1", "Figure S2", etc. are typically in the SI
+   - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+   - If uncertain, use context clues from the text
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figures showing complete variant lineages.
@@ -776,9 +785,9 @@ Don't include oligonucleotide results or results from only one round.
 
 Example output:
 [
-{{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
-{{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
-{{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+{{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
+{{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
+{{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
 ]
 """.strip()
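
Each element of this array is consumed downstream as a plain dict; a minimal sketch of the new fields and how later code in this diff reads them (values shortened for illustration):

# Illustrative parsed element from the location-identification response
location = {
    "location": "Table S1.",   # verbatim identifier, punctuation preserved
    "type": "table",
    "confidence": 95,
    "reason": "Variant lineage table",
    "source": "si",            # new in 0.6.2: selects SI vs. manuscript text
    "caption": "Table S1. Summary of mutations introduced during...",
}

# Mirrors the location.get(...) calls added later in this diff:
use_si_text = location.get("source", "manuscript") == "si"
caption_hint = location.get("caption", "")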
 
@@ -956,6 +965,9 @@ def identify_evolution_locations(
        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
        if hasattr(camp, 'notes') and camp.notes:
            campaign_context += f"- Key identifiers: {camp.notes}\n"
+       if hasattr(camp, 'data_locations') and camp.data_locations:
+           campaign_context += f"- KNOWN DATA LOCATIONS: {', '.join(camp.data_locations)}\n"
+           campaign_context += " IMPORTANT: Prioritize these known locations highly!\n"
        campaign_specific = f" for the '{camp.campaign_name}' campaign"
        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
@@ -964,7 +976,10 @@ def identify_evolution_locations(
        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
        for camp in campaigns:
            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+           if hasattr(camp, 'data_locations') and camp.data_locations:
+               campaign_context += f"  Known locations: {', '.join(camp.data_locations)}\n"
        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+       campaign_context += "IMPORTANT: Prioritize the known locations listed above!\n"
        campaign_specific = " for any of the identified campaigns"
        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
        campaign_example = ', "campaign_id": "campaign_id_here"'
@@ -1041,6 +1056,7 @@ def extract_complete_lineage(
    campaign_id: Optional[str] = None,
    campaign_info: Optional[Campaign] = None,
    pdf_paths: Optional[List[Path]] = None,
+   location_str: Optional[str] = None,
 ) -> List[Variant]:
    """Prompt Gemini for the full lineage and return a list[Variant]."""
    # Build campaign context
@@ -1060,6 +1076,21 @@ IMPORTANT:
 2. Include "campaign_id": "{campaign_info.campaign_id}" for each variant in your response.
 3. Use the lineage hint pattern above to identify which variants belong to this campaign.
 4. Include parent variants only if they are direct ancestors in this campaign's lineage.
+"""
+
+    # Add location context if provided
+    location_context = ""
+    if location_str:
+        location_context = f"""
+
+LOCATION CONTEXT:
+You are extracting data SPECIFICALLY from: {location_str}
+
+CRITICAL INSTRUCTIONS:
+- ONLY extract enzyme variants that appear in {location_str}
+- DO NOT include variants from other figures, tables, or sections
+- If {location_str} references variants from other locations, DO NOT include those unless they are explicitly shown in {location_str}
+- Focus strictly on the data presented within the boundaries of {location_str}
 """
 
    # Extract table of contents from PDFs if available
@@ -1096,8 +1127,11 @@ IMPORTANT:
    # Include TOC in the prompt text
    combined_text = toc_text + text if toc_text else text
 
+   # Combine campaign and location context
+   full_context = campaign_context + location_context
+
    prompt = _LINEAGE_EXTRACT_PROMPT.format(
-       campaign_context=campaign_context,
+       campaign_context=full_context,
        schema=_LINEAGE_SCHEMA_HINT,
        text=combined_text[:MAX_CHARS],
    )
@@ -1438,10 +1472,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
 
 # ---- 6.4 Public API -------------------------------------------------------
 
-def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
-    """Extract text from a specific location (table, section, etc.) in the full text."""
+def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text.
+
+    Args:
+        full_text: The full text to search in
+        location: The location identifier (e.g., "Table S1")
+        location_type: Type of location ("table", "figure", "section")
+        caption_hint: Optional full caption text for fuzzy matching
+    """
    import re
 
+    # If a caption hint is provided, try fuzzy matching first
+    if caption_hint and len(caption_hint) > 20:
+        log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+        # Normalize texts for better matching (similar to reaction_info_extractor)
+        def normalize_for_matching(text):
+            # Remove extra whitespace, normalize spaces around punctuation
+            text = ' '.join(text.split())
+            # Normalize different dash types
+            text = text.replace('–', '-').replace('—', '-')
+            return text
+
+        normalized_hint = normalize_for_matching(caption_hint[:150])  # Use first 150 chars
+        normalized_text = normalize_for_matching(full_text)
+
+        # Try to find ALL caption matches using character-based fuzzy matching
+        all_matches = []
+
+        # Slide through the text looking for all matches above the threshold
+        hint_len = len(normalized_hint)
+        for i in range(len(normalized_text) - hint_len + 1):
+            snippet = normalized_text[i:i + hint_len]
+            # Simple character-based similarity
+            matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+            score = matches / hint_len
+
+            if score > 0.7:  # 70% similarity threshold
+                all_matches.append({
+                    'norm_pos': i,
+                    'score': score
+                })
+
+        # If we found matches, extract from all of them
+        if all_matches:
+            log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+            # Collect all occurrences from fuzzy matches
+            all_occurrences = []
+            seen_positions = set()
+
+            for match_info in all_matches:
+                # Get the matched text from the normalized version
+                matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+                # Find where this appears in the original text
+                best_original_pos = -1
+
+                # Search the original text for this specific match
+                for i in range(len(full_text) - len(caption_hint) + 1):
+                    if i in seen_positions:
+                        continue
+
+                    original_snippet = full_text[i:i + len(caption_hint)]
+                    # Normalize and compare
+                    normalized_snippet = normalize_for_matching(original_snippet)
+                    if normalized_snippet[:hint_len] == matched_normalized:
+                        # Found an exact match after normalization
+                        best_original_pos = i
+                        seen_positions.add(i)
+                        break
+
+                if best_original_pos >= 0:
+                    # Extract generous context around this match position
+                    start = max(0, best_original_pos - 1000)
+                    end = min(len(full_text), best_original_pos + 10000)
+                    context = full_text[start:end]
+
+                    all_occurrences.append({
+                        'position': best_original_pos,
+                        'context': context,
+                        'score': match_info['score']
+                    })
+                    log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+            if all_occurrences:
+                # Sort by position to maintain document order
+                all_occurrences.sort(key=lambda x: x['position'])
+
+                # Combine all occurrences
+                combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+                for i, occurrence in enumerate(all_occurrences, 1):
+                    combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+                    combined_text += occurrence['context']
+                    combined_text += "\n\n"
+
+                # Apply the same limit as table extraction
+                if len(combined_text) > 150000:
+                    combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+                log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+                return combined_text
+            else:
+                log.warning("Could not map any fuzzy matches back to the original text")
+        else:
+            log.warning("No fuzzy matches found for caption above the 70% threshold")
+
    if location_type == 'table':
        # Find ALL mentions of this table and combine them
        location_clean = location.strip()
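
The sliding-window score above is a Hamming-style ratio: it only credits characters at identical offsets, so it tolerates substitutions but not insertions, and it costs O(len(text) × len(hint)) in the worst case. A standalone sketch of the same scoring, with stdlib difflib shown as an insertion-tolerant alternative (illustrative, not part of the diff):

from difflib import SequenceMatcher

def hamming_ratio(a: str, b: str) -> float:
    """Fraction of aligned positions where two equal-length strings agree."""
    return sum(x == y for x, y in zip(a, b)) / len(a)

hint = "Table S1. Summary of mutations introduced during directed evolution"
text = "... Table  S1.  Summary of mutations introduced during directed evolution of PA-G8 ..."

def normalize(s: str) -> str:
    return ' '.join(s.split())

needle, haystack = normalize(hint), normalize(text)

score, pos = max(
    (hamming_ratio(needle, haystack[i:i + len(needle)]), i)
    for i in range(len(haystack) - len(needle) + 1)
)
print(score, pos)  # windows scoring above 0.7 count as caption hits

# difflib also survives inserted/deleted characters, not just substitutions:
m = SequenceMatcher(None, needle, haystack).find_longest_match(0, len(needle), 0, len(haystack))
print(haystack[m.b:m.b + m.size])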
@@ -1483,6 +1621,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
 
        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
 
+       # Sort occurrences by position to maintain document order
+       all_occurrences.sort(key=lambda x: x['position'])
+
        # Combine all occurrences into one text for Gemini to analyze
        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
 
@@ -1492,8 +1633,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
            combined_text += "\n\n"
 
        # Limit total length to avoid overwhelming the model
-       if len(combined_text) > 50000:
-           combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+       # Increased limit to ensure actual table content is included
+       if len(combined_text) > 150000:
+           combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
 
        return combined_text
 
@@ -1577,6 +1719,8 @@ def get_lineage(
    *,
    pdf_paths: Optional[List[Path]] = None,
    debug_dir: str | Path | None = None,
+   manuscript_text: Optional[str] = None,
+   si_text: Optional[str] = None,
 ) -> Tuple[List[Variant], List[Campaign]]:
    """
    High-level wrapper used by the pipeline.
@@ -1690,8 +1834,21 @@ def get_lineage(
        if location_type in ['table', 'text', 'section'] and not extracted_variants:
            log.info(f"Attempting text extraction for {location_type}: {location_str}")
 
-           # Extract the specific section/table from full text
-           section_text = _extract_location_text(full_text, location_str, location_type)
+           # Determine which text to use based on source
+           location_source = location.get('source', 'manuscript')
+           if location_source == 'si' and si_text:
+               text_to_search = si_text
+               log.info(f"Using SI text for location {location_str}")
+           elif location_source == 'manuscript' and manuscript_text:
+               text_to_search = manuscript_text
+               log.info(f"Using manuscript text for location {location_str}")
+           else:
+               text_to_search = full_text
+               log.info(f"Using combined text for location {location_str} (fallback)")
+
+           # Extract the specific section/table from the appropriate text
+           caption_hint = location.get('caption', '')
+           section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
            if section_text:
                log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
                # Save extracted section if debug enabled
@@ -1705,7 +1862,8 @@ def get_lineage(
                    debug_dir=debug_dir,
                    campaign_id=campaign.campaign_id,
                    campaign_info=campaign,
-                   pdf_paths=pdf_paths
+                   pdf_paths=pdf_paths,
+                   location_str=location_str
                )
                if variants:
                    log.info(f"Extracted {len(variants)} variants from {location_type}")
@@ -2004,17 +2162,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
 
 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
-- "section": the section heading or description
+- "section": the section heading or description EXACTLY as it appears
 - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or the supplementary information
+- "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)
 
 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences
 
-CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
-- Correct: "53", "S12", "147"
-- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+CRITICAL:
+1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+   - Correct: "53", "S12", "147"
+   - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+2. For each location, specify whether it is in the main manuscript or the supplementary information (SI):
+   - Pages with an "S" prefix (e.g., "S53") are typically in the SI
+   - Regular page numbers (e.g., "53") are typically in the main manuscript
+   - Use context clues from the document structure
 
 Return [] if no sequence sections are found.
 Absolutely don't include nucleotide or primer sequences; it is better to return nothing than an incomplete sequence. Use your best judgement.
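
A hedged sketch of one element this prompt should yield, and why the bare page-number format matters (the S-prefix arithmetic below mirrors the page-walking code later in this diff; the values are illustrative):

# Illustrative parsed element from the sequence-location response
seq_location = {
    "section": "Nucleotide and amino acid sequences",
    "page": "S53",     # plain or S-prefixed number only - no "p."/"page"
    "source": "si",
    "caption": "Appendix 4. Nucleotide and amino acid sequences of evolved variants...",
}

# Page walking relies on the bare format:
page = seq_location["page"]
if page.upper().startswith("S"):
    next_page = f"S{int(page[1:]) + 1}"   # "S53" -> "S54"
else:
    next_page = str(int(page) + 1)        # "53"  -> "54"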
@@ -2254,44 +2419,34 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
 
 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract EVERY distinct enzyme-variant sequence you can find in the text.
-
-IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
-- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
-- Only extract dna_seq if NO amino acid sequence is available for that variant
-- This reduces redundancy since protein sequences are usually more relevant
-
-CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
-- Papers often use different naming conventions in different sections
-- DO NOT normalize or simplify variant IDs
-- Extract the variant_id exactly as written where the sequence appears
-- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
-
-SEQUENCE EXTRACTION RULES:
-- Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids or nucleotides
-- Preserve the exact length and character sequence
-- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids or nucleotides are copied correctly
-
-For each variant return:
-  * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq     - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
-  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
-
-Respond ONLY with **minified JSON** that matches the schema below.
-NO markdown, no code fences, no commentary.
-
-Schema:
-```json
-{schema}
-```
+Extract ALL enzyme variant sequences from the text.
+
+Rules:
+1. Use EXACT variant IDs as they appear with each sequence
+2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
+3. For each variant:
+   - If an amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
+   - If ONLY a DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
+   - NEVER include both aa_seq and dna_seq for the same variant
+   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
+4. Return ONLY minified JSON, no markdown or commentary
+
+CRITICAL SEQUENCE PRIORITY RULE:
+- If you find BOTH an amino acid sequence AND a DNA sequence for the same variant, ONLY return the amino acid sequence
+- Set dna_seq to null when aa_seq is available, even if a DNA sequence is present in the text
+- Only return dna_seq when NO amino acid sequence exists for that variant
+
+CRITICAL ACCURACY REQUIREMENTS:
+- Extract ONLY sequences that are explicitly present in the provided text
+- DO NOT generate, infer, or hallucinate any sequences
+- Every character in the sequence must be directly copied from the text
+- If a sequence appears truncated or incomplete in the text, extract only what is shown
+- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+
+Schema: {schema}
 
-TEXT (may be truncated):
-```
+TEXT:
 {text}
-```
 """.strip()
 
 def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
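
The {schema} placeholder is filled from a schema-hint constant defined elsewhere in the module and not shown in this diff. As an illustration only, a response conforming to the rules above would be minified JSON like the following (the variant IDs and sequences are placeholders, not real data):

import json

raw = ('[{"variant_id":"PA-G8 R3","aa_seq":"MTAQESKLH","dna_seq":null},'
       '{"variant_id":"WT","aa_seq":null,"dna_seq":"ATGACTGCT"}]')

for entry in json.loads(raw):
    # Exactly one of aa_seq / dna_seq is set, per the priority rule above
    assert (entry["aa_seq"] is None) != (entry["dna_seq"] is None)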
2366
2521
 
2367
2522
 
2368
2523
  def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
2369
- """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
2524
+ """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.
2370
2525
 
2371
2526
  Can exit early after 2 attempts if the responses match exactly.
2372
2527
 
@@ -2380,9 +2535,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
        The most common sequence JSON data or None if all attempts failed
    """
    responses = []
-   max_attempts = 6
+   max_attempts = 3  # Reduced from 6 to 3 for performance
 
-   # Try 6 times with early match detection
+   # Try 3 times with early match detection
    for attempt in range(max_attempts):
        try:
            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2408,8 +2563,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
 
            # Try to parse as JSON
            try:
-               parsed = json.loads(raw)
-           except json.JSONDecodeError:
+               # First clean the response - remove any BOM or invisible characters
+               raw_clean = raw.strip()
+               if raw_clean.startswith('\ufeff'):  # Remove BOM if present
+                   raw_clean = raw_clean[1:]
+               parsed = json.loads(raw_clean)
+           except json.JSONDecodeError as e:
+               log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
                # Look for JSON array or object in the response
                json_start = -1
                json_end = -1
@@ -2458,17 +2618,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
            responses.append(parsed)
            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
 
-           # Early match detection after 2 attempts
-           if attempt >= 1:  # After 2nd attempt (0-indexed)
-               valid_responses_so_far = [r for r in responses if r is not None]
-               if len(valid_responses_so_far) >= 2:
-                   # Check if the last two valid responses match
-                   if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
-                       log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
-                       # Add the matching response 4 more times to simulate consensus
-                       for _ in range(max_attempts - attempt - 1):
-                           responses.append(valid_responses_so_far[-1])
-                       break
+           # If we got a good response with sequences, we can check for early termination
+           if isinstance(parsed, list) and len(parsed) > 0:
+               # Early match detection after 2 attempts
+               if attempt >= 1:  # After 2nd attempt (0-indexed)
+                   valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+                   if len(valid_responses_so_far) >= 2:
+                       # Check if the last two valid responses match
+                       if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                           log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                           # Add the matching response to fill the remaining attempts
+                           for _ in range(max_attempts - attempt - 1):
+                               responses.append(valid_responses_so_far[-1])
+                           break
+               # If this is the first attempt and we got sequences, continue to validate with at least one more
+               elif attempt == 0 and len(parsed) > 5:  # Got substantial sequences on first try
+                   log.info("Got substantial sequences on first attempt, will validate with one more")
 
        except Exception as e:
            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
@@ -2828,9 +2993,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
    focused_text = ""
    if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
        page_num = best_location['page']
-       # Extract current page plus next 15 pages
+       # Extract current page plus next 5 pages (6 total) to prevent hallucination
        all_pages = []
-       for i in range(16):  # Current + next 15
+       for i in range(6):  # Current + next 5 (6 pages total)
            if isinstance(page_num, str) and page_num.upper().startswith('S'):
                next_page = f"S{int(page_num[1:]) + i}"
            else:
@@ -2842,7 +3007,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                break
        if all_pages:
            focused_text = "\n".join(all_pages)
-           log.info("Extracted %d chars from pages %s through %d more pages",
+           log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
                     len(focused_text), page_num, len(all_pages) - 1)
 
    # Fallback to text search if page extraction didn't work
@@ -3128,6 +3293,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
    return {}
 
 
+def _match_variant_ids_with_gemini(
+    lineage_variant_ids: List[str],
+    pdb_variant_ids: List[str],
+    model
+) -> Dict[str, str]:
+    """Use Gemini to match variant IDs that may have slight formatting differences.
+
+    Args:
+        lineage_variant_ids: List of variant IDs from the lineage
+        pdb_variant_ids: List of variant IDs from PDB matching
+        model: Gemini model for matching
+
+    Returns:
+        Dictionary mapping lineage_variant_id -> pdb_variant_id
+    """
+    if not lineage_variant_ids or not pdb_variant_ids or not model:
+        return {}
+
+    # If the lists are identical, return a direct mapping
+    if set(lineage_variant_ids) == set(pdb_variant_ids):
+        return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+    # Use Gemini to match variant IDs that may have formatting differences
+    prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+These represent the same enzyme variants but may be formatted differently.
+
+Lineage variant IDs:
+{json.dumps(lineage_variant_ids, indent=2)}
+
+PDB variant IDs:
+{json.dumps(pdb_variant_ids, indent=2)}
+
+Match variants that represent the SAME enzyme variant, accounting for:
+- Whitespace differences (extra spaces, tabs)
+- Character encoding differences
+- Minor formatting variations
+
+Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+Format: {{"lineage_id": "pdb_id", ...}}
+Only include matches you are confident represent the same variant.
+Return an empty object {{}} if no matches can be confidently made.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text.startswith("```"):
+            text = text.split("```")[1].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        # Clean up the text
+        text = text.strip()
+        if not text or text == "{}":
+            return {}
+
+        matches = json.loads(text)
+        log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+        # Validate matches
+        valid_matches = {}
+        for lineage_id, pdb_id in matches.items():
+            if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+                valid_matches[lineage_id] = pdb_id
+                log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+            else:
+                log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+        return valid_matches
+
+    except Exception as e:
+        log.warning(f"Failed to match variant IDs with Gemini: {e}")
+        return {}
+
+
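A usage sketch for the new helper (the IDs below are hypothetical; in the pipeline it is called with the lineage's variant IDs and the keys of variant_to_chain):

# Hypothetical inputs: same variants, different whitespace
lineage_ids = ["ApePgb GLVRSQL", "PA-G8 R3"]
pdb_ids = ["ApePgb  GLVRSQL", "PA-G8 R3"]   # note the doubled space

mapping = _match_variant_ids_with_gemini(lineage_ids, pdb_ids, model)
# -> {"ApePgb GLVRSQL": "ApePgb  GLVRSQL", "PA-G8 R3": "PA-G8 R3"}
# Identical ID sets short-circuit to a direct mapping without an API call.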
 def match_pdb_to_variants(
    pdb_sequences: Dict[str, str],
    variants: List[Variant],
@@ -3211,24 +3453,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
        text = _extract_text(response).strip()
 
        # Parse JSON response (expecting a single string)
-       if text.startswith("```"):
+       # Look for JSON code blocks first
+       if "```json" in text:
+           # Extract content between ```json and ```
+           import re
+           json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+           if json_match:
+               json_content = json_match.group(1).strip()
+               try:
+                   # Parse as JSON and extract the string value
+                   parsed = json.loads(json_content)
+                   matched_variant = str(parsed).strip('"\'')
+               except:
+                   # If JSON parsing fails, try to extract the quoted string
+                   quoted_match = re.search(r'"([^"]+)"', json_content)
+                   if quoted_match:
+                       matched_variant = quoted_match.group(1)
+                   else:
+                       matched_variant = json_content.strip('"\'')
+           else:
+               matched_variant = text.strip('"\'')
+       elif text.startswith("```"):
+           # Handle other code blocks
            text = text.split("```")[1].strip()
            if text.startswith("json"):
                text = text[4:].strip()
+           matched_variant = text.strip('"\'')
+       else:
+           # Look for quoted strings in the response
+           import re
+           quoted_match = re.search(r'"([^"]+)"', text)
+           if quoted_match:
+               matched_variant = quoted_match.group(1)
+           else:
+               # Remove quotes if present
+               matched_variant = text.strip('"\'')
 
-       # Remove quotes if present
-       text = text.strip('"\'')
-
-       matched_variant = text
+       log.info(f"Extracted variant name: '{matched_variant}' from response")
        log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")
 
        # Return mapping with all chains pointing to the same variant
        mapping = {}
-       if matched_variant and any(v.variant_id == matched_variant for v in variants):
-           for chain_id in pdb_sequences:
-               mapping[matched_variant] = chain_id
-               break  # Only use the first chain
+       if matched_variant:
+           # Debug logging
+           variant_ids = [v.variant_id for v in variants]
+           log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+           # Check if the matched variant exists in the lineage
+           found_variant = any(v.variant_id == matched_variant for v in variants)
+           log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+           if found_variant:
+               for chain_id in pdb_sequences:
+                   mapping[matched_variant] = chain_id
+                   log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+                   break  # Only use the first chain
+           else:
+               log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+               # Try fuzzy matching
+               for variant in variants:
+                   if variant.variant_id.strip() == matched_variant.strip():
+                       log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+                       for chain_id in pdb_sequences:
+                           mapping[variant.variant_id] = chain_id
+                           log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+                           break
+                       break
+       else:
+           log.warning("No matched variant extracted from response")
 
+       log.info(f"Final mapping result: {mapping}")
        return mapping
 
    except Exception as e:
@@ -3364,6 +3658,9 @@ Only match variants that represent the SAME enzyme, accounting for different nam
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
 Only include matches you are confident represent the same variant.
+
+DO NOT include any explanation, reasoning, or text other than the JSON object.
+Response must be valid JSON that starts with {{ and ends with }}
 """
 
    try:
@@ -3406,17 +3703,28 @@ Only include matches you are confident represent the same variant.
            log.error(f"Full cleaned text: {text}")
            # Try to extract JSON from within the response
            import re
-           json_match = re.search(r'\{.*\}', text, re.DOTALL)
-           if json_match:
+           # First try to find JSON in code blocks
+           code_block_match = re.search(r'```json\s*(\{[^`]*\})\s*```', text, re.DOTALL)
+           if code_block_match:
                try:
-                   matches = json.loads(json_match.group(0))
-                   log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
+                   matches = json.loads(code_block_match.group(1))
+                   log.info(f"Successfully extracted JSON from code block: {len(matches)} matches")
                except json.JSONDecodeError:
-                   log.error("Failed to extract JSON from response")
+                   log.error("Failed to parse JSON from code block")
                    matches = {}
            else:
-               log.error("No JSON object found in response")
-               matches = {}
+               # Try to find standalone JSON object (non-greedy, looking for balanced braces)
+               json_match = re.search(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', text)
+               if json_match:
+                   try:
+                       matches = json.loads(json_match.group(1))
+                       log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
+                   except json.JSONDecodeError:
+                       log.error("Failed to extract JSON from response")
+                       matches = {}
+               else:
+                   log.error("No JSON object found in response")
+                   matches = {}
 
    # Create a mapping of sequence IDs to their data for efficient lookup
    seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
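
The fallback pattern \{[^{}]*(?:\{[^{}]*\}[^{}]*)*\} matches a JSON object with at most one level of nesting - enough for the flat mapping this prompt requests, without over-capturing surrounding prose the way the old greedy \{.*\} could. A standalone demonstration (illustrative, not from the diff):

import json
import re

PATTERN = re.compile(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})')

reply = 'Sure! Here is the mapping: {"G8": "seq_G8", "R3": "seq_R3"} Hope that helps.'
print(json.loads(PATTERN.search(reply).group(1)))   # {'G8': 'seq_G8', 'R3': 'seq_R3'}

# One nested level still matches; deeper nesting falls back to the inner object:
assert PATTERN.search('{"a": {"b": 1}}').group(1) == '{"a": {"b": 1}}'
assert PATTERN.search('{"a": {"b": {"c": 1}}}').group(1) == '{"b": {"c": 1}}'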
@@ -3596,14 +3904,28 @@ def run_pipeline(
    caption_text = limited_caption_concat(*pdf_paths)
    full_text = limited_concat(*pdf_paths)
 
+   # Also load separate texts for manuscript and SI
+   manuscript_text = limited_concat(manuscript) if manuscript else None
+   si_text = limited_concat(si_path) if si_path else None
+
    log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
             len(caption_text), len(full_text))
+   if manuscript_text:
+       log.info("Loaded %d chars from manuscript", len(manuscript_text))
+   if si_text:
+       log.info("Loaded %d chars from SI", len(si_text))
 
    # 2. Connect to Gemini -----------------------------------------------------
    model = get_model()
 
    # 3. Extract lineage (Section 6) ------------------------------------------
-   lineage, campaigns = get_lineage(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+   lineage, campaigns = get_lineage(
+       caption_text, full_text, model,
+       pdf_paths=pdf_paths,
+       debug_dir=debug_dir,
+       manuscript_text=manuscript_text,
+       si_text=si_text
+   )
 
    if not lineage:
        raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3683,12 +4005,40 @@ def run_pipeline(
                pdb_sequences, lineage, full_text, model, pdb_id
            )
 
+           log.info(f"PDB matching result: {variant_to_chain}")
+           log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+           log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
            # Convert to SequenceBlock objects
            pdb_seq_blocks = []
-           for variant in lineage:
-               if variant.variant_id in variant_to_chain:
-                   chain_id = variant_to_chain[variant.variant_id]
-                   if chain_id in pdb_sequences:
+
+           # Use Gemini-based matching for robust variant ID comparison
+           if variant_to_chain and model:
+               # Create a mapping using Gemini for robust string matching
+               gemini_mapping = _match_variant_ids_with_gemini(
+                   lineage_variant_ids=[v.variant_id for v in lineage],
+                   pdb_variant_ids=list(variant_to_chain.keys()),
+                   model=model
+               )
+
+               for variant in lineage:
+                   log.info(f"Processing variant: {variant.variant_id}")
+
+                   # Try direct match first
+                   chain_id = variant_to_chain.get(variant.variant_id)
+                   log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+                   # If no direct match, try Gemini-based matching
+                   if not chain_id:
+                       matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+                       log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+                       if matched_pdb_variant:
+                           chain_id = variant_to_chain.get(matched_pdb_variant)
+                           log.info(f"Chain ID from Gemini match: {chain_id}")
+
+                   if chain_id and chain_id in pdb_sequences:
+                       seq_length = len(pdb_sequences[chain_id])
+                       log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
                        seq_block = SequenceBlock(
                            variant_id=variant.variant_id,
                            aa_seq=pdb_sequences[chain_id],
@@ -3699,6 +4049,26 @@ def run_pipeline(
                        )
                        pdb_seq_blocks.append(seq_block)
                        log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+                   else:
+                       log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+           else:
+               # Fallback to direct matching if no model or no matches
+               for variant in lineage:
+                   if variant.variant_id in variant_to_chain:
+                       chain_id = variant_to_chain[variant.variant_id]
+                       if chain_id in pdb_sequences:
+                           seq_block = SequenceBlock(
+                               variant_id=variant.variant_id,
+                               aa_seq=pdb_sequences[chain_id],
+                               dna_seq=None,
+                               confidence=1.0,  # High confidence for PDB sequences
+                               truncated=False,
+                               metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+                           )
+                           pdb_seq_blocks.append(seq_block)
+                           log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+           log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")
 
            if pdb_seq_blocks:
                # Update the dataframe with PDB sequences
@@ -3708,8 +4078,13 @@ def run_pipeline(
                        df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
                        df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
                        df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+                       log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+                   else:
+                       log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
                log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                break
+           else:
+               log.warning(f"No PDB sequence blocks were created for {pdb_id}")
        else:
            log.warning(f"No sequences found in PDB {pdb_id}")
    else: