debase 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,7 @@ import pandas as pd
 import networkx as nx  # light dependency, used only for generation inference

 import os
+import fitz
 import re
 import json
 import time
@@ -460,8 +461,32 @@ def get_model():
     if not api_key:
         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
     _genai.configure(api_key=api_key)
-    # Positional constructor arg works for both SDK flavors
-    return _genai.GenerativeModel(MODEL_NAME)
+
+    # Create generation config to optimize performance and costs
+    generation_config = {
+        "temperature": 0.0,  # Deterministic: always pick the most likely token
+        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
+        "top_k": 1,  # Only consider the single most likely token
+        "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+    }
+
+    # For Gemini 2.5 Flash, disable thinking tokens to save costs
+    # thinking_budget=0 disables thinking, -1 enables dynamic thinking (default)
+    # Only add if SDK supports it to maintain compatibility
+    try:
+        # Test if thinking_budget is supported by making a minimal API call
+        test_config = {"thinking_budget": 0, "max_output_tokens": 10}
+        test_model = _genai.GenerativeModel(MODEL_NAME, generation_config=test_config)
+        # Actually test the API call to see if thinking_budget is supported
+        test_response = test_model.generate_content("Return 'OK'")
+        # If no error, add thinking_budget to main config
+        generation_config["thinking_budget"] = 0
+        log.debug("Disabled thinking tokens (thinking_budget=0)")
+    except Exception as e:
+        # SDK doesn't support thinking_budget, continue without it
+        log.debug(f"thinking_budget not supported: {e}")
+
+    return _genai.GenerativeModel(MODEL_NAME, generation_config=generation_config)

 # === 5.3 Unified call helper ----------------------------------------------

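Note on the new get_model(): the thinking_budget probe above issues a real generate_content call every time get_model() runs. A minimal sketch of memoizing that probe so the extra request is paid at most once per process; the helper name and caching scheme are illustrative, not part of debase:

    import functools

    @functools.lru_cache(maxsize=1)
    def _probe_thinking_support(model_name: str) -> bool:
        # Returns True if the installed SDK accepts thinking_budget (assumption:
        # unsupported configs raise on construction or on generate_content).
        try:
            probe = _genai.GenerativeModel(
                model_name,
                generation_config={"thinking_budget": 0, "max_output_tokens": 10},
            )
            probe.generate_content("Return 'OK'")
            return True
        except Exception:
            return False

get_model() could then consult _probe_thinking_support(MODEL_NAME) before adding the key, preserving the compatibility fallback while avoiding one probe call per model construction.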
@@ -728,22 +753,24 @@ Return a JSON object with:
 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
 {campaign_context}
-Given the following article text, list up to {max_results} *locations* (page
-numbers, figure/table IDs, or section headings) that you would review first to
-find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced){campaign_specific}.
+Given the following article text, list up to {max_results} *locations* (figure/table IDs
+or section headings) that you would review first to find the COMPLETE evolutionary
+lineage of enzyme variants (i.e. which variant came from which parent and what
+mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
+ensure the location you return are actually lineage location with variants and mutations.

 Respond with a JSON array of objects, each containing:
-- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
-- "type": one of "table", "figure", "text", "section"
+- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
 {campaign_field}
-IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
+IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
+NOT page numbers. Focus on the actual figure/table titles and numbers.

 Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than figure showing complete variant lineages.
-Text sections is used when no suitable tables/figurews exist.
+mutation lists should be ranked higher than figures showing complete variant lineages.
+Sections are used when no suitable tables/figures exist.

 Don't include oligonucleotide results or result from only one round.

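For reference, a reply satisfying the revised prompt would be shaped like this (hypothetical values, written as a Python literal):

    expected_locations = [
        {"location": "Table S1", "type": "table", "confidence": 95,
         "reason": "Lists each variant with its parent and mutations"},
        {"location": "Figure 2B", "type": "figure", "confidence": 70,
         "reason": "Directed-evolution lineage tree"},
    ]

Note that 0.5.0 drops "text" from the allowed types and no longer accepts bare page numbers as locations.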
@@ -1713,7 +1740,6 @@ def get_lineage(
     for pdf_path in pdf_paths:
         # Extract first few pages looking for TOC
         try:
-            import fitz  # PyMuPDF
            doc = fitz.open(pdf_path)
            toc_text = ""
            for page_num in range(min(5, doc.page_count)):  # First 5 pages
@@ -2011,7 +2037,7 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non

 # --- 7.2 Page-based extraction helper ---------------------------------------
 def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
-    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+    """Extract plain text sequence using Gemini with 6 attempts, returning most common result.

     Args:
         prompt: The prompt to send to Gemini
@@ -2019,12 +2045,12 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         context: Additional context for logging (e.g., "validation" or "extraction")

     Returns:
-        The validated sequence or None if no consensus
+        The most common sequence or None if all attempts failed
     """
     sequences = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6

-    # Try up to 5 times
+    # Try 6 times
     for attempt in range(max_attempts):
         try:
             response = model.generate_content(prompt)
@@ -2050,38 +2076,14 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         except Exception as e:
             log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
             sequences.append("ERROR")
-
-        # Check for early consensus after 2 attempts
-        if len(sequences) == 2:
-            # Clean sequences before comparison
-            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
-            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
-
-            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus reached after 2 attempts")
-                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
-            else:
-                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")

-    # After all attempts, find consensus
+    # After all attempts, find most common result
     valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]

     if not valid_sequences:
         log.error(f"All {max_attempts} {context} attempts failed")
         return None

-    # Find any matching pair
-    for i in range(len(sequences)):
-        for j in range(i + 1, len(sequences)):
-            # Clean sequences before comparison
-            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
-            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
-
-            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
-                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
-
-    # If no exact match, use adaptive validation
     # Count occurrences of each valid sequence
     sequence_counts = {}
     for seq in valid_sequences:
@@ -2090,80 +2092,16 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         seq_clean = seq.replace(" ", "").replace("\n", "")
         sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1

-    # Return the most common sequence if it appears at least twice
+    # Return the most common sequence
     if sequence_counts:
         most_common = max(sequence_counts.items(), key=lambda x: x[1])
-        if most_common[1] >= 2:
-            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
-            return most_common[0]
+        log.info(f"Gemini {context} most common: sequence appeared {most_common[1]}/{max_attempts} times")
+        return most_common[0]

-    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    log.warning(f"Gemini {context} no valid sequences after {max_attempts} attempts")
     return None


-def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
-    """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
-
-    # Extract mutations from variants
-    mutations = []
-    for variant in variants:
-        if variant.mutations:
-            mutations.extend(variant.mutations)
-
-    if not mutations:
-        return None
-
-    # Take a sample of mutations for validation
-    sample_mutations = mutations[:10]  # Check first 10 mutations
-
-    # First do a quick local check for obvious inconsistencies
-    local_issues = []
-    for mutation in sample_mutations:
-        if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
-            pos = mutation.position - 1  # Convert to 0-indexed
-            if 0 <= pos < len(sequence):
-                actual_aa = sequence[pos]
-                expected_aa = mutation.original
-                if actual_aa != expected_aa:
-                    local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
-
-    if not local_issues:
-        return None  # No obvious issues found
-
-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
-
-    prompt = f"""
-You are validating a protein sequence that was extracted from a scientific paper.
-The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
-
-Original sequence (length {len(sequence)}):
-{sequence}
-
-Known mutations that should be applicable to this sequence:
-{', '.join(str(m) for m in sample_mutations)}
-
-Potential issues detected:
-{chr(10).join(local_issues)}
-
-Please check if the sequence is consistent with these mutations:
-1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
-2. If you find inconsistencies, suggest the most likely correction
-3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
-4. Pay special attention to consecutive identical amino acids that might be OCR errors
-
-Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
-If you cannot determine the correct sequence, return "UNCERTAIN".
-"""
-
-    # Use triple validation
-    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
-
-    if result == "VALID" or result is None:
-        return None  # No changes needed
-    else:
-        log.info(f"Gemini suggested sequence correction (length {len(result)})")
-        return result
-

 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
     """Extract text from a specific page number in the PDFs.
@@ -2331,11 +2269,11 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:

 SEQUENCE EXTRACTION RULES:
 - Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids, or nucleotides
 - Preserve the exact length and character sequence
 - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids are copied correctly
+- Double-check that consecutive identical amino acids or nucleotides are copied correctly

 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
@@ -2356,8 +2294,81 @@ TEXT (may be truncated):
 ```
 """.strip()

+def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
+    """
+    Check if two sequence extraction responses match.
+
+    Args:
+        resp1: First response (list of sequences or dict)
+        resp2: Second response (list of sequences or dict)
+
+    Returns:
+        True if responses match, False otherwise
+    """
+    # Handle None cases
+    if resp1 is None or resp2 is None:
+        return False
+
+    # Both should be the same type
+    if type(resp1) != type(resp2):
+        return False
+
+    # If both are lists
+    if isinstance(resp1, list) and isinstance(resp2, list):
+        # Must have same length
+        if len(resp1) != len(resp2):
+            return False
+
+        # Create normalized sequence sets for comparison
+        seq_set1 = set()
+        seq_set2 = set()
+
+        for seq in resp1:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set1.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        for seq in resp2:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set2.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        return seq_set1 == seq_set2
+
+    # If both are dicts, compare normalized content
+    if isinstance(resp1, dict) and isinstance(resp2, dict):
+        # Normalize and compare
+        return json.dumps(resp1, sort_keys=True) == json.dumps(resp2, sort_keys=True)
+
+    return False
+
+
 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+    """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+
+    Can exit early after 2 attempts if the responses match exactly.

     Args:
         model: The Gemini model instance
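The two normalization loops in _check_sequence_responses_match are byte-for-byte identical. The comparison could be factored through one helper; an illustrative refactor, not the shipped code:

    def _normalize_for_match(resp: list) -> set[str]:
        def clean(s):
            # None-safe whitespace and case normalization
            return (s or "").replace(" ", "").replace("\n", "").upper()
        return {
            f'{seq.get("variant_id", "")}|{clean(seq.get("aa_seq"))}|{clean(seq.get("dna_seq"))}'
            for seq in resp
            if isinstance(seq, dict)
        }

    # The list branch then reduces to:
    # len(resp1) == len(resp2) and _normalize_for_match(resp1) == _normalize_for_match(resp2)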
@@ -2366,12 +2377,12 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         debug_dir: Optional debug directory

     Returns:
-        The validated sequence JSON data or None if no consensus
+        The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6

-    # Try up to 5 times
+    # Try 6 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2443,167 +2454,69 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             else:
                 raise json.JSONDecodeError("No JSON structure found in response", raw, 0)

-            # Store both the original and normalized response
-            normalized_response = _normalize_sequence_response(parsed)
-            responses.append((parsed, normalized_response))
-
-            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+            # Store the response
+            responses.append(parsed)
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
+
+            # Early match detection after 2 attempts
+            if attempt >= 1:  # After 2nd attempt (0-indexed)
+                valid_responses_so_far = [r for r in responses if r is not None]
+                if len(valid_responses_so_far) >= 2:
+                    # Check if the last two valid responses match
+                    if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                        log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                        # Add the matching response 4 more times to simulate consensus
+                        for _ in range(max_attempts - attempt - 1):
+                            responses.append(valid_responses_so_far[-1])
+                        break

         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
             responses.append(None)
-
-        # Check for early consensus after 2 attempts
-        if len(responses) == 2:
-            if (responses[0] and responses[1] and
-                _sequences_match(responses[0][1], responses[1][1])):
-                log.info("Sequence extraction consensus reached after 2 attempts")
-                return responses[0][0]  # Return original parsed data
-            else:
-                log.info("Sequence extraction mismatch after 2 attempts - trying third")

-    # After all attempts, use adaptive validation
+    # After all attempts, find most common sequences
     valid_responses = [r for r in responses if r is not None]

     if not valid_responses:
         log.error(f"All {max_attempts} sequence extraction attempts failed")
         return None

-    # First, try to find exact consensus (any matching pair)
-    for i in range(len(valid_responses)):
-        for j in range(i + 1, len(valid_responses)):
-            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
-                log.info(f"Sequence extraction consensus found: attempts with matching content")
-                return valid_responses[i][0]  # Return original parsed data
-
-    # If no exact consensus, use adaptive validation
-    log.info("No exact consensus found, applying adaptive validation...")
-
-    # Find sequences that appear consistently across multiple attempts
-    consistent_sequences = _find_consistent_sequences(valid_responses)
-
-    if consistent_sequences:
-        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
-        return consistent_sequences
-
-    # If still no consensus, use the attempt with the most sequences
-    best_response = max(valid_responses,
-                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
-
-    if best_response and len(best_response[1]) > 0:
-        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
-        return best_response[0]
-
-    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
-    return None
-
-
-def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
-    """Find sequences that appear consistently across multiple extraction attempts.
-
-    Args:
-        valid_responses: List of (original_data, normalized_data) tuples
-
-    Returns:
-        List of consistent sequences with confidence scores, or None if none found
-    """
-    if not valid_responses:
-        return None
-
-    # Count how many times each sequence appears
+    # Count occurrences of each individual sequence across all attempts
     sequence_counts = {}
-    sequence_full_data = {}
-
-    for original, normalized in valid_responses:
-        if not isinstance(normalized, list):
-            continue
-
-        for seq in normalized:
-            variant_id = seq.get("variant_id", "")
-            aa_seq = seq.get("aa_seq", "")
-            # Clean sequence before using in key
-            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
-
-            # Create a unique key for this sequence
-            key = f"{variant_id}|{aa_seq_clean}"
-
-            if key not in sequence_counts:
-                sequence_counts[key] = 0
-                sequence_full_data[key] = []
-
-            sequence_counts[key] += 1
-
-            # Find the full data for this sequence from the original response
-            if isinstance(original, list):
-                for orig_seq in original:
-                    if (orig_seq.get("variant_id") == variant_id and
-                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
-                        sequence_full_data[key].append(orig_seq)
-                        break
-
-    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
-    min_appearances = max(2, len(valid_responses) // 2)
-    consistent_sequences = []
-
-    for key, count in sequence_counts.items():
-        if count >= min_appearances:
-            # Use the first occurrence of the full data
-            if sequence_full_data[key]:
-                seq_data = sequence_full_data[key][0].copy()
-                # Add confidence based on how many times it appeared
-                seq_data["confidence"] = count / len(valid_responses)
-                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
-                consistent_sequences.append(seq_data)
+    for resp in valid_responses:
+        if isinstance(resp, list):
+            for seq in resp:
+                if isinstance(seq, dict) and "variant_id" in seq:
+                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    variant_id = seq.get("variant_id", "")
+                    aa_seq = seq.get("aa_seq", "")
+                    if aa_seq:
+                        aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                    key = f"{variant_id}|{aa_seq}"
+
+                    if key not in sequence_counts:
+                        sequence_counts[key] = {"count": 0, "data": seq}
+                    sequence_counts[key]["count"] += 1
+
+    # Build result with sequences that appear in at least 3 attempts
+    result = []
+    for key, info in sequence_counts.items():
+        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+            seq_data = info["data"].copy()
+            seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
+            result.append(seq_data)
+            log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
+
+    if result:
+        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        return result

-    return consistent_sequences if consistent_sequences else None
+    # If no sequences appear twice, return the most complete attempt
+    best_attempt = max(valid_responses, key=lambda x: len(x) if isinstance(x, list) else 0)
+    log.warning(f"No consensus sequences found, returning best attempt with {len(best_attempt)} sequences")
+    return best_attempt


-def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
-    """Normalize sequence response for comparison."""
-    if not isinstance(data, list):
-        return []
-
-    normalized = []
-    for item in data:
-        if isinstance(item, dict):
-            # Extract key fields for comparison
-            normalized_item = {
-                "variant_id": item.get("variant_id", ""),
-                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
-                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
-                "confidence": item.get("confidence", 0.0)
-            }
-            normalized.append(normalized_item)
-
-    # Sort by variant_id for consistent comparison
-    return sorted(normalized, key=lambda x: x["variant_id"])
-
-
-def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
-    """Check if two sequence response lists match on key fields."""
-    if len(seq1) != len(seq2):
-        return False
-
-    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
-        # Compare variant IDs
-        if s1.get("variant_id") != s2.get("variant_id"):
-            return False
-
-        # Compare amino acid sequences (most critical)
-        aa1 = s1.get("aa_seq", "")
-        aa2 = s2.get("aa_seq", "")
-        if aa1 and aa2 and aa1 != aa2:
-            return False
-        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
-            return False
-
-        # Compare DNA sequences if present
-        dna1 = s1.get("dna_seq", "")
-        dna2 = s2.get("dna_seq", "")
-        if dna1 and dna2 and dna1 != dna2:
-            return False
-
-    return True


 def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
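One subtlety in the new control flow: on an early match, the loop pads responses with copies of the matching payload before breaking, which is what lets the shared counting stage below clear its count >= 3 cutoff without a separate early-return path. A tiny illustration of the arithmetic:

    responses = ["A", "A"]                 # two agreeing attempts (stand-in payloads)
    attempt, max_attempts = 1, 6           # match detected on the 2nd attempt (0-indexed)
    responses += [responses[-1]] * (max_attempts - attempt - 1)
    assert len(responses) == 6             # counted 6 times downstream, so >= 3 holds

The trade-off is that extraction_confidence then reports 6/6 even though only two real model calls agreed.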
@@ -2624,18 +2537,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt

-    # Add mutation validation context if we have lineage variants with mutations
-    if lineage_variants:
-        mutation_context = _build_mutation_validation_context(lineage_variants)
-        if mutation_context:
-            prompt = f"""{prompt}
-
-CRITICAL MUTATION VALIDATION:
-{mutation_context}
-
-IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
-For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
-"""
+    # Skip mutation validation context

     # Save the complete prompt for debugging
     if debug_dir:
@@ -2662,11 +2564,7 @@ For example, if variant "III" has mutation "A100V" from parent "II", then positi

     extracted_sequences = _parse_sequences(data)

-    # Post-process: validate sequences against mutations if we have lineage info
-    if lineage_variants:
-        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
-        return validated_sequences
-
+    # Return extracted sequences without mutation validation
     return extracted_sequences

 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
@@ -2701,6 +2599,19 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         aa = _clean_seq(entry.get("aa_seq"), _VALID_AA)
         dna = _clean_seq(entry.get("dna_seq"), _VALID_DNA)

+        # Check minimum length requirements
+        # AA sequences should be > 50, DNA sequences should be > 150
+        if aa and len(aa) <= 50:
+            log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
+            aa = None
+        if dna and len(dna) <= 150:
+            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+            dna = None
+
+        # Skip if both sequences are too short or missing
+        if not aa and not dna:
+            continue
+
         conf: float | None = None
         if aa:
             conf = sum(c in _VALID_AA for c in aa) / len(aa)
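The new thresholds treat anything at or below 50 aa or 150 nt as a fragment rather than a usable sequence (the comparison is <=, so the boundary values themselves are dropped). A toy check of the boundary behavior, with hypothetical values:

    aa = "M" * 50
    dna = "ATG" * 50                         # 150 nt
    keep_aa = bool(aa) and len(aa) > 50      # False: exactly 50 aa is dropped
    keep_dna = bool(dna) and len(dna) > 150  # False: exactly 150 nt is dropped

Variants failing both checks are now skipped outright instead of being emitted as empty SequenceBlocks.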
@@ -2943,12 +2854,15 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                    validate_sequences=True
                )

-            if focused_text and len(focused_text) < len(text):
-                log.info("Reduced text from %d to %d chars using validated location",
-                         len(text), len(focused_text))
-            else:
-                log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
-                            len(focused_text) if focused_text else 0, len(text))
+            # Use focused text if we got any content, regardless of size
+            if focused_text:
+                if len(focused_text) < len(text):
+                    log.info("Reduced text from %d to %d chars using validated location",
+                             len(text), len(focused_text))
+                else:
+                    log.info("Extracted focused text (%d chars) from validated location (full text: %d chars)",
+                             len(focused_text), len(text))
+
                # Build lineage context if available
                lineage_context = None
                if lineage_variants:
@@ -2961,6 +2875,8 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                    lineage_context = "\n".join(variant_info)

                return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
+            else:
+                log.warning("Failed to extract focused text from validated location, will use full text")
        else:
            log.warning("Location validation failed or returned invalid location: %s",
                        validation.get("reason", "Unknown"))
@@ -3113,12 +3029,6 @@ If you cannot determine certain fields, set them to null.
             seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
-                # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
-                if validated_seq:
-                    seq = validated_seq
-                    log.info(f"Sequence validated and potentially corrected by Gemini")
-
                 # Map to the first variant or wild-type
                 wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
                 if wt_variant:
@@ -3422,7 +3332,7 @@ def _merge_lineage_and_sequences(
     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")

     # 3. If we have unmatched sequences and a model, use Gemini to match
-    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+    if model and len(df_seq) > 0 and (df['aa_seq'].isna().any() or df['dna_seq'].isna().any()):
         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
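A minimal illustration of the widened trigger with a toy frame (hypothetical data): under 0.4.4, a frame whose aa_seq column was fully populated never started Gemini matching, even if DNA was missing; 0.5.0 also fires on the DNA side:

    import pandas as pd

    df = pd.DataFrame({"variant_id": ["v1", "v2"],
                       "aa_seq": ["MSK...", "MTE..."],   # toy placeholders
                       "dna_seq": [None, "ATG..."]})
    df['aa_seq'].isna().any()                                  # False: 0.4.4 skipped matching
    df['aa_seq'].isna().any() or df['dna_seq'].isna().any()    # True:  0.5.0 proceeds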
@@ -3437,14 +3347,9 @@ def _merge_lineage_and_sequences(
         log.info("Using Gemini to match variants")

         # Build prompt for Gemini
-        prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+        prompt = f"""Match enzyme variant IDs between two lists from the same paper using your best judgment.

-Papers often use different naming conventions for the same variant:
-- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
-- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
-
-Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
-use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
+These IDs come from different sections of the paper and may use different naming conventions for the same variant.

 Lineage variant IDs (need sequences):
 {json.dumps(unmatched_lineage_ids)}
@@ -3452,8 +3357,13 @@ Lineage variant IDs (need sequences):

 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
+IMPORTANT: A variant with mutations (indicated by mutation codes like letters and numbers after an underscore or space) is a DIFFERENT enzyme from its parent. Do not match mutation variants to their base sequences - they are distinct entities with different sequences due to the mutations.
+
+Only match variants that represent the SAME enzyme, accounting for different naming conventions between sections.
+
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
+Only include matches you are confident represent the same variant.
 """

         try:
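Assuming the model's reply parses to the requested {"lineage_id": "sequence_id"} object, applying the mapping could look like the following sketch. Variable names mirror the surrounding merge code, but the parsing and assignment details here are illustrative:

    matches = json.loads(match_reply_text)  # hypothetical: the model's JSON reply
    for lineage_id, seq_id in matches.items():
        src = df_seq[df_seq["variant_id"] == seq_id]
        if not src.empty:
            mask = df["variant_id"] == lineage_id
            df.loc[mask, "aa_seq"] = src.iloc[0]["aa_seq"]
            df.loc[mask, "dna_seq"] = src.iloc[0]["dna_seq"]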
@@ -3733,16 +3643,27 @@ def run_pipeline(
     # 4. Extract sequences (Section 7) ----------------------------------------
     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)

-    # 4a. Try PDB extraction if no sequences found -----------------------------
-    # Check if we need PDB sequences (no sequences or only partial sequences)
-    MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
-    needs_pdb = (not sequences or
-                 all(s.aa_seq is None or (s.aa_seq and len(s.aa_seq) < MIN_PROTEIN_LENGTH)
-                     for s in sequences))
+    # 4a. First try to merge extracted sequences with lineage using Gemini matching
+    # This allows fuzzy matching of complex variant IDs before external lookups
+    doi = extract_doi(manuscript)
+    df_merged = merge_and_score(lineage, sequences, doi, model)
+
+    # 4b. Check if ALL variants are missing sequences after merging
+    # Only try external sources if no sequences were successfully matched
+    all_missing_sequences = True
+    if 'aa_seq' in df_merged.columns or 'dna_seq' in df_merged.columns:
+        for _, row in df_merged.iterrows():
+            has_aa = pd.notna(row.get('aa_seq'))
+            has_dna = pd.notna(row.get('dna_seq'))
+            if has_aa or has_dna:
+                all_missing_sequences = False
+                break

-    if needs_pdb:
-        log.info("No full-length sequences found in paper (only partial sequences < %d AA), attempting PDB extraction...",
-                 MIN_PROTEIN_LENGTH)
+    if all_missing_sequences:
+        MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
+        MIN_DNA_LENGTH = 150  # DNA sequences should be >150 nt
+        log.info("No full-length sequences found in paper (only partial sequences < %d AA or < %d nt), attempting PDB extraction...",
+                 MIN_PROTEIN_LENGTH, MIN_DNA_LENGTH)

         # Extract PDB IDs from all PDFs
         pdb_ids = []
@@ -3780,7 +3701,13 @@ def run_pipeline(
                         log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")

                 if pdb_seq_blocks:
-                    sequences = pdb_seq_blocks
+                    # Update the dataframe with PDB sequences
+                    for seq_block in pdb_seq_blocks:
+                        mask = df_merged['variant_id'] == seq_block.variant_id
+                        if mask.any():
+                            df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                            df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                            df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
                     log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                     break
                 else:
@@ -3788,8 +3715,15 @@ def run_pipeline(
         else:
             log.warning("No PDB IDs found in paper")

-    # 4b. If still no sequences, try Gemini extraction as last resort
-    if not sequences or all(not s.aa_seq for s in sequences):
+    # 4c. If still no sequences after PDB, try Gemini extraction as last resort
+    # Re-check if all variants are still missing sequences
+    still_all_missing = True
+    for _, row in df_merged.iterrows():
+        if pd.notna(row.get('aa_seq')) or pd.notna(row.get('dna_seq')):
+            still_all_missing = False
+            break
+
+    if still_all_missing:
         log.info("No sequences from PDB, attempting Gemini-based extraction...")

         gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
@@ -3813,14 +3747,19 @@ def run_pipeline(
                     log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")

         if gemini_seq_blocks:
-            sequences = gemini_seq_blocks
+            # Update the dataframe with Gemini/UniProt sequences
+            for seq_block in gemini_seq_blocks:
+                mask = df_merged['variant_id'] == seq_block.variant_id
+                if mask.any():
+                    df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                    df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                    df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'Gemini/UniProt')
             log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
         else:
             log.warning("Failed to extract sequences via Gemini")

-    # 5. Merge & score (Section 8) --------------------------------------------
-    doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi, model)
+    # 5. Use the merged dataframe (already merged above)
+    df_final = df_merged

     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
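Taken together, the run_pipeline hunks reorder the sequence stage: merge first, fall back to external sources only when nothing at all matched, and patch the merged frame in place instead of swapping out the sequences list. Condensed, with hypothetical helper names standing in for the inline row scans and update loops shown above:

    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths,
                              debug_dir=debug_dir, lineage_variants=lineage)
    df_merged = merge_and_score(lineage, sequences, extract_doi(manuscript), model)
    if all_rows_lack_sequences(df_merged):       # hypothetical predicate
        fill_from_pdb(df_merged)                 # writes aa_seq/seq_confidence/seq_source in place
    if all_rows_lack_sequences(df_merged):
        fill_from_gemini_uniprot(df_merged)
    df_final = df_merged                         # step 5 no longer re-runs merge_and_score

One consequence worth flagging: because both fallbacks trigger only when every variant is missing both sequences, a paper where even a single variant matched will skip the PDB and Gemini/UniProt lookups entirely.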