debase 0.4.5__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.4.5"
+__version__ = "0.5.0"
debase/cleanup_sequence.py CHANGED
@@ -30,6 +30,27 @@ except ImportError:  # pragma: no cover
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
+VALID_DNA_BASES = set("ACGT")
+
+# Genetic code table for DNA to amino acid translation
+GENETIC_CODE = {
+    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
+    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
+    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
+}
 
 # Gemini API configuration
 GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -182,6 +203,44 @@ class SequenceManipulator:
         """Validate that a sequence contains only valid amino acids."""
         return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())
 
+    @staticmethod
+    def is_dna_sequence(seq: str) -> bool:
+        """Check if a sequence is DNA (contains only ACGT)."""
+        seq_upper = seq.upper().replace(" ", "").replace("\n", "")
+        return all(base in VALID_DNA_BASES for base in seq_upper) and len(seq_upper) > 0
+
+    @staticmethod
+    def translate_dna_to_protein(dna_seq: str) -> str:
+        """Translate DNA sequence to protein sequence.
+
+        Args:
+            dna_seq: DNA sequence string
+
+        Returns:
+            Protein sequence string
+        """
+        # Clean the DNA sequence
+        dna_seq = dna_seq.upper().replace(" ", "").replace("\n", "")
+
+        # Check if sequence length is multiple of 3
+        if len(dna_seq) % 3 != 0:
+            log.warning(f"DNA sequence length ({len(dna_seq)}) is not a multiple of 3. Truncating to nearest codon.")
+            dna_seq = dna_seq[:-(len(dna_seq) % 3)]
+
+        protein_seq = []
+        for i in range(0, len(dna_seq), 3):
+            codon = dna_seq[i:i+3]
+            if len(codon) == 3:
+                # Handle unknown codons (with N or other non-standard bases)
+                if codon in GENETIC_CODE:
+                    protein_seq.append(GENETIC_CODE[codon])
+                else:
+                    # If codon contains non-standard bases, add 'X' for unknown amino acid
+                    protein_seq.append('X')
+                    log.debug(f"Unknown codon '{codon}' at position {i}, using 'X' for unknown amino acid")
+
+        return ''.join(protein_seq)
+
     @staticmethod
     def determine_indexing(parent_seq: str, mutations: List[Mutation]) -> int:
         """Determine whether mutations use 0-based or 1-based indexing."""
@@ -1141,6 +1200,9 @@ class SequenceProcessor:
         # Detect and handle column format automatically
         self._normalize_columns()
 
+        # Translate DNA sequences to protein sequences if needed
+        self._translate_dna_sequences()
+
         log.info(
             f"Loaded {len(self.df)} rows, "
             f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
@@ -1153,6 +1215,67 @@ class SequenceProcessor:
         # Initialize generator
         self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
 
+    def _translate_dna_sequences(self) -> None:
+        """Translate DNA sequences to protein sequences if no amino acid sequences exist."""
+        manipulator = SequenceManipulator()
+
+        # First check if ANY sequences are amino acid sequences
+        has_amino_acid = False
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if not manipulator.is_dna_sequence(seq):
+                    has_amino_acid = True
+                    break
+
+        # If we found amino acid sequences, don't translate anything
+        if has_amino_acid:
+            log.info("Found amino acid sequences in data, skipping DNA translation")
+            return
+
+        # No amino acid sequences found, check for DNA sequences in dna_seq column
+        if "dna_seq" in self.df.columns:
+            dna_count = 0
+            for idx, row in self.df.iterrows():
+                protein_seq = str(row.get("protein_sequence", "")).strip()
+                dna_seq = str(row.get("dna_seq", "")).strip()
+
+                # If protein_sequence is empty but dna_seq has content, translate it
+                if (not protein_seq or protein_seq.lower() in ["nan", "none", ""]) and \
+                   (dna_seq and dna_seq.lower() not in ["nan", "none", ""]):
+                    if manipulator.is_dna_sequence(dna_seq):
+                        # Translate DNA to protein
+                        translated_seq = manipulator.translate_dna_to_protein(dna_seq)
+                        self.df.at[idx, "protein_sequence"] = translated_seq
+
+                        # Add flag to indicate this was translated from DNA
+                        if "flag" not in self.df.columns:
+                            self.df["flag"] = ""
+                        existing_flag = str(self.df.at[idx, "flag"]).strip()
+                        self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                        dna_count += 1
+
+            if dna_count > 0:
+                log.info(f"Translated {dna_count} DNA sequences from dna_seq column to protein sequences")
+
+        # Also check if DNA sequences are mistakenly in protein_sequence column
+        dna_count = 0
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if manipulator.is_dna_sequence(seq):
+                    # Translate DNA to protein
+                    protein_seq = manipulator.translate_dna_to_protein(seq)
+                    self.df.at[idx, "protein_sequence"] = protein_seq
+
+                    # Add flag to indicate this was translated from DNA
+                    existing_flag = str(self.df.at[idx, "flag"]).strip()
+                    self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                    dna_count += 1
+
+        if dna_count > 0:
+            log.info(f"Translated {dna_count} DNA sequences to protein sequences")
+
     def _normalize_columns(self) -> None:
         """Automatically detect and normalize column names from different formats."""
         # Check if this is enzyme_lineage_extractor format
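The pass above only fires when no row already holds an amino-acid sequence; a quick sketch of that gate (hypothetical two-row frame, with is_dna standing in for SequenceManipulator.is_dna_sequence):

    import pandas as pd

    VALID_DNA_BASES = set("ACGT")

    def is_dna(seq: str) -> bool:  # mirrors is_dna_sequence above
        s = seq.upper().replace(" ", "").replace("\n", "")
        return len(s) > 0 and set(s) <= VALID_DNA_BASES

    df = pd.DataFrame({"protein_sequence": ["ATGGCTTAA", "MKLV"]})
    has_amino_acid = any(
        not is_dna(s)
        for s in df["protein_sequence"].astype(str).str.strip()
        if s and s.lower() not in ("nan", "none")
    )
    print(has_amino_acid)  # True -> the translation pass is skipped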
debase/enzyme_lineage_extractor.py CHANGED
@@ -24,6 +24,7 @@ import pandas as pd
 import networkx as nx  # light dependency, used only for generation inference
 
 import os
+import fitz
 import re
 import json
 import time
@@ -460,8 +461,32 @@ def get_model():
     if not api_key:
         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
     _genai.configure(api_key=api_key)
-    # Positional constructor arg works for both SDK flavors
-    return _genai.GenerativeModel(MODEL_NAME)
+
+    # Create generation config to optimize performance and costs
+    generation_config = {
+        "temperature": 0.0,  # Deterministic: always pick the most likely token
+        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
+        "top_k": 1,  # Only consider the single most likely token
+        "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+    }
+
+    # For Gemini 2.5 Flash, disable thinking tokens to save costs
+    # thinking_budget=0 disables thinking, -1 enables dynamic thinking (default)
+    # Only add if SDK supports it to maintain compatibility
+    try:
+        # Test if thinking_budget is supported by making a minimal API call
+        test_config = {"thinking_budget": 0, "max_output_tokens": 10}
+        test_model = _genai.GenerativeModel(MODEL_NAME, generation_config=test_config)
+        # Actually test the API call to see if thinking_budget is supported
+        test_response = test_model.generate_content("Return 'OK'")
+        # If no error, add thinking_budget to main config
+        generation_config["thinking_budget"] = 0
+        log.debug("Disabled thinking tokens (thinking_budget=0)")
+    except Exception as e:
+        # SDK doesn't support thinking_budget, continue without it
+        log.debug(f"thinking_budget not supported: {e}")
+
+    return _genai.GenerativeModel(MODEL_NAME, generation_config=generation_config)
 
 # === 5.3 Unified call helper ----------------------------------------------
 
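The probe pattern above (try an optional config key on a throwaway call, keep it only if the SDK accepts it) generalises to any optional key; a hedged sketch, assuming the same google.generativeai surface used in this diff and a hypothetical probe_config_key helper:

    # Capability probe: keep an optional generation-config key only if a
    # minimal test call succeeds with it. `genai` and model_name as above.
    def probe_config_key(genai, model_name, base_config, key, value):
        trial = {key: value, "max_output_tokens": 10}
        try:
            genai.GenerativeModel(model_name, generation_config=trial) \
                 .generate_content("Return 'OK'")
            base_config[key] = value   # accepted by this SDK version
        except Exception:
            pass                       # unsupported: leave config unchanged
        return base_config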
@@ -728,22 +753,24 @@ Return a JSON object with:
 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
 {campaign_context}
-Given the following article text, list up to {max_results} *locations* (page
-numbers, figure/table IDs, or section headings) that you would review first to
-find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced){campaign_specific}.
+Given the following article text, list up to {max_results} *locations* (figure/table IDs
+or section headings) that you would review first to find the COMPLETE evolutionary
+lineage of enzyme variants (i.e. which variant came from which parent and what
+mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
+ensure the location you return are actually lineage location with variants and mutations.
 
 Respond with a JSON array of objects, each containing:
-- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
-- "type": one of "table", "figure", "text", "section"
+- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
 {campaign_field}
-IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
+IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
+NOT page numbers. Focus on the actual figure/table titles and numbers.
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than figure showing complete variant lineages.
-Text sections is used when no suitable tables/figurews exist.
+mutation lists should be ranked higher than figures showing complete variant lineages.
+Sections are used when no suitable tables/figures exist.
 
 Don't include oligonucleotide results or result from only one round.
 
@@ -1713,7 +1740,6 @@ def get_lineage(
     for pdf_path in pdf_paths:
         # Extract first few pages looking for TOC
         try:
-            import fitz  # PyMuPDF
             doc = fitz.open(pdf_path)
             toc_text = ""
             for page_num in range(min(5, doc.page_count)):  # First 5 pages
@@ -2011,7 +2037,7 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
 
 # --- 7.2 Page-based extraction helper ---------------------------------------
 def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
-    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+    """Extract plain text sequence using Gemini with 6 attempts, returning most common result.
 
     Args:
         prompt: The prompt to send to Gemini
@@ -2019,12 +2045,12 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         context: Additional context for logging (e.g., "validation" or "extraction")
 
     Returns:
-        The validated sequence or None if no consensus
+        The most common sequence or None if all attempts failed
     """
     sequences = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6
 
-    # Try up to 5 times
+    # Try 6 times
    for attempt in range(max_attempts):
         try:
             response = model.generate_content(prompt)
@@ -2050,38 +2076,14 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         except Exception as e:
             log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
             sequences.append("ERROR")
-
-        # Check for early consensus after 2 attempts
-        if len(sequences) == 2:
-            # Clean sequences before comparison
-            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
-            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
-
-            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus reached after 2 attempts")
-                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
-            else:
-                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
 
-    # After all attempts, find consensus
+    # After all attempts, find most common result
     valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
 
     if not valid_sequences:
         log.error(f"All {max_attempts} {context} attempts failed")
         return None
 
-    # Find any matching pair
-    for i in range(len(sequences)):
-        for j in range(i + 1, len(sequences)):
-            # Clean sequences before comparison
-            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
-            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
-
-            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
-                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
-
-    # If no exact match, use adaptive validation
     # Count occurrences of each valid sequence
     sequence_counts = {}
     for seq in valid_sequences:
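The rewritten helper drops the pairwise early-consensus checks in favour of a simple plurality vote over the cleaned attempts; the counting step is equivalent to this sketch (sample data hypothetical):

    from collections import Counter

    attempts = ["MKL V", "MKLV", "ERROR", "MKLV", "INVALID", "MKLA"]
    cleaned = [s.replace(" ", "").replace("\n", "")
               for s in attempts if s not in ("INVALID", "ERROR")]
    winner, count = Counter(cleaned).most_common(1)[0]
    print(winner, f"{count}/{len(attempts)}")  # MKLV 3/6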
@@ -2090,80 +2092,16 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         seq_clean = seq.replace(" ", "").replace("\n", "")
         sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
 
-    # Return the most common sequence if it appears at least twice
+    # Return the most common sequence
     if sequence_counts:
         most_common = max(sequence_counts.items(), key=lambda x: x[1])
-        if most_common[1] >= 2:
-            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
-            return most_common[0]
+        log.info(f"Gemini {context} most common: sequence appeared {most_common[1]}/{max_attempts} times")
+        return most_common[0]
 
-    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    log.warning(f"Gemini {context} no valid sequences after {max_attempts} attempts")
     return None
 
 
-def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
-    """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
-
-    # Extract mutations from variants
-    mutations = []
-    for variant in variants:
-        if variant.mutations:
-            mutations.extend(variant.mutations)
-
-    if not mutations:
-        return None
-
-    # Take a sample of mutations for validation
-    sample_mutations = mutations[:10]  # Check first 10 mutations
-
-    # First do a quick local check for obvious inconsistencies
-    local_issues = []
-    for mutation in sample_mutations:
-        if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
-            pos = mutation.position - 1  # Convert to 0-indexed
-            if 0 <= pos < len(sequence):
-                actual_aa = sequence[pos]
-                expected_aa = mutation.original
-                if actual_aa != expected_aa:
-                    local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
-
-    if not local_issues:
-        return None  # No obvious issues found
-
-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
-
-    prompt = f"""
-You are validating a protein sequence that was extracted from a scientific paper.
-The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
-
-Original sequence (length {len(sequence)}):
-{sequence}
-
-Known mutations that should be applicable to this sequence:
-{', '.join(str(m) for m in sample_mutations)}
-
-Potential issues detected:
-{chr(10).join(local_issues)}
-
-Please check if the sequence is consistent with these mutations:
-1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
-2. If you find inconsistencies, suggest the most likely correction
-3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
-4. Pay special attention to consecutive identical amino acids that might be OCR errors
-
-Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
-If you cannot determine the correct sequence, return "UNCERTAIN".
-"""
-
-    # Use triple validation
-    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
-
-    if result == "VALID" or result is None:
-        return None  # No changes needed
-    else:
-        log.info(f"Gemini suggested sequence correction (length {len(result)})")
-        return result
-
 
 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
     """Extract text from a specific page number in the PDFs.
@@ -2331,11 +2269,11 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 
 SEQUENCE EXTRACTION RULES:
 - Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids, or nucleotides
 - Preserve the exact length and character sequence
 - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids are copied correctly
+- Double-check that consecutive identical amino acids or nucleotides are copied correctly
 
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
@@ -2356,8 +2294,81 @@ TEXT (may be truncated):
 ```
 """.strip()
 
+def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
+    """
+    Check if two sequence extraction responses match.
+
+    Args:
+        resp1: First response (list of sequences or dict)
+        resp2: Second response (list of sequences or dict)
+
+    Returns:
+        True if responses match, False otherwise
+    """
+    # Handle None cases
+    if resp1 is None or resp2 is None:
+        return False
+
+    # Both should be the same type
+    if type(resp1) != type(resp2):
+        return False
+
+    # If both are lists
+    if isinstance(resp1, list) and isinstance(resp2, list):
+        # Must have same length
+        if len(resp1) != len(resp2):
+            return False
+
+        # Create normalized sequence sets for comparison
+        seq_set1 = set()
+        seq_set2 = set()
+
+        for seq in resp1:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set1.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        for seq in resp2:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set2.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        return seq_set1 == seq_set2
+
+    # If both are dicts, compare normalized content
+    if isinstance(resp1, dict) and isinstance(resp2, dict):
+        # Normalize and compare
+        return json.dumps(resp1, sort_keys=True) == json.dumps(resp2, sort_keys=True)
+
+    return False
+
+
 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+    """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+
+    Can exit early after 2 attempts if the responses match exactly.
 
     Args:
         model: The Gemini model instance
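For illustration, the matcher treats element order as irrelevant and normalises whitespace and case before comparing, so these two hypothetical responses compare equal (assuming the helper above is in scope):

    a = [{"variant_id": "WT", "aa_seq": "MKL V", "dna_seq": None},
         {"variant_id": "M1", "aa_seq": "makv", "dna_seq": None}]
    b = [{"variant_id": "M1", "aa_seq": "MAKV", "dna_seq": None},
         {"variant_id": "WT", "aa_seq": "mklv", "dna_seq": None}]
    print(_check_sequence_responses_match(a, b))  # True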
@@ -2366,12 +2377,12 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         debug_dir: Optional debug directory
 
     Returns:
-        The validated sequence JSON data or None if no consensus
+        The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6
 
-    # Try up to 5 times
+    # Try 6 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2443,167 +2454,69 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             else:
                 raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
 
-            # Store both the original and normalized response
-            normalized_response = _normalize_sequence_response(parsed)
-            responses.append((parsed, normalized_response))
-
-            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+            # Store the response
+            responses.append(parsed)
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
+
+            # Early match detection after 2 attempts
+            if attempt >= 1:  # After 2nd attempt (0-indexed)
+                valid_responses_so_far = [r for r in responses if r is not None]
+                if len(valid_responses_so_far) >= 2:
+                    # Check if the last two valid responses match
+                    if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                        log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                        # Add the matching response 4 more times to simulate consensus
+                        for _ in range(max_attempts - attempt - 1):
+                            responses.append(valid_responses_so_far[-1])
+                        break
 
         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
             responses.append(None)
-
-        # Check for early consensus after 2 attempts
-        if len(responses) == 2:
-            if (responses[0] and responses[1] and
-                _sequences_match(responses[0][1], responses[1][1])):
-                log.info("Sequence extraction consensus reached after 2 attempts")
-                return responses[0][0]  # Return original parsed data
-            else:
-                log.info("Sequence extraction mismatch after 2 attempts - trying third")
 
-    # After all attempts, use adaptive validation
+    # After all attempts, find most common sequences
     valid_responses = [r for r in responses if r is not None]
 
     if not valid_responses:
         log.error(f"All {max_attempts} sequence extraction attempts failed")
         return None
 
-    # First, try to find exact consensus (any matching pair)
-    for i in range(len(valid_responses)):
-        for j in range(i + 1, len(valid_responses)):
-            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
-                log.info(f"Sequence extraction consensus found: attempts with matching content")
-                return valid_responses[i][0]  # Return original parsed data
-
-    # If no exact consensus, use adaptive validation
-    log.info("No exact consensus found, applying adaptive validation...")
-
-    # Find sequences that appear consistently across multiple attempts
-    consistent_sequences = _find_consistent_sequences(valid_responses)
-
-    if consistent_sequences:
-        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
-        return consistent_sequences
-
-    # If still no consensus, use the attempt with the most sequences
-    best_response = max(valid_responses,
-                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
-
-    if best_response and len(best_response[1]) > 0:
-        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
-        return best_response[0]
-
-    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
-    return None
-
-
-def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
-    """Find sequences that appear consistently across multiple extraction attempts.
-
-    Args:
-        valid_responses: List of (original_data, normalized_data) tuples
-
-    Returns:
-        List of consistent sequences with confidence scores, or None if none found
-    """
-    if not valid_responses:
-        return None
-
-    # Count how many times each sequence appears
+    # Count occurrences of each individual sequence across all attempts
     sequence_counts = {}
-    sequence_full_data = {}
-
-    for original, normalized in valid_responses:
-        if not isinstance(normalized, list):
-            continue
-
-        for seq in normalized:
-            variant_id = seq.get("variant_id", "")
-            aa_seq = seq.get("aa_seq", "")
-            # Clean sequence before using in key
-            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
-
-            # Create a unique key for this sequence
-            key = f"{variant_id}|{aa_seq_clean}"
-
-            if key not in sequence_counts:
-                sequence_counts[key] = 0
-                sequence_full_data[key] = []
-
-            sequence_counts[key] += 1
-
-            # Find the full data for this sequence from the original response
-            if isinstance(original, list):
-                for orig_seq in original:
-                    if (orig_seq.get("variant_id") == variant_id and
-                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
-                        sequence_full_data[key].append(orig_seq)
-                        break
-
-    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
-    min_appearances = max(2, len(valid_responses) // 2)
-    consistent_sequences = []
-
-    for key, count in sequence_counts.items():
-        if count >= min_appearances:
-            # Use the first occurrence of the full data
-            if sequence_full_data[key]:
-                seq_data = sequence_full_data[key][0].copy()
-                # Add confidence based on how many times it appeared
-                seq_data["confidence"] = count / len(valid_responses)
-                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
-                consistent_sequences.append(seq_data)
-
-    return consistent_sequences if consistent_sequences else None
-
-
-def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
-    """Normalize sequence response for comparison."""
-    if not isinstance(data, list):
-        return []
-
-    normalized = []
-    for item in data:
-        if isinstance(item, dict):
-            # Extract key fields for comparison
-            normalized_item = {
-                "variant_id": item.get("variant_id", ""),
-                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
-                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
-                "confidence": item.get("confidence", 0.0)
-            }
-            normalized.append(normalized_item)
+    for resp in valid_responses:
+        if isinstance(resp, list):
+            for seq in resp:
+                if isinstance(seq, dict) and "variant_id" in seq:
+                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    variant_id = seq.get("variant_id", "")
+                    aa_seq = seq.get("aa_seq", "")
+                    if aa_seq:
+                        aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                    key = f"{variant_id}|{aa_seq}"
+
+                    if key not in sequence_counts:
+                        sequence_counts[key] = {"count": 0, "data": seq}
+                    sequence_counts[key]["count"] += 1
+
+    # Build result with sequences that appear in at least 3 attempts
+    result = []
+    for key, info in sequence_counts.items():
+        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+            seq_data = info["data"].copy()
+            seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
+            result.append(seq_data)
+            log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
+
+    if result:
+        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        return result
 
-    # Sort by variant_id for consistent comparison
-    return sorted(normalized, key=lambda x: x["variant_id"])
+    # If no sequences appear twice, return the most complete attempt
+    best_attempt = max(valid_responses, key=lambda x: len(x) if isinstance(x, list) else 0)
+    log.warning(f"No consensus sequences found, returning best attempt with {len(best_attempt)} sequences")
+    return best_attempt
 
 
-def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
-    """Check if two sequence response lists match on key fields."""
-    if len(seq1) != len(seq2):
-        return False
-
-    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
-        # Compare variant IDs
-        if s1.get("variant_id") != s2.get("variant_id"):
-            return False
-
-        # Compare amino acid sequences (most critical)
-        aa1 = s1.get("aa_seq", "")
-        aa2 = s2.get("aa_seq", "")
-        if aa1 and aa2 and aa1 != aa2:
-            return False
-        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
-            return False
-
-        # Compare DNA sequences if present
-        dna1 = s1.get("dna_seq", "")
-        dna2 = s2.get("dna_seq", "")
-        if dna1 and dna2 and dna1 != dna2:
-            return False
-
-    return True
 
 
 def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
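The consensus step above counts each (variant_id, cleaned aa_seq) pair across all attempts and keeps pairs seen in at least 3 of the 6 tries; in isolation, with hypothetical attempts:

    from collections import defaultdict

    attempts = [
        [{"variant_id": "V1", "aa_seq": "MKLV"}],
        [{"variant_id": "V1", "aa_seq": "MKLV"}, {"variant_id": "V2", "aa_seq": "MALV"}],
        [{"variant_id": "V1", "aa_seq": "MKLV"}],
    ]
    counts = defaultdict(int)
    for resp in attempts:
        for seq in resp:
            counts[(seq["variant_id"], seq["aa_seq"].upper())] += 1
    print([k for k, n in counts.items() if n >= 3])  # [('V1', 'MKLV')]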
@@ -2624,18 +2537,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt
 
-    # Add mutation validation context if we have lineage variants with mutations
-    if lineage_variants:
-        mutation_context = _build_mutation_validation_context(lineage_variants)
-        if mutation_context:
-            prompt = f"""{prompt}
-
-CRITICAL MUTATION VALIDATION:
-{mutation_context}
-
-IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
-For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
-"""
+    # Skip mutation validation context
 
     # Save the complete prompt for debugging
     if debug_dir:
@@ -2662,11 +2564,7 @@ For example, if variant "III" has mutation "A100V" from parent "II", then positi
 
     extracted_sequences = _parse_sequences(data)
 
-    # Post-process: validate sequences against mutations if we have lineage info
-    if lineage_variants:
-        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
-        return validated_sequences
-
+    # Return extracted sequences without mutation validation
     return extracted_sequences
 
 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
@@ -2701,6 +2599,19 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         aa = _clean_seq(entry.get("aa_seq"), _VALID_AA)
         dna = _clean_seq(entry.get("dna_seq"), _VALID_DNA)
 
+        # Check minimum length requirements
+        # AA sequences should be > 50, DNA sequences should be > 150
+        if aa and len(aa) <= 50:
+            log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
+            aa = None
+        if dna and len(dna) <= 150:
+            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+            dna = None
+
+        # Skip if both sequences are too short or missing
+        if not aa and not dna:
+            continue
+
         conf: float | None = None
         if aa:
             conf = sum(c in _VALID_AA for c in aa) / len(aa)
@@ -3118,12 +3029,6 @@ If you cannot determine certain fields, set them to null.
             seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
-                # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
-                if validated_seq:
-                    seq = validated_seq
-                    log.info(f"Sequence validated and potentially corrected by Gemini")
-
                 # Map to the first variant or wild-type
                 wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
                 if wt_variant:
@@ -3427,7 +3332,7 @@ def _merge_lineage_and_sequences(
     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
 
     # 3. If we have unmatched sequences and a model, use Gemini to match
-    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+    if model and len(df_seq) > 0 and (df['aa_seq'].isna().any() or df['dna_seq'].isna().any()):
         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
@@ -3442,14 +3347,9 @@ def _merge_lineage_and_sequences(
             log.info("Using Gemini to match variants")
 
             # Build prompt for Gemini
-            prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+            prompt = f"""Match enzyme variant IDs between two lists from the same paper using your best judgment.
 
-Papers often use different naming conventions for the same variant:
-- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
-- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
-
-Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
-use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
+These IDs come from different sections of the paper and may use different naming conventions for the same variant.
 
 Lineage variant IDs (need sequences):
 {json.dumps(unmatched_lineage_ids)}
@@ -3457,8 +3357,13 @@ Lineage variant IDs (need sequences):
 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
 
+IMPORTANT: A variant with mutations (indicated by mutation codes like letters and numbers after an underscore or space) is a DIFFERENT enzyme from its parent. Do not match mutation variants to their base sequences - they are distinct entities with different sequences due to the mutations.
+
+Only match variants that represent the SAME enzyme, accounting for different naming conventions between sections.
+
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
+Only include matches you are confident represent the same variant.
 """
 
             try:
@@ -3738,16 +3643,27 @@ def run_pipeline(
     # 4. Extract sequences (Section 7) ----------------------------------------
     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
 
-    # 4a. Try PDB extraction if no sequences found -----------------------------
-    # Check if we need PDB sequences (no sequences or only partial sequences)
-    MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
-    needs_pdb = (not sequences or
-                 all(s.aa_seq is None or (s.aa_seq and len(s.aa_seq) < MIN_PROTEIN_LENGTH)
-                     for s in sequences))
+    # 4a. First try to merge extracted sequences with lineage using Gemini matching
+    # This allows fuzzy matching of complex variant IDs before external lookups
+    doi = extract_doi(manuscript)
+    df_merged = merge_and_score(lineage, sequences, doi, model)
+
+    # 4b. Check if ALL variants are missing sequences after merging
+    # Only try external sources if no sequences were successfully matched
+    all_missing_sequences = True
+    if 'aa_seq' in df_merged.columns or 'dna_seq' in df_merged.columns:
+        for _, row in df_merged.iterrows():
+            has_aa = pd.notna(row.get('aa_seq'))
+            has_dna = pd.notna(row.get('dna_seq'))
+            if has_aa or has_dna:
+                all_missing_sequences = False
+                break
 
-    if needs_pdb:
-        log.info("No full-length sequences found in paper (only partial sequences < %d AA), attempting PDB extraction...",
-                 MIN_PROTEIN_LENGTH)
+    if all_missing_sequences:
+        MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
+        MIN_DNA_LENGTH = 150  # DNA sequences should be >150 nt
+        log.info("No full-length sequences found in paper (only partial sequences < %d AA or < %d nt), attempting PDB extraction...",
+                 MIN_PROTEIN_LENGTH, MIN_DNA_LENGTH)
 
         # Extract PDB IDs from all PDFs
         pdb_ids = []
@@ -3785,7 +3701,13 @@ def run_pipeline(
                     log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
 
             if pdb_seq_blocks:
-                sequences = pdb_seq_blocks
+                # Update the dataframe with PDB sequences
+                for seq_block in pdb_seq_blocks:
+                    mask = df_merged['variant_id'] == seq_block.variant_id
+                    if mask.any():
+                        df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                        df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                        df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
                 log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                 break
             else:
@@ -3793,8 +3715,15 @@ def run_pipeline(
         else:
             log.warning("No PDB IDs found in paper")
 
-    # 4b. If still no sequences, try Gemini extraction as last resort
-    if not sequences or all(not s.aa_seq for s in sequences):
+    # 4c. If still no sequences after PDB, try Gemini extraction as last resort
+    # Re-check if all variants are still missing sequences
+    still_all_missing = True
+    for _, row in df_merged.iterrows():
+        if pd.notna(row.get('aa_seq')) or pd.notna(row.get('dna_seq')):
+            still_all_missing = False
+            break
+
+    if still_all_missing:
         log.info("No sequences from PDB, attempting Gemini-based extraction...")
 
         gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
@@ -3818,14 +3747,19 @@ def run_pipeline(
                 log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
 
         if gemini_seq_blocks:
-            sequences = gemini_seq_blocks
+            # Update the dataframe with Gemini/UniProt sequences
+            for seq_block in gemini_seq_blocks:
+                mask = df_merged['variant_id'] == seq_block.variant_id
+                if mask.any():
+                    df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                    df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                    df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'Gemini/UniProt')
             log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
         else:
             log.warning("Failed to extract sequences via Gemini")
 
-    # 5. Merge & score (Section 8) --------------------------------------------
-    doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi, model)
+    # 5. Use the merged dataframe (already merged above)
+    df_final = df_merged
 
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
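Both fallbacks now write into the already-merged frame with the same mask-and-assign pattern rather than replacing the sequence list wholesale; reduced to its core (columns and sample values hypothetical):

    import pandas as pd

    df_merged = pd.DataFrame({"variant_id": ["WT", "M1"], "aa_seq": [None, None]})
    fallback = {"variant_id": "WT", "aa_seq": "MKLV", "source": "PDB"}

    mask = df_merged["variant_id"] == fallback["variant_id"]
    if mask.any():  # only touch rows for variants the fallback actually matched
        df_merged.loc[mask, "aa_seq"] = fallback["aa_seq"]
        df_merged.loc[mask, "seq_source"] = fallback["source"]
    print(df_merged.to_string(index=False))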
debase/reaction_info_extractor.py CHANGED
@@ -54,11 +54,11 @@ class Config:
     """Centralised tunables so tests can override them easily."""
 
     model_name: str = "gemini-2.5-flash"
-    location_temperature: float = 0.2
+    location_temperature: float = 0.0
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 12288  # Increased 3x from 4096
+    max_tokens: int = 12288
     pdf_cache_size: int = 8
     retries: int = 2
 
@@ -778,50 +778,62 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
 
     def _collect_captions_and_titles(self) -> str:
-        # Pattern to match Table or Figure with optional leading whitespace
+        # Pattern to match Table or Figure with optional leading whitespace and page numbers
         # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
-        # Also handles cases where there's whitespace before the caption
-        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
+        # Also handles cases where there's whitespace or page numbers before the caption
+        cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []
 
-        # Collect from all pages
-        all_text = "\n".join(self.all_pages)
-
-        # Find all figure/table captions with more context
-        for match in cap_pattern.finditer(all_text):
-            caption_start = match.start()
-
-            # Include some context before the caption (up to 200 chars)
-            context_start = max(0, caption_start - 200)
-            # Find the start of the sentence/paragraph before the caption
-            context_text = all_text[context_start:caption_start]
-            last_period = context_text.rfind('.')
-            if last_period != -1:
-                context_start = context_start + last_period + 1
-
-            # For tables, include much more content after the caption to show actual table data
-            # For figures, include more content to ensure complete captions
-            is_table = 'table' in match.group(1).lower()
-            max_chars = 8000 if is_table else 5000
-
-            # Get up to max_chars or until double newline (but ensure we get complete caption)
-            # First, try to find the end of the caption sentence
-            caption_end = caption_start
-            period_pos = all_text.find('. ', caption_start)
-            if period_pos != -1 and period_pos < caption_start + 1000:
-                # Include at least to the end of the caption sentence
-                caption_end = period_pos + 1
-
-            # Then extend to include more context or until double newline
-            double_newline_pos = all_text.find("\n\n", caption_end)
-            if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
-                caption_end = caption_start + max_chars
-            else:
-                caption_end = double_newline_pos
-
-            # Include the context and full caption with table content
-            full_caption = all_text[context_start:caption_end].strip()
-            captions.append(full_caption)
+        # Process each page individually to avoid TOC entries
+        for page_idx, page_text in enumerate(self.all_pages):
+            # Skip if this looks like a TOC page
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
+                continue
+
+            # Find all figure/table captions with more context
+            for match in cap_pattern.finditer(page_text):
+                caption_line = match.group(0).strip()
+
+                # Skip if this looks like a TOC entry (has page number at end or dots)
+                if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
+                    LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
+                    continue
+
+                caption_start = match.start()
+
+                # For tables, include much more content after the caption to show actual table data
+                # For figures, include substantial content to show what the figure contains
+                is_table = 'table' in match.group(1).lower()
+                # Increase context for figures to ensure we capture descriptive text
+                max_chars = 8000 if is_table else 3000
+
+                # Get context including text before and after the caption
+                # Include some text before to help identify the location
+                context_before = max(0, caption_start - 200)
+                context_after = min(len(page_text), caption_start + max_chars)
+
+                # Extract the full context
+                full_context = page_text[context_before:context_after].strip()
+
+                # Find the actual caption text (not just the "Figure X" part)
+                # Look for text after the figure/table identifier that forms the caption
+                caption_text = page_text[caption_start:context_after]
+
+                # Try to find the end of the caption (usually ends with a period before next paragraph)
+                caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
+                if caption_end_match:
+                    actual_caption = caption_text[:caption_end_match.end()].strip()
+                else:
+                    # Fallback: take first few lines
+                    lines = caption_text.split('\n')
+                    actual_caption = '\n'.join(lines[:3]).strip()
+
+                # Ensure we have meaningful content, not just the figure number
+                if len(actual_caption) > 20:  # More than just "Figure S23."
+                    # For the prompt, include the full context to help identify what's in the figure
+                    caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
+                    captions.append(caption_with_context)
 
         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
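The per-line TOC filter above rejects dotted leaders and trailing page numbers; the same regex in isolation (sample captions hypothetical):

    import re

    toc_like = re.compile(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$')
    print(bool(toc_like.search("Table S1. Variant lineage ........ 12")))       # True
    print(bool(toc_like.search("Table S1. Lineage of all evolved variants.")))  # False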
@@ -1058,6 +1070,39 @@ class ReactionExtractor:
     # 6.2 Figure / Table context helpers
     # ------------------------------------------------------------------
 
+    def _is_toc_page(self, page_text: str) -> bool:
+        """Detect if a page is a Table of Contents page."""
+        # Look for common TOC indicators
+        toc_indicators = [
+            "table of contents",
+            "contents",
+            r"\.{5,}",  # Multiple dots (common in TOCs)
+            r"\d+\s*\n\s*\d+\s*\n\s*\d+",  # Multiple page numbers in sequence
+        ]
+
+        # Count how many TOC-like patterns we find
+        toc_score = 0
+        text_lower = page_text.lower()
+
+        # Check for explicit TOC title
+        if "table of contents" in text_lower or (
+            "contents" in text_lower and text_lower.index("contents") < 200
+        ):
+            toc_score += 3
+
+        # Check for multiple figure/table references with page numbers
+        figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+        table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+
+        if len(figure_with_page) + len(table_with_page) > 5:
+            toc_score += 2
+
+        # Check for many dotted lines
+        if len(re.findall(r'\.{5,}', page_text)) > 3:
+            toc_score += 1
+
+        return toc_score >= 2
+
     def _page_with_reference(self, ref_id: str) -> Optional[str]:
         for page in self.all_pages:
             if ref_id.lower() in page.lower():
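As a rough standalone check of the scoring heuristic above (same rules, hypothetical page text, function name illustrative):

    import re

    def is_toc_page(page_text: str) -> bool:  # mirrors _is_toc_page above
        score = 0
        low = page_text.lower()
        if "table of contents" in low or (
            "contents" in low and low.index("contents") < 200
        ):
            score += 3  # explicit TOC title near the top
        entries = re.findall(r'(?:figure|table)\s+[sS]?\d+.*?\.{2,}.*?\d+', low)
        if len(entries) > 5:
            score += 2  # many "Figure Sn ... 12"-style entries
        if len(re.findall(r'\.{5,}', page_text)) > 3:
            score += 1  # dotted leader lines
        return score >= 2

    page = "Contents\n" + "\n".join(f"Figure S{i} ...... {i + 2}" for i in range(1, 8))
    print(is_toc_page(page))  # True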
@@ -1131,9 +1176,14 @@ class ReactionExtractor:
                 LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
                              page_number + 1, doc_name, len(page_text))
 
-                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
-                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
-                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+                # Skip Table of Contents pages
+                if self._is_toc_page(page_text):
+                    LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
+                    continue
+
+                # Look for figure caption pattern more flexibly
+                # Normalize the reference to handle variations
+                figure_num = ref.replace('Figure', '').replace('figure', '').strip()
 
                 # Extract main figure number from subfigure (e.g., "1C" -> "1")
                 main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1142,33 +1192,62 @@ class ReactionExtractor:
                 else:
                     main_figure_num = figure_num
 
-                caption_patterns = [
-                    rf"^Figure\s+{re.escape(main_figure_num)}\.",  # "Figure 1."
-                    rf"^Figure\s+{re.escape(main_figure_num)}:",  # "Figure 1:"
-                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
-                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
-                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
-                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
-                ]
+                # Create a flexible pattern that handles various spacing and formatting
+                # This pattern looks for "Figure" (case insensitive) followed by optional spaces
+                # then the figure number, then any of: period, colon, space+capital letter, or end of line
+                flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
 
-                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
-                             main_figure_num, ref, caption_patterns)
+                LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
+                             main_figure_num, flexible_pattern)
 
                 caption_found = False
                 cap_rect = None
 
-                for pattern in caption_patterns:
-                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
-                    if matches:
-                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
-                        # Found actual figure caption, get its position
-                        caption_text = matches.group(0)
-                        text_instances = page.search_for(caption_text, quads=False)
-                        if text_instances:
-                            cap_rect = text_instances[0]
-                            caption_found = True
-                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
-                            break
+                # Search for all matches of the flexible pattern
+                for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
+                    LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
+                    # Check if this is likely an actual caption (not just a reference)
+                    match_start = match.start()
+                    match_end = match.end()
+
+                    # Get surrounding context
+                    context_start = max(0, match_start - 50)
+                    context_end = min(len(page_text), match_end + 100)
+                    context = page_text[context_start:context_end]
+
+                    # Check if this looks like a real caption (not just a reference)
+                    # Look for words that typically precede figure references
+                    preceding_text = page_text[max(0, match_start-20):match_start].lower()
+                    if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
+                        LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
+                        continue
+
+                    # Check if there's descriptive text after the figure number
+                    remaining_text = page_text[match_end:match_end+100].strip()
+
+                    # For actual captions, there should be substantial descriptive text
+                    if len(remaining_text) < 20:
+                        LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
+                        continue
+
+                    # Check if the remaining text looks like a caption (contains descriptive words)
+                    first_words = remaining_text[:50].lower()
+                    if not any(word in first_words for word in ['detailed', 'representative', 'shows', 'comparison',
+                                                                'illustrates', 'demonstrates', 'results', 'data',
+                                                                'chromatogram', 'spectra', 'analysis', 'site-directed',
+                                                                'mutagenesis', 'mutants']):
+                        LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
+                        continue
+
+                    # Found actual figure caption, get its position
+                    caption_text = match.group(0)
+                    text_instances = page.search_for(caption_text, quads=False)
+                    if text_instances:
+                        cap_rect = text_instances[0]
+                        caption_found = True
+                        LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
+                                    ref, caption_text, remaining_text[:50])
+                        break
 
                 if not caption_found:
                     # Debug: show what figure-related text is actually on this page
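The single flexible pattern above replaces the six fixed alternatives; a quick demonstration of what it does and does not match (sample lines hypothetical):

    import re

    num = "2"
    pattern = rf"(?i)figure\s*{re.escape(num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
    for line in ["Figure 2. Evolution of activity",
                 "figure 2: lineage overview",
                 "FIGURE 2 Performance summary",
                 "see Figure 2B for details"]:
        print(bool(re.search(pattern, line)), line)
    # True, True, True, False - the subfigure reference on the last line
    # fails every alternative that must follow the number.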
@@ -1258,6 +1337,11 @@ class ReactionExtractor:
             page = doc.load_page(page_number)
             page_text = page.get_text()
 
+            # Skip Table of Contents pages
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
+                continue
+
             # Check for actual figure caption first
             if ref.lower().startswith('figure'):
                 figure_num = ref.replace('Figure ', '').replace('figure ', '')
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.5
+Version: 0.5.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team
@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=sJMwhIVyUE0G4qRHUUpEgw2beNe5jCSb9uQVOTV6krw,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
+debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
+debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
+debase/reaction_info_extractor.py,sha256=8ilu5o2FbXTV9R1Nhxd4m4TdgHOd6GsC3rxxHvqu9f4,165555
+debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.5.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.5.0.dist-info/METADATA,sha256=2Csgtf4gF8egVAvq8CsY4jpad2yWw_6c1iuOj55L5n8,4047
+debase-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.5.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.5.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.5.0.dist-info/RECORD,,
@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
-debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
-debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
-debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
-debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
-debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.4.5.dist-info/RECORD,,