debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,7 @@ import time
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Union, Tuple, Dict, Any

 MODEL_NAME: str = "gemini-2.5-flash"
 MAX_CHARS: int = 150_000  # Max characters sent to LLM
@@ -142,21 +142,36 @@ def extract_text(pdf_path: str | Path | bytes) -> str:


 def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
-    """Extract figure/table captions using the improved regex.
+    """Extract ALL figure/table captions with extensive surrounding context.

     The function scans every text line on every page and keeps lines whose first
     token matches `_CAPTION_PREFIX_RE`. This covers labels such as:
-      * Fig. 1, Figure 2A, Extended Data Fig 3
+      * Fig. 1, Figure 2A, Figure 2B, Figure 2C (ALL sub-captions)
       * Table S1, Table 4, Scheme 2, Chart 1B
-      * Supplementary Fig. S5, Supp Table 2
+      * Supplementary Fig. S5A, S5B, S5C (ALL variations)
+
+    For SI documents, includes extensive context since understanding what each
+    section contains is crucial for accurate location identification.
     """

     doc = _open_doc(pdf_path)
     captions: list[str] = []
     try:
-        for page in doc:
+        for page_num, page in enumerate(doc):
             page_dict = page.get_text("dict")
+
+            # Get all text blocks on this page for broader context
+            page_text_blocks = []
             for block in page_dict.get("blocks", []):
+                block_text = ""
+                for line in block.get("lines", []):
+                    text_line = "".join(span["text"] for span in line.get("spans", []))
+                    if text_line.strip():
+                        block_text += text_line.strip() + " "
+                if block_text.strip():
+                    page_text_blocks.append(block_text.strip())
+
+            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                 # Get all lines in this block
                 block_lines = []
                 for line in block.get("lines", []):
@@ -166,21 +181,94 @@ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -
                 # Check if any line starts with a caption prefix
                 for i, line in enumerate(block_lines):
                     if _CAPTION_PREFIX_RE.match(line):
-                        # Found a caption start - collect this line and subsequent lines
-                        # until we hit an empty line or the end of the block
+                        context_parts = []
+
+                        # Add page context for SI documents (more critical there)
+                        context_parts.append(f"Page {page_num + 1}")
+
+                        # Add extensive context before the caption (5-7 lines for SI context)
+                        context_before = []
+
+                        # First try to get context from current block
+                        for k in range(max(0, i-7), i):
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_before.append(block_lines[k])
+
+                        # If not enough context, look at previous text blocks on the page
+                        if len(context_before) < 3 and block_idx > 0:
+                            prev_block_text = page_text_blocks[block_idx - 1] if block_idx < len(page_text_blocks) else ""
+                            if prev_block_text:
+                                # Get last few sentences from previous block
+                                sentences = prev_block_text.split('. ')
+                                context_before = sentences[-2:] + context_before if len(sentences) > 1 else [prev_block_text] + context_before
+
+                        if context_before:
+                            # Include more extensive context for better understanding
+                            context_text = " ".join(context_before[-5:])  # Last 5 lines/sentences of context
+                            context_parts.append("Context: " + context_text)
+
+                        # Extract the COMPLETE caption including all sub-parts
                         caption_parts = [line]
-                        for j in range(i + 1, len(block_lines)):
+                        j = i + 1
+
+                        # Continue collecting caption text until we hit a clear break
+                        while j < len(block_lines):
                             next_line = block_lines[j]
-                            if not next_line:  # Empty line signals end of caption
-                                break
-                            # Check if next line is a new caption
+
+                            # Stop if we hit an empty line followed by non-caption text
+                            if not next_line:
+                                # Check if the line after empty is a new caption
+                                if j + 1 < len(block_lines) and _CAPTION_PREFIX_RE.match(block_lines[j + 1]):
+                                    break
+                                # If next non-empty line is not a caption, continue collecting
+                                elif j + 1 < len(block_lines):
+                                    j += 1
+                                    continue
+                                else:
+                                    break
+
+                            # Stop if we hit a new caption
                             if _CAPTION_PREFIX_RE.match(next_line):
                                 break
+
+                            # Include this line as part of the caption
                             caption_parts.append(next_line)
+                            j += 1

-                        # Join the parts with spaces
+                        # Join the caption parts
                         full_caption = " ".join(caption_parts)
-                        captions.append(full_caption)
+                        context_parts.append("Caption: " + full_caption)
+
+                        # Add extensive context after the caption (especially important for SI)
+                        context_after = []
+
+                        # Look for descriptive text following the caption
+                        for k in range(j, min(len(block_lines), j + 10)):  # Look ahead up to 10 lines
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_after.append(block_lines[k])
+
+                        # If not enough context, look at next text blocks
+                        if len(context_after) < 3 and block_idx + 1 < len(page_text_blocks):
+                            next_block_text = page_text_blocks[block_idx + 1]
+                            if next_block_text:
+                                # Get first few sentences from next block
+                                sentences = next_block_text.split('. ')
+                                context_after.extend(sentences[:3] if len(sentences) > 1 else [next_block_text])
+
+                        if context_after:
+                            # Include extensive following context
+                            following_text = " ".join(context_after[:7])  # First 7 lines of following context
+                            context_parts.append("Following: " + following_text)
+
+                        # For SI documents, add section context if this appears to be a section header
+                        if any(keyword in full_caption.lower() for keyword in ['supplementary', 'supporting', 'si ', 's1', 's2', 's3']):
+                            context_parts.append("SI_SECTION: This appears to be supplementary material content")
+
+                        # Combine all parts with proper separation
+                        full_caption_with_context = " | ".join(context_parts)
+                        captions.append(full_caption_with_context)
     finally:
         doc.close()

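With this change, each element of `captions` becomes a pipe-delimited record ("Page N | Context: ... | Caption: ... | Following: ...") rather than the bare caption. A minimal sketch of consuming that shape downstream; the record text here is invented for illustration:

# Minimal sketch, assuming the pipe-delimited record shape produced above.
record = "Page 3 | Context: assay conditions | Caption: Figure 2A. Activity of variants. | Following: Bars show mean of three replicates."
fields = record.split(" | ")
caption = next(f[len("Caption: "):] for f in fields if f.startswith("Caption: "))
print(caption)  # -> "Figure 2A. Activity of variants."
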
@@ -639,17 +727,18 @@ Return a JSON object with:

 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
+{campaign_context}
 Given the following article text, list up to {max_results} *locations* (page
 numbers, figure/table IDs, or section headings) that you would review first to
 find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced).
+came from which parent and what mutations were introduced){campaign_specific}.

 Respond with a JSON array of objects, each containing:
 - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
-
+{campaign_field}
 IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")

 Order by confidence score (highest first). Tables showing complete variant lineages or
@@ -660,9 +749,9 @@ Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"}}
+  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
 ]
 """.strip()

@@ -827,7 +916,39 @@ def identify_evolution_locations(

     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
+
+    # Add campaign context if provided
+    campaign_context = ""
+    campaign_specific = ""
+    campaign_field = ""
+    campaign_example = ""
+
+    if campaigns and len(campaigns) == 1:
+        # Single campaign - make it specific
+        camp = campaigns[0]
+        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
+        if hasattr(camp, 'notes') and camp.notes:
+            campaign_context += f"- Key identifiers: {camp.notes}\n"
+        campaign_specific = f" for the '{camp.campaign_name}' campaign"
+        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
+        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
+    elif campaigns and len(campaigns) > 1:
+        # Multiple campaigns - list them all
+        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
+        for camp in campaigns:
+            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+        campaign_specific = " for any of the identified campaigns"
+        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
+        campaign_example = ', "campaign_id": "campaign_id_here"'
+
+    prompt = _LINEAGE_LOC_PROMPT.format(
+        campaign_context=campaign_context,
+        max_results=max_results,
+        campaign_specific=campaign_specific,
+        campaign_field=campaign_field,
+        campaign_example=campaign_example
+    ) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
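As a sketch of how the new placeholders compose in the single-campaign branch (the campaign name and id below are invented, not values from the package):

# Hypothetical rendering of the campaign placeholders for one campaign:
campaign_specific = " for the 'epoxidation' campaign"
campaign_example = ', "campaign_id": "campaign_1"'
# The first example line in the formatted prompt then reads roughly:
# {"location": "Table S1", "type": "table", "confidence": 95,
#  "reason": "Variant lineage table", "campaign_id": "campaign_1"}
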
@@ -1290,6 +1411,138 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

+def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text."""
+    import re
+
+    if location_type == 'table':
+        # Find ALL mentions of this table and combine them
+        location_clean = location.strip()
+
+        # Different ways the table might be referenced
+        search_patterns = [
+            location_clean,  # Exact match
+            location_clean.replace("Supplementary ", "Supp. "),  # Common abbreviation
+            location_clean.replace("Supplementary ", "S"),  # E.g., "Table S3"
+            location_clean.replace("Supplementary Table ", "Table S"),  # Another common format
+        ]
+
+        # Collect all occurrences
+        all_occurrences = []
+        seen_positions = set()
+
+        for search_term in search_patterns:
+            pattern = re.compile(re.escape(search_term), re.IGNORECASE)
+            for match in pattern.finditer(full_text):
+                # Avoid duplicates from overlapping patterns
+                if match.start() in seen_positions:
+                    continue
+                seen_positions.add(match.start())
+
+                # Extract generous context around each mention
+                start = max(0, match.start() - 1000)
+                end = min(len(full_text), match.end() + 10000)
+                context = full_text[start:end]
+
+                all_occurrences.append({
+                    'position': match.start(),
+                    'context': context,
+                    'match': match.group()
+                })
+
+        if not all_occurrences:
+            log.warning(f"No occurrences of table '{location_clean}' found in text")
+            return None
+
+        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
+
+        # Combine all occurrences into one text for Gemini to analyze
+        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
+
+        for i, occurrence in enumerate(all_occurrences, 1):
+            combined_text += f"--- Occurrence {i} at position {occurrence['position']} ---\n"
+            combined_text += occurrence['context']
+            combined_text += "\n\n"
+
+        # Limit total length to avoid overwhelming the model
+        if len(combined_text) > 50000:
+            combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+
+        return combined_text
+
+    elif location_type == 'figure':
+        # For figures, we mainly want the caption and any text description
+        location_clean = location.strip()
+        patterns = [
+            rf'({re.escape(location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})',
+            rf'(Figure\s+S?\d+[^\n]*{re.escape(location_clean.split()[-1] if location_clean.split() else location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                # For figures, include surrounding context as the data might be described nearby
+                start = max(0, match.start() - 1000)
+                end = min(match.end() + 2000, len(full_text))
+                return full_text[start:end]
+
+    elif location_type == 'section':
+        # Look for section heading
+        location_clean = location.strip()
+        patterns = [
+            # Section with number
+            rf'((?:^|\n)\d+\.?\s*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]).*\n){{0,500}})',
+            # Section without number
+            rf'((?:^|\n){re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]|\n[A-Z]{{2,}}).*\n){{0,500}})',
+            # More flexible section matching
+            rf'((?:^|\n)[^\n]*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+|\n[A-Z]{{2,}}).*\n){{0,500}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                return match.group(1)
+
+    elif location_type == 'text':
+        # Try to find the location as a page marker or general text
+        if location.isdigit():
+            # Page number - look for page markers
+            page_num = int(location)
+            # Look for page breaks or page numbers
+            patterns = [
+                rf'(?:^|\n)\s*-?\s*{page_num}\s*-?\s*\n((?:.*\n){{0,300}})',
+                rf'(?:page|p\.?)\s*{page_num}[^\n]*\n((?:.*\n){{0,300}})',
+                rf'\n{page_num}\n((?:.*\n){{0,300}})'
+            ]
+            for pattern in patterns:
+                match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
+                if match:
+                    start = match.start()
+                    end = min(start + 15000, len(full_text))
+                    return full_text[start:end]
+
+    # Fallback: try fuzzy search for the location string
+    location_words = location.split()
+    if len(location_words) >= 2:
+        # Try to find at least the first two words together
+        search_pattern = rf'{re.escape(location_words[0])}\s+{re.escape(location_words[1])}'
+        match = re.search(search_pattern, full_text, re.IGNORECASE)
+        if match:
+            start = max(0, match.start() - 500)
+            end = min(match.start() + 8000, len(full_text))
+            return full_text[start:end]
+
+    # Last resort: find any occurrence of the location string
+    idx = full_text.lower().find(location.lower())
+    if idx != -1:
+        start = max(0, idx - 500)
+        end = min(idx + 8000, len(full_text))
+        return full_text[start:end]
+
+    log.warning(f"Could not find location '{location}' of type '{location_type}' in text")
+    return None
+
+
 def get_lineage(
     caption_text: str,
     full_text: str,
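A short usage sketch of the new helper; the location name is illustrative, and `full_text` is whatever the caller already extracted from the PDFs:

# Hypothetical call site for the new helper:
snippet = _extract_location_text(full_text, "Table S1", "table")
if snippet:
    log.info("Collected %d chars covering every 'Table S1' mention", len(snippet))
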
@@ -1328,63 +1581,220 @@ def get_lineage(
         campaigns = [default_campaign]
         log.info(f"Created default campaign: {default_campaign.campaign_name}")

-    # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
-
     all_variants = []

     if campaigns:
-        # If we have campaigns but no specific locations, use general extraction
-        if not locations:
-            log.info("No specific lineage locations found, extracting from full text with campaign context")
-            # Extract lineage for each campaign using full text
+        log.info("Using campaign-aware location identification")
+
+        # Process each campaign separately
+        for campaign in campaigns:
+            log.info(f"\nProcessing campaign: {campaign.campaign_id} - {campaign.campaign_name}")
+
+            # Use identify_evolution_locations with campaign context
+            locations = identify_evolution_locations(
+                caption_text,
+                model,
+                max_results=5,
+                debug_dir=debug_dir,
+                campaigns=[campaign],  # Pass single campaign for focused search
+                pdf_paths=pdf_paths
+            )
+
+            if not locations:
+                log.warning(f"No locations found for campaign {campaign.campaign_id}, trying full text extraction")
+                # Fall back to full text extraction
+                campaign_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+                all_variants.extend(campaign_variants)
+                continue
+
+            log.info(f"Found {len(locations)} potential locations for campaign {campaign.campaign_id}")
+            for loc in locations:
+                log.info(f"  - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
+
+            # Try to extract from the best location
+            extracted_variants = []
+            for location in locations:
+                if extracted_variants:
+                    break  # Already got variants
+
+                location_str = location.get('location', '')
+                location_type = location.get('type', '')
+                confidence = location.get('confidence', 0)
+
+                # Try figure extraction for high-confidence figures
+                if location_type == 'figure' and confidence >= 70 and pdf_paths:
+                    log.info(f"Attempting to extract figure: {location_str}")
+
+                    figure_bytes = None
+                    for pdf_path in pdf_paths:
+                        figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+                        if figure_bytes:
+                            log.info(f"Successfully extracted figure from {pdf_path.name}")
+                            break
+
+                    if figure_bytes:
+                        # Save figure if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            debug_path.mkdir(parents=True, exist_ok=True)
+                            figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                            _dump(figure_bytes, figure_file)
+                            log.info(f"Saved figure to: {figure_file}")
+
+                        # Extract lineage from figure
+                        variants = extract_lineage_from_figure(
+                            figure_bytes, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from figure")
+                            extracted_variants = variants
+                            continue
+
+                # Try table/text extraction
+                if location_type in ['table', 'text', 'section'] and not extracted_variants:
+                    log.info(f"Attempting text extraction for {location_type}: {location_str}")
+
+                    # Extract the specific section/table from full text
+                    section_text = _extract_location_text(full_text, location_str, location_type)
+                    if section_text:
+                        log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
+                        # Save extracted section if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            section_file = debug_path / f"extracted_{location_type}_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
+                            _dump(f"=== EXTRACTED {location_type.upper()} ===\nLocation: {location_str}\nLength: {len(section_text)} chars\n{'='*80}\n\n{section_text}", section_file)
+
+                        variants = extract_complete_lineage(
+                            section_text, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign,
+                            pdf_paths=pdf_paths
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from {location_type}")
+                            extracted_variants = variants
+                    else:
+                        log.warning(f"Could not extract text from {location_type}: {location_str}")
+
+            # If no variants extracted from specific locations, try full text
+            if not extracted_variants:
+                log.warning(f"Could not extract from specific locations, trying full text for campaign {campaign.campaign_id}")
+                extracted_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+
+            all_variants.extend(extracted_variants)
+
+        return all_variants, campaigns
+
+    # Original fallback code for when no campaigns are identified
+    log.info("Processing campaigns with direct caption and TOC analysis (skipping global location finding)")
+
+    # Prepare all captions and TOC with context for campaign-specific selection
+    caption_entries = []
+
+    # Add table of contents entries if available
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            try:
+                import fitz  # PyMuPDF
+                doc = fitz.open(pdf_path)
+                toc_text = ""
+                for page_num in range(min(5, doc.page_count)):  # First 5 pages
+                    page = doc[page_num]  # Correct PyMuPDF syntax
+                    page_text = page.get_text()
+                    if any(keyword in page_text.lower() for keyword in ['contents', 'table of contents', 'overview']):
+                        toc_text += f"\n--- Page {page_num + 1} TOC ---\n{page_text}\n"
+                doc.close()
+                if toc_text:
+                    toc_sections.append(toc_text)
+            except Exception as e:
+                log.warning(f"Failed to extract TOC from {pdf_path}: {e}")
+
+        if toc_sections:
+            caption_entries.append({
+                'type': 'table_of_contents',
+                'location': 'Table of Contents',
+                'context': '\n'.join(toc_sections)[:1000] + "..."
+            })
+
+    # Parse figure and table captions from caption_text
+    # Split by common caption patterns
+    caption_patterns = [
+        r'(?:^|\n)(?:Figure|Fig\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Table|Tab\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Scheme|Sch\.?)\s*\d+[:\.]'
+    ]
+
+    import re
+    for pattern in caption_patterns:
+        matches = list(re.finditer(pattern, caption_text, re.MULTILINE | re.IGNORECASE))
+        for i, match in enumerate(matches):
+            start_pos = match.start()
+            # Find the end of this caption (start of next caption or end of text)
+            if i + 1 < len(matches):
+                end_pos = matches[i + 1].start()
+            else:
+                end_pos = min(start_pos + 2000, len(caption_text))  # Max 2000 chars per caption
+
+            caption_content = caption_text[start_pos:end_pos].strip()
+            if len(caption_content) > 20:  # Skip very short captions
+                # Extract context from full text around this caption
+                context_start = max(0, full_text.find(caption_content[:100]) - 500)
+                context_end = min(len(full_text), context_start + 2000)
+                context = full_text[context_start:context_end]
+
+                caption_entries.append({
+                    'type': 'figure' if 'fig' in pattern.lower() else 'table' if 'tab' in pattern.lower() else 'scheme',
+                    'location': caption_content.split('\n')[0][:100] + "..." if len(caption_content.split('\n')[0]) > 100 else caption_content.split('\n')[0],
+                    'context': context
+                })
+
+    log.info(f"Prepared {len(caption_entries)} caption/TOC entries for campaign-specific analysis")
+
+    # If no caption entries found, fall back to full text extraction
+    if not caption_entries:
+        log.info("No caption entries found, extracting from full text with campaign context")
         for campaign in campaigns:
             log.info(f"Processing campaign: {campaign.campaign_id}")
-            campaign_variants = extract_campaign_lineage(
-                full_text, model, campaign_id=campaign.campaign_id,
-                debug_dir=debug_dir, pdf_paths=pdf_paths,
-                campaign_info=campaign
+            campaign_variants = extract_complete_lineage(
+                full_text, model,
+                debug_dir=debug_dir,
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
         return all_variants, campaigns
-    # Original logic for when we have both locations and campaigns
-    # Log location information
-    location_summary = []
-    for loc in locations[:5]:
-        if isinstance(loc, dict):
-            location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
-        else:
-            location_summary.append(str(loc))
-    log.info("Gemini identified %d potential lineage locations: %s",
-             len(locations), ", ".join(location_summary))
-
-    # Extract context around each location for better decision making
-    locations_with_context = []
-    for loc in locations:
-        location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-        # Extract 1000 chars of context around the location
-        context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
-        locations_with_context.append({
-            'location': loc,
-            'context': context_text  # Full extracted context
-        })
-
-    # For each campaign, ask Gemini to select the best location
+
+    # For each campaign, ask Gemini to select the best location from captions/TOC
     for campaign in campaigns:
         log.info(f"Processing campaign: {campaign.campaign_id}")

-        # Build locations context string
+        # Build locations context string from caption entries
         locations_str = ""
-        for i, loc_ctx in enumerate(locations_with_context):
-            loc = loc_ctx['location']
-            context = loc_ctx['context']
-            location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-            location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
-            confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
-            reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+        for i, entry in enumerate(caption_entries):
+            location_str = entry['location']
+            location_type = entry['type']
+            context = entry['context']

-            locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
-            locations_str += f"   Reason: {reason}\n"
+            locations_str += f"\n{i+1}. {location_str} (Type: {location_type})\n"
             locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"

         # Ask Gemini to select best location for this campaign
@@ -1426,26 +1836,39 @@ def get_lineage(

             log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")

-            # Find the actual location object
-            for loc in locations:
-                loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-                if loc_str == selected_location:
-                    primary_location = loc
+            # Find the actual caption entry
+            selected_entry = None
+            for entry in caption_entries:
+                if entry['location'] == selected_location:
+                    selected_entry = entry
                     break

-            if not primary_location:
-                log.warning(f"Could not find selected location '{selected_location}' in locations list")
-                # Fall back to highest confidence location
-                primary_location = sorted(locations,
-                                          key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                          reverse=True)[0] if locations else None
+            if not selected_entry:
+                log.warning(f"Could not find selected location '{selected_location}' in caption entries")
+                # Fall back to first entry
+                selected_entry = caption_entries[0] if caption_entries else None
+
+            # Convert caption entry to location format for compatibility
+            if selected_entry:
+                primary_location = {
+                    'location': selected_entry['location'],
+                    'type': selected_entry['type'],
+                    'confidence': 0.8,  # Default confidence for caption-based selection
+                    'reason': f"Selected from {selected_entry['type']} captions"
+                }

         except Exception as e:
             log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
-            # Fall back to highest confidence location
-            primary_location = sorted(locations,
-                                      key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                      reverse=True)[0] if locations else None
+            # Fall back to first caption entry
+            if caption_entries:
+                primary_location = {
+                    'location': caption_entries[0]['location'],
+                    'type': caption_entries[0]['type'],
+                    'confidence': 0.5,  # Lower confidence for fallback
+                    'reason': f"Fallback to first {caption_entries[0]['type']} caption"
+                }
+            else:
+                primary_location = None

         if not primary_location:
             log.warning(f"No location found for campaign {campaign.campaign_id}")
@@ -1587,6 +2010,97 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
     return []

 # --- 7.2 Page-based extraction helper ---------------------------------------
+def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
+    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        prompt: The prompt to send to Gemini
+        model: The Gemini model instance
+        context: Additional context for logging (e.g., "validation" or "extraction")
+
+    Returns:
+        The validated sequence or None if no consensus
+    """
+    sequences = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            response = model.generate_content(prompt)
+            result = _extract_text(response).strip()
+
+            # Parse the result to extract just the sequence
+            if result == "VALID":
+                sequences.append("VALID")
+            elif result == "UNCERTAIN":
+                sequences.append("UNCERTAIN")
+            elif result.startswith("M") and len(result) > 50:
+                # Clean the sequence
+                clean_seq = result.upper().replace(" ", "").replace("\n", "")
+                if all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in clean_seq):
+                    sequences.append(clean_seq)
+                else:
+                    sequences.append("INVALID")
+            else:
+                sequences.append("INVALID")
+
+            log.info(f"Gemini {context} attempt {attempt + 1}: {len(result) if result.startswith('M') else result}")
+
+        except Exception as e:
+            log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
+            sequences.append("ERROR")
+
+        # Check for early consensus after 2 attempts
+        if len(sequences) == 2:
+            # Clean sequences before comparison
+            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
+            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
+
+            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus reached after 2 attempts")
+                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
+            else:
+                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
+
+    # After all attempts, find consensus
+    valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
+
+    if not valid_sequences:
+        log.error(f"All {max_attempts} {context} attempts failed")
+        return None
+
+    # Find any matching pair
+    for i in range(len(sequences)):
+        for j in range(i + 1, len(sequences)):
+            # Clean sequences before comparison
+            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
+            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
+
+            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
+                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
+
+    # If no exact match, use adaptive validation
+    # Count occurrences of each valid sequence
+    sequence_counts = {}
+    for seq in valid_sequences:
+        if seq not in ["VALID", "UNCERTAIN"]:
+            # Clean sequence before counting
+            seq_clean = seq.replace(" ", "").replace("\n", "")
+            sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
+
+    # Return the most common sequence if it appears at least twice
+    if sequence_counts:
+        most_common = max(sequence_counts.items(), key=lambda x: x[1])
+        if most_common[1] >= 2:
+            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
+            return most_common[0]
+
+    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    return None
+
+
 def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
     """Validate and potentially correct a sequence using Gemini by checking against known mutations."""

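The consensus rule above is easiest to see on toy data. A standalone sketch (not the packaged function) of the core "accept a sequence only if at least two attempts agree" idea:

# Standalone sketch of the adaptive-consensus rule on toy replies:
def toy_consensus(replies: list[str]) -> str | None:
    cleaned = [r.replace(" ", "").replace("\n", "") for r in replies]
    counts: dict[str, int] = {}
    for s in cleaned:
        counts[s] = counts.get(s, 0) + 1
    best, n = max(counts.items(), key=lambda kv: kv[1])
    return best if n >= 2 else None  # require at least two matching attempts

assert toy_consensus(["MKV", "MKV", "MAV"]) == "MKV"
assert toy_consensus(["MKV", "MAV", "MGV"]) is None
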
@@ -1616,7 +2130,7 @@ def _validate_sequence_against_mutations(sequence: str, variants: List[Variant],
     if not local_issues:
         return None  # No obvious issues found

-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
+    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")

     prompt = f"""
 You are validating a protein sequence that was extracted from a scientific paper.
@@ -1641,26 +2155,14 @@ Return ONLY the corrected sequence if changes are needed, or "VALID" if no chang
 If you cannot determine the correct sequence, return "UNCERTAIN".
 """

-    try:
-        response = model.generate_content(prompt)
-        result = _extract_text(response).strip()
-
-        if result == "VALID":
-            return None  # No changes needed
-        elif result == "UNCERTAIN":
-            log.warning("Gemini could not validate sequence against mutations")
-            return None
-        elif result.startswith("M") and len(result) > 50:
-            # Gemini returned a corrected sequence
-            log.info(f"Gemini suggested sequence correction (length {len(result)})")
-            return result
-        else:
-            log.warning(f"Unexpected validation response: {result[:100]}...")
-            return None
-
-    except Exception as e:
-        log.warning(f"Failed to validate sequence: {e}")
-        return None
+    # Use triple validation
+    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
+
+    if result == "VALID" or result is None:
+        return None  # No changes needed
+    else:
+        log.info(f"Gemini suggested sequence correction (length {len(result)})")
+        return result


 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
@@ -1827,10 +2329,18 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 - Extract the variant_id exactly as written where the sequence appears
 - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations

+SEQUENCE EXTRACTION RULES:
+- Copy sequences EXACTLY as they appear in the text
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids
+- Preserve the exact length and character sequence
+- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
+- Double-check that consecutive identical amino acids are copied correctly
+
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq     - amino-acid sequence (uppercase), or null
-  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
+  * aa_seq     - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
+  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT

 Respond ONLY with **minified JSON** that matches the schema below.
 NO markdown, no code fences, no commentary.
@@ -1846,8 +2356,258 @@ TEXT (may be truncated):
 ```
 """.strip()

-def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
-    """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
+def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
+    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        model: The Gemini model instance
+        prompt: The prompt to send to Gemini
+        schema_hint: The JSON schema hint
+        debug_dir: Optional debug directory
+
+    Returns:
+        The validated sequence JSON data or None if no consensus
+    """
+    responses = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
+            resp = model.generate_content(prompt)
+            raw = _extract_text(resp).strip()
+
+            # Save debug info
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+                response_file = debug_path / f"sequences_attempt_{attempt + 1}_{int(time.time())}.txt"
+                with open(response_file, 'w') as f:
+                    f.write(f"=== SEQUENCE EXTRACTION ATTEMPT {attempt + 1} ===\n")
+                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                    f.write(f"Length: {len(raw)} characters\n")
+                    f.write("="*80 + "\n\n")
+                    f.write(raw)
+
+            # Parse JSON response (similar to generate_json_with_retry logic)
+            fence_re = re.compile(r"```json|```", re.I)
+            if raw.startswith("```"):
+                raw = fence_re.sub("", raw).strip()
+
+            # Try to parse as JSON
+            try:
+                parsed = json.loads(raw)
+            except json.JSONDecodeError:
+                # Look for JSON array or object in the response
+                json_start = -1
+                json_end = -1
+                bracket_stack = []
+                in_string = False
+                escape_next = False
+
+                for i, char in enumerate(raw):
+                    if escape_next:
+                        escape_next = False
+                        continue
+
+                    if char == '\\':
+                        escape_next = True
+                        continue
+
+                    if char == '"' and not escape_next:
+                        in_string = not in_string
+                        continue
+
+                    if in_string:
+                        continue
+
+                    if char in '[{':
+                        if json_start == -1:
+                            json_start = i
+                        bracket_stack.append(char)
+                    elif char in ']}':
+                        if bracket_stack:
+                            opening = bracket_stack.pop()
+                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                                if not bracket_stack:  # Found complete JSON
+                                    json_end = i + 1
+                                    break
+
+                if json_start >= 0 and json_end > json_start:
+                    json_str = raw[json_start:json_end]
+                    parsed = json.loads(json_str)
+                else:
+                    if '[]' in raw:
+                        parsed = []
+                    else:
+                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
+
+            # Store both the original and normalized response
+            normalized_response = _normalize_sequence_response(parsed)
+            responses.append((parsed, normalized_response))
+
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+
+        except Exception as e:
+            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
+            responses.append(None)
+
+        # Check for early consensus after 2 attempts
+        if len(responses) == 2:
+            if (responses[0] and responses[1] and
+                _sequences_match(responses[0][1], responses[1][1])):
+                log.info("Sequence extraction consensus reached after 2 attempts")
+                return responses[0][0]  # Return original parsed data
+            else:
+                log.info("Sequence extraction mismatch after 2 attempts - trying third")
+
+    # After all attempts, use adaptive validation
+    valid_responses = [r for r in responses if r is not None]
+
+    if not valid_responses:
+        log.error(f"All {max_attempts} sequence extraction attempts failed")
+        return None
+
+    # First, try to find exact consensus (any matching pair)
+    for i in range(len(valid_responses)):
+        for j in range(i + 1, len(valid_responses)):
+            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
+                log.info(f"Sequence extraction consensus found: attempts with matching content")
+                return valid_responses[i][0]  # Return original parsed data
+
+    # If no exact consensus, use adaptive validation
+    log.info("No exact consensus found, applying adaptive validation...")
+
+    # Find sequences that appear consistently across multiple attempts
+    consistent_sequences = _find_consistent_sequences(valid_responses)
+
+    if consistent_sequences:
+        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
+        return consistent_sequences
+
+    # If still no consensus, use the attempt with the most sequences
+    best_response = max(valid_responses,
+                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
+
+    if best_response and len(best_response[1]) > 0:
+        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
+        return best_response[0]
+
+    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
+    return None
+
+
+def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
+    """Find sequences that appear consistently across multiple extraction attempts.
+
+    Args:
+        valid_responses: List of (original_data, normalized_data) tuples
+
+    Returns:
+        List of consistent sequences with confidence scores, or None if none found
+    """
+    if not valid_responses:
+        return None
+
+    # Count how many times each sequence appears
+    sequence_counts = {}
+    sequence_full_data = {}
+
+    for original, normalized in valid_responses:
+        if not isinstance(normalized, list):
+            continue
+
+        for seq in normalized:
+            variant_id = seq.get("variant_id", "")
+            aa_seq = seq.get("aa_seq", "")
+            # Clean sequence before using in key
+            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
+
+            # Create a unique key for this sequence
+            key = f"{variant_id}|{aa_seq_clean}"
+
+            if key not in sequence_counts:
+                sequence_counts[key] = 0
+                sequence_full_data[key] = []
+
+            sequence_counts[key] += 1
+
+            # Find the full data for this sequence from the original response
+            if isinstance(original, list):
+                for orig_seq in original:
+                    if (orig_seq.get("variant_id") == variant_id and
+                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
+                        sequence_full_data[key].append(orig_seq)
+                        break
+
+    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
+    min_appearances = max(2, len(valid_responses) // 2)
+    consistent_sequences = []
+
+    for key, count in sequence_counts.items():
+        if count >= min_appearances:
+            # Use the first occurrence of the full data
+            if sequence_full_data[key]:
+                seq_data = sequence_full_data[key][0].copy()
+                # Add confidence based on how many times it appeared
+                seq_data["confidence"] = count / len(valid_responses)
+                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
+                consistent_sequences.append(seq_data)
+
+    return consistent_sequences if consistent_sequences else None
+
+
+def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
+    """Normalize sequence response for comparison."""
+    if not isinstance(data, list):
+        return []
+
+    normalized = []
+    for item in data:
+        if isinstance(item, dict):
+            # Extract key fields for comparison
+            normalized_item = {
+                "variant_id": item.get("variant_id", ""),
+                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
+                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
+                "confidence": item.get("confidence", 0.0)
+            }
+            normalized.append(normalized_item)
+
+    # Sort by variant_id for consistent comparison
+    return sorted(normalized, key=lambda x: x["variant_id"])
+
+
+def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
+    """Check if two sequence response lists match on key fields."""
+    if len(seq1) != len(seq2):
+        return False
+
+    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
+        # Compare variant IDs
+        if s1.get("variant_id") != s2.get("variant_id"):
+            return False
+
+        # Compare amino acid sequences (most critical)
+        aa1 = s1.get("aa_seq", "")
+        aa2 = s2.get("aa_seq", "")
+        if aa1 and aa2 and aa1 != aa2:
+            return False
+        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
+            return False
+
+        # Compare DNA sequences if present
+        dna1 = s1.get("dna_seq", "")
+        dna2 = s2.get("dna_seq", "")
+        if dna1 and dna2 and dna1 != dna2:
+            return False
+
+    return True
+
+
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
+    """Prompt Gemini and convert its JSON reply into SequenceBlock objects with triple validation."""
     base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
@@ -1864,8 +2624,50 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt

-    data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
-    return _parse_sequences(data)
+    # Add mutation validation context if we have lineage variants with mutations
+    if lineage_variants:
+        mutation_context = _build_mutation_validation_context(lineage_variants)
+        if mutation_context:
+            prompt = f"""{prompt}
+
+CRITICAL MUTATION VALIDATION:
+{mutation_context}
+
+IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
+For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
+"""
+
+    # Save the complete prompt for debugging
+    if debug_dir:
+        debug_path = Path(debug_dir)
+        debug_path.mkdir(parents=True, exist_ok=True)
+        prompt_file = debug_path / f"sequence_extraction_prompt_{int(time.time())}.txt"
+        with open(prompt_file, 'w') as f:
+            f.write(f"=== SEQUENCE EXTRACTION PROMPT ===\n")
+            f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"Text length: {len(text)} characters\n")
+            f.write(f"Truncated to: {len(text[:MAX_CHARS])} characters\n")
+            f.write(f"Total prompt length: {len(prompt)} characters\n")
+            f.write("="*80 + "\n\n")
+            f.write(prompt)
+        log.info(f"Saved sequence extraction prompt to {prompt_file}")
+
+    # Use triple validation for sequence extraction
+    log.info("Extracting sequences with triple validation to ensure accuracy")
+    data = _extract_sequences_with_triple_validation(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir)
+
+    if not data:
+        log.warning("Failed to get consistent sequence extraction after triple validation")
+        return []
+
+    extracted_sequences = _parse_sequences(data)
+
+    # Post-process: validate sequences against mutations if we have lineage info
+    if lineage_variants:
+        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
+        return validated_sequences
+
+    return extracted_sequences

 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
 _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
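A hedged sketch of calling the extended `extract_sequences` signature above; the `model` client and `variants` list are assumed to come from earlier pipeline stages, not shown here:

# Hypothetical call site; `variants` would come from get_lineage().
blocks = extract_sequences(full_text, model, debug_dir="debug", lineage_variants=variants)
for b in blocks:
    log.info("%s: %d aa", b.variant_id, len(b.aa_seq or ""))
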
@@ -1916,6 +2718,167 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
     )
     return blocks

+def _build_mutation_validation_context(lineage_variants: List[Variant]) -> str:
+    """Build mutation context for sequence validation."""
+    mutation_info = []
+
+    for variant in lineage_variants:
+        if variant.mutations and variant.parent_id:
+            mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+            mutation_info.append(f"Variant '{variant.variant_id}' (parent: '{variant.parent_id}') has mutations: {mutations_str}")
+
+    if not mutation_info:
+        return ""
+
+    context = "Known mutation relationships:\n" + "\n".join(mutation_info[:10])  # Limit to first 10 for context
+    if len(mutation_info) > 10:
+        context += f"\n... and {len(mutation_info) - 10} more variants with mutations"
+
+    return context
+
+def _validate_sequences_against_mutations(sequences: List[SequenceBlock], lineage_variants: List[Variant], model, debug_dir: str | Path | None = None) -> List[SequenceBlock]:
+    """Validate extracted sequences against known mutations and fix inconsistencies."""
+    # Create lookups for easier access
+    seq_lookup = {seq.variant_id: seq for seq in sequences}
+    variant_lookup = {var.variant_id: var for var in lineage_variants}
+
+    validation_issues = []
+    corrected_sequences = []
+
+    for seq in sequences:
+        variant = variant_lookup.get(seq.variant_id)
+        if not variant or not variant.parent_id or not variant.mutations or not seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        parent_seq = seq_lookup.get(variant.parent_id)
+        if not parent_seq or not parent_seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        # Check if mutations are consistent
+        issues = _check_mutation_consistency(seq.aa_seq, parent_seq.aa_seq, variant.mutations, seq.variant_id, variant.parent_id)
+
+        if issues:
+            validation_issues.extend(issues)
+            log.warning(f"Sequence validation issues for {seq.variant_id}: {'; '.join(issues)}")
+
+            # Try to get corrected sequence from Gemini
+            corrected_seq = _get_corrected_sequence_from_gemini(seq, parent_seq, variant, issues, model, debug_dir)
+            if corrected_seq:
+                corrected_sequences.append(corrected_seq)
+                log.info(f"Corrected sequence for {seq.variant_id} using Gemini validation")
+            else:
+                corrected_sequences.append(seq)  # Keep original if correction fails
+        else:
+            corrected_sequences.append(seq)
+
+    if validation_issues:
+        log.warning(f"Found {len(validation_issues)} sequence validation issues across {len([s for s in sequences if s.variant_id in [v.variant_id for v in lineage_variants if v.mutations]])} variants with mutations")
+
+    return corrected_sequences
+
+def _check_mutation_consistency(child_seq: str, parent_seq: str, mutations, child_id: str, parent_id: str) -> List[str]:
+    """Check if mutations are consistent between parent and child sequences."""
+    import re
+
+    issues = []
+
+    # Parse mutations (handle both string and list formats)
+    if isinstance(mutations, list):
+        mutation_strs = mutations
+    else:
+        mutation_strs = [m.strip() for m in str(mutations).split(',') if m.strip()]
+
+    for mut_str in mutation_strs:
+        # Parse mutation like "A100V"
+        match = re.match(r'^([A-Z])(\d+)([A-Z])$', mut_str.strip())
+        if not match:
+            continue  # Skip non-standard mutation formats
+
+        orig_aa, pos_str, new_aa = match.groups()
+        pos = int(pos_str) - 1  # Convert to 0-based indexing
+
+        # Check bounds
+        if pos >= len(parent_seq) or pos >= len(child_seq):
+            issues.append(f"Mutation {mut_str} position out of bounds")
+            continue
+
+        # Check parent sequence has expected original amino acid
+        if parent_seq[pos] != orig_aa:
+            issues.append(f"Mutation {mut_str}: parent {parent_id} has {parent_seq[pos]} at position {pos+1}, expected {orig_aa}")
+
+        # Check child sequence has expected new amino acid
+        if child_seq[pos] != new_aa:
+            issues.append(f"Mutation {mut_str}: child {child_id} has {child_seq[pos]} at position {pos+1}, expected {new_aa}")
+
+    return issues
+
+def _get_corrected_sequence_from_gemini(seq: SequenceBlock, parent_seq: SequenceBlock, variant: Variant, issues: List[str], model, debug_dir: str | Path | None = None) -> SequenceBlock | None:
+    """Use Gemini to get a corrected sequence based on mutation validation issues."""
+    if not model:
+        return None
+
+    mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+    issues_str = "; ".join(issues)
+
+    prompt = f"""You extracted a sequence for variant "{seq.variant_id}" but there are mutation validation issues:
+
+ISSUES: {issues_str}
+
+PARENT SEQUENCE ({variant.parent_id}):
+{parent_seq.aa_seq}
+
+EXTRACTED SEQUENCE ({seq.variant_id}):
+{seq.aa_seq}
+
+EXPECTED MUTATIONS: {mutations_str}
+
+Based on the parent sequence and the expected mutations, provide the CORRECT sequence for {seq.variant_id}.
+Apply each mutation to the parent sequence in order.
+
+For example, if parent has "A" at position 100 and mutation is "A100V", then child should have "V" at position 100.
+
+IMPORTANT SEQUENCE RULES:
+- Copy the sequence EXACTLY - do not add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Preserve the exact length of the sequence
+- Only change the specific positions indicated by the mutations
+- Double-check that consecutive identical amino acids are copied correctly
+
+Return ONLY the corrected amino acid sequence (no explanation, no formatting).
+If you cannot determine the correct sequence, return "UNCERTAIN".
+"""
+
+    try:
+        if debug_dir:
+            import time
+            timestamp = int(time.time())
+            prompt_file = Path(debug_dir) / f"sequence_validation_{seq.variant_id}_{timestamp}.txt"
+            _dump(prompt, prompt_file)
+
+        # Use triple validation for sequence correction
+        log.info(f"Correcting sequence for {seq.variant_id} with triple validation")
+        corrected_seq = _extract_plain_sequence_with_triple_validation(prompt, model, f"correction for {seq.variant_id}")
+
+        if debug_dir and corrected_seq:
+            response_file = Path(debug_dir) / f"sequence_validation_response_{seq.variant_id}_{timestamp}.txt"
+            _dump(corrected_seq, response_file)
+
+        if corrected_seq and corrected_seq not in ["UNCERTAIN", "VALID"] and _clean_seq(corrected_seq, _VALID_AA):
+            return SequenceBlock(
+                variant_id=seq.variant_id,
+                aa_seq=corrected_seq,
+                dna_seq=seq.dna_seq,
+                confidence=0.8,  # Lower confidence for corrected sequences
+                truncated=seq.truncated
+            )
+
+    except Exception as e:
+        log.warning(f"Failed to get corrected sequence for {seq.variant_id}: {e}")
+
+    return None
+
 # --- 7.5 Convenience wrapper -------------------------------------------------
 def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
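A worked toy example of the "A100V"-style consistency rule enforced above (sequences invented; positions in the mutation string are 1-based):

# Toy check mirroring _check_mutation_consistency for mutation "A3V":
parent = "MKAV"   # 'A' at 1-based position 3
child = "MKVV"    # 'V' at 1-based position 3
pos = 3 - 1       # convert to 0-based indexing, as the function does
assert parent[pos] == "A" and child[pos] == "V"   # consistent: no issue reported
# If child were "MKGV", the check would flag: child has 'G' at position 3, expected 'V'.
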
@@ -1973,6 +2936,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:

         # Fallback to text search if page extraction didn't work
         if not focused_text:
+            log.info("Page extraction did not return text, falling back to text search")
             focused_text = _extract_text_at_locations(
                 text, [best_location],
                 context_chars=max(min_length, 30000),
@@ -1982,6 +2946,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         if focused_text and len(focused_text) < len(text):
             log.info("Reduced text from %d to %d chars using validated location",
                      len(text), len(focused_text))
+        else:
+            log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
+                        len(focused_text) if focused_text else 0, len(text))
         # Build lineage context if available
         lineage_context = None
         if lineage_variants:
@@ -1993,7 +2960,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                 variant_info.append(info)
             lineage_context = "\n".join(variant_info)

-        return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+        return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
     else:
         log.warning("Location validation failed or returned invalid location: %s",
                     validation.get("reason", "Unknown"))
@@ -2011,7 +2978,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             variant_info.append(info)
         lineage_context = "\n".join(variant_info)

-    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)

 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2077,6 +3044,7 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}

+
 def extract_enzyme_info_with_gemini(
     text: str,
     variants: List[Variant],
@@ -2146,7 +3114,7 @@ If you cannot determine certain fields, set them to null.
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
                 # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, lineage_text, model)
+                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
                 if validated_seq:
                     seq = validated_seq
                     log.info(f"Sequence validated and potentially corrected by Gemini")
@@ -2714,7 +3682,7 @@ def run_pipeline(

     # 1. Prepare raw text ------------------------------------------------------
     # Always load both caption text (for identification) and full text (for extraction)
-    pdf_paths = [p for p in (si_path, manuscript) if p]
+    pdf_paths = [p for p in (manuscript, si_path) if p]
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)