debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -336,7 +336,7 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
  return "\n".join(chunks)


- def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None) -> Optional[bytes]:
+ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None, caption_text: str = "") -> Optional[bytes]:
  """Extract a specific figure from a PDF by finding its caption.

  Returns the figure as PNG bytes if found, None otherwise.
@@ -345,64 +345,49 @@ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Option
  figure_bytes = None

  try:
- # Search for the exact figure caption text
- search_text = figure_id.strip()
+ # Use caption text if provided, otherwise use figure_id
+ if caption_text:
+ # Use first 50 chars of caption for searching (enough to be unique)
+ search_text = caption_text[:50].strip()
+ log.info(f"Searching for figure using caption: '{search_text}...'")
+ else:
+ search_text = figure_id.strip()
+ log.info(f"Searching for figure using ID: '{search_text}'")

  for page_num, page in enumerate(doc):
- # Search for the caption text on this page
- text_instances = page.search_for(search_text)
+ page_text = page.get_text()

- if text_instances:
- log.info(f"Found caption '{figure_id}' on page {page_num + 1}")
+ # Check if caption text appears on this page
+ if search_text in page_text:
+ log.info(f"Found caption on page {page_num + 1}")

- # Get the position of the first instance
- caption_rect = text_instances[0]
+ # Search for the exact text position
+ text_instances = page.search_for(search_text)

- # Get all images on this page
- image_list = page.get_images()
+ if text_instances:
+ # Get the position of the caption
+ caption_rect = text_instances[0]

- if image_list:
- # Find the image closest to and above the caption
- best_img = None
- best_distance = float('inf')
-
- for img_index, img in enumerate(image_list):
- # Get image position
- xref = img[0]
- img_rects = page.get_image_rects(xref)
-
- if img_rects:
- img_rect = img_rects[0]
-
- # Check if image is above the caption and calculate distance
- if img_rect.y1 <= caption_rect.y0: # Image bottom is above caption top
- distance = caption_rect.y0 - img_rect.y1
- if distance < best_distance and distance < 100: # Within reasonable distance
- best_distance = distance
- best_img = xref
-
- if best_img is not None:
- # Extract the identified image
- pix = fitz.Pixmap(doc, best_img)
-
- if pix.n - pix.alpha < 4: # GRAY or RGB
- figure_bytes = pix.tobytes("png")
- else: # Convert CMYK to RGB
- pix2 = fitz.Pixmap(fitz.csRGB, pix)
- figure_bytes = pix2.tobytes("png")
- pix2 = None
- pix = None
-
- # Save to debug directory if provided
- if debug_dir and figure_bytes:
- debug_path = Path(debug_dir)
- debug_path.mkdir(parents=True, exist_ok=True)
- fig_file = debug_path / f"figure_{figure_id.replace(' ', '_').replace('.', '')}_{int(time.time())}.png"
- with open(fig_file, 'wb') as f:
- f.write(figure_bytes)
- log.info(f"Saved figure to: {fig_file}")
-
- break
+ # Instead of trying to extract individual images,
+ # extract the ENTIRE PAGE as an image
+ # This ensures we get the complete figure with all panels
+ log.info(f"Extracting entire page {page_num + 1} containing figure {figure_id}")
+
+ # Use high resolution for clarity
+ mat = fitz.Matrix(3.0, 3.0) # 3x zoom
+ pix = page.get_pixmap(matrix=mat)
+ figure_bytes = pix.tobytes("png")
+
+ # Save the extracted figure if debug is enabled
+ if debug_dir and figure_bytes:
+ debug_path = Path(debug_dir)
+ debug_path.mkdir(parents=True, exist_ok=True)
+ figure_file = debug_path / f"figure_{figure_id.replace(' ', '_')}_{int(time.time())}.png"
+ with open(figure_file, 'wb') as f:
+ f.write(figure_bytes)
+ log.info(f"Saved figure to: {figure_file}")
+
+ break # Found the figure, no need to continue

  finally:
  doc.close()
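
Note: a minimal, self-contained sketch of the whole-page strategy this hunk introduces, assuming PyMuPDF (fitz); the helper name render_page_containing is hypothetical and not part of debase:

    import fitz  # PyMuPDF

    def render_page_containing(pdf_path: str, search_text: str) -> bytes | None:
        """Render the first page whose text contains search_text as a PNG."""
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                if search_text in page.get_text():
                    # 3x zoom, matching the hunk above; higher zoom = sharper PNG
                    pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))
                    return pix.tobytes("png")
            return None
        finally:
            doc.close()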
@@ -465,7 +450,7 @@ def get_model():
  "temperature": 0.0, # Deterministic: always pick the most likely token
  "top_p": 1.0, # Consider all tokens (but temperature=0 will pick the best)
  "top_k": 1, # Only consider the single most likely token
- "max_output_tokens": 32768, # Increased from 8192 to handle larger sequence extractions
+ "max_output_tokens": 65536, # Increased to 2x for handling larger lineage tables and sequences
  }

  # For Gemini 2.5 Flash, disable thinking tokens to save costs
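
Note: for context, a hypothetical sketch of where such a deterministic config lands in google-generativeai; the model name is an assumption for illustration, not pinned by this diff:

    import google.generativeai as genai

    generation_config = {
        "temperature": 0.0,          # deterministic decoding
        "top_p": 1.0,
        "top_k": 1,
        "max_output_tokens": 65536,  # the new, doubled ceiling from this hunk
    }
    # model name assumed for illustration only
    model = genai.GenerativeModel("gemini-2.5-flash", generation_config=generation_config)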
@@ -685,39 +670,39 @@ from typing import List, Dict, Any
  # ---- 6.0 Campaign identification prompts -----------------------------------

  _CAMPAIGN_IDENTIFICATION_PROMPT = """
- You are an expert reader of protein engineering manuscripts.
- Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.
-
- Each campaign represents a separate evolutionary lineage targeting different:
- - Model reactions (e.g., different chemical transformations)
- - Substrate scopes
- - Activities (e.g., different enzymatic reactions)
+ Identify directed evolution LINEAGE campaigns in this manuscript.

+ A campaign is a multi-round directed evolution effort that creates a FAMILY of variants through iterative cycles.
  Look for:
- 1. Different model substrates/products mentioned (e.g., different substrate/product pairs)
- 2. Distinct enzyme lineage names (e.g., different variant naming patterns)
- 3. Separate evolution trees or lineage tables
- 4. Different reaction schemes or transformations
+ - Multiple rounds/generations of evolution (e.g., "8 rounds of evolution", "5 generations")
+ - Lineage trees or variant families (e.g., "L1→L2→L3→L4", "WT→M1→M2→M3")
+ - Progressive improvement through iterations
+ - Parent-child relationships across multiple variants
+
+ Do NOT include:
+ - Single-point mutation studies or individual variant characterization
+ - Simple site-saturation mutagenesis at one position
+
+ IMPORTANT: Include previously evolved lineages IF they are the main focus of THIS paper (e.g., characterizing a previously evolved enzyme lineage with new substrates/conditions)
+
+ Key phrases: "rounds of directed evolution", "iterative evolution", "evolutionary lineage", "variant lineage", "generations of evolution"

  Return a JSON array of campaigns:
  [
  {{
  "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
  "campaign_name": "descriptive name",
- "description": "what this campaign evolved for",
+ "description": "what THIS STUDY evolved for",
  "model_substrate": "substrate name/id",
  "model_product": "product name/id",
  "substrate_id": "id from paper (e.g., 1a)",
  "product_id": "id from paper (e.g., 2a)",
  "data_locations": ["Table S1", "Figure 1"],
  "lineage_hint": "enzyme name pattern",
- "notes": "additional context"
+ "notes": "evidence this was evolved in THIS study"
  }}
  ]

- IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
- Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
-
  TEXT:
  {text}
  """.strip()
@@ -757,26 +742,46 @@ lineage of enzyme variants (i.e. which variant came from which parent and what
  mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
  ensure the location you return are actually lineage location with variants and mutations.

+ IMPORTANT SCORING CRITERIA:
+ - Locations that explicitly mention "lineage" should be scored MUCH HIGHER (90-100)
+ - Locations mentioning "evolutionary tree", "phylogenetic", "genealogy", or "ancestry" should also score high (85-95)
+ - Locations that only mention "variants" without lineage context should score lower (60-80)
+ - Generic tables of variants without parent-child relationships should score lowest (40-60)
+
  Respond with a JSON array of objects, each containing:
- - "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+ - "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
  - "type": one of "table", "figure", "section"
- - "confidence": your confidence score (0-100) that this location contains lineage data
+ - "confidence": your confidence score (0-100) that this location contains lineage data (PRIORITIZE "lineage" mentions!)
  - "reason": brief explanation of why this location likely contains lineage
+ - "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+ - "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
  {campaign_field}
- IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
- NOT page numbers. Focus on the actual figure/table titles and numbers.
-
- Order by confidence score (highest first). Tables showing complete variant lineages or
- mutation lists should be ranked higher than figures showing complete variant lineages.
- Sections are used when no suitable tables/figures exist.
+ CRITICAL INSTRUCTIONS:
+ 1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+ - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+ - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+ 2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+ - This should be the complete caption as it appears in the document
+ - Include at least 200-300 characters to ensure unique matching
+ 3. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+ - Items like "Table S1", "Figure S2", etc. are typically in the SI
+ - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+ - If uncertain, use context clues from the text
+
+ Order by confidence score (highest first), with special priority for:
+ 1. Tables/figures explicitly mentioning "lineage" or "evolutionary tree" (score 90-100)
+ 2. Tables showing complete parent-child relationships with mutations (score 80-95)
+ 3. Figures showing evolutionary/phylogenetic trees (score 75-90)
+ 4. Tables listing variants with parent information (score 70-85)
+ 5. Generic variant tables without clear lineage information (score 40-70)

  Don't include oligonucleotide results or result from only one round.

  Example output:
  [
- {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
- {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
- {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+ {{"location": "Table S1.", "type": "table", "confidence": 98, "reason": "Complete enzyme lineage table with parent-child relationships", "source": "si", "caption": "Table S1. Complete lineage of enzyme variants showing the evolutionary progression from wild-type through eight rounds of directed evolution. Each variant is listed with its parent and accumulated mutations..."{campaign_example}}},
+ {{"location": "Figure 2B", "type": "figure", "confidence": 92, "reason": "Evolutionary tree explicitly showing lineage", "source": "manuscript", "caption": "Figure 2B Evolutionary lineage tree depicting the complete genealogy of engineered variants. Branches show parent-child relationships with mutations annotated..."{campaign_example}}},
+ {{"location": "Table 2", "type": "table", "confidence": 75, "reason": "Variant table with parent information", "source": "manuscript", "caption": "Table 2. Summary of enzyme variants generated in this study. Parent templates and mutations are indicated for each variant..."{campaign_example}}}
  ]
  """.strip()

@@ -908,6 +913,9 @@ def identify_evolution_locations(
  pdf_paths: Optional[List[Path]] = None,
  ) -> List[dict]:
  """Ask Gemini where in the paper the lineage is probably described."""
+ # Extract manuscript pages as images (in addition to text)
+ manuscript_images = []
+
  # Extract table of contents from PDFs if available
  toc_text = ""
  if pdf_paths:
@@ -938,6 +946,27 @@ def identify_evolution_locations(

  if toc_sections:
  toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+ # Extract manuscript pages as images
+ if len(pdf_paths) >= 1:
+ manuscript_pdf = pdf_paths[0]
+ log.info(f"Extracting manuscript pages as images from: {manuscript_pdf.name}")
+
+ doc = _open_doc(manuscript_pdf)
+ try:
+ # Extract up to 10 pages as images
+ for page_num in range(min(10, len(doc))):
+ page = doc[page_num]
+ # Render page as image
+ mat = fitz.Matrix(2, 2) # 2x zoom for better quality
+ pix = page.get_pixmap(matrix=mat)
+ img_bytes = pix.tobytes("png")
+ manuscript_images.append(img_bytes)
+ log.debug(f"Extracted page {page_num + 1} as image ({len(img_bytes)} bytes)")
+ finally:
+ doc.close()
+
+ log.info(f"Extracted {len(manuscript_images)} manuscript pages as images")

  # Include TOC before the main text
  combined_text = toc_text + text if toc_text else text
@@ -979,15 +1008,80 @@ def identify_evolution_locations(
  campaign_specific=campaign_specific,
  campaign_field=campaign_field,
  campaign_example=campaign_example
- ) + "\n\nTEXT:\n" + combined_text
+ )
+
  locs: List[dict] = []
  try:
- locs = generate_json_with_retry(
- model,
- prompt,
- debug_dir=debug_dir,
- tag="locate",
- )
+ if manuscript_images:
+ # Use vision API with manuscript images and SI text
+ log.info("Using vision API with %d manuscript page images and SI text", len(manuscript_images))
+
+ # Convert images to PIL format for Gemini
+ import PIL.Image
+ import io
+
+ pil_images = []
+ for img_bytes in manuscript_images:
+ image = PIL.Image.open(io.BytesIO(img_bytes))
+ pil_images.append(image)
+
+ # Build multimodal prompt with caption text AND manuscript images
+ multimodal_prompt = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
+
+ # Add manuscript page images
+ multimodal_prompt.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
+ multimodal_prompt.extend(pil_images)
+
+ # Save debug info if requested
+ if debug_dir:
+ debug_path = Path(debug_dir)
+ debug_path.mkdir(parents=True, exist_ok=True)
+
+ # Save prompt
+ prompt_file = debug_path / f"locate_vision_prompt_{int(time.time())}.txt"
+ _dump(f"=== VISION PROMPT FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nManuscript pages: {len(manuscript_images)}\nText length: {len(combined_text)} chars\n{'='*80}\n\n{prompt}\n\nTEXT (Captions and sections):\n{combined_text[:2000]}...(truncated)\n\n[{len(manuscript_images)} manuscript page images]",
+ prompt_file)
+
+ # Save manuscript page samples
+ for i, img_bytes in enumerate(manuscript_images[:3]): # Save first 3 pages
+ img_file = debug_path / f"locate_manuscript_page_{i+1}_{int(time.time())}.png"
+ _dump(img_bytes, img_file)
+
+ # Generate content with vision
+ response = model.generate_content(multimodal_prompt)
+ raw = response.text
+
+ # Parse JSON from response
+ try:
+ # Save raw response if debug enabled
+ if debug_dir:
+ response_file = Path(debug_dir) / f"locate_vision_response_{int(time.time())}.txt"
+ _dump(f"=== VISION RESPONSE FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}", response_file)
+
+ # Try to parse JSON
+ try:
+ locs = json.loads(raw)
+ except json.JSONDecodeError:
+ # Try to extract JSON from response
+ json_match = re.search(r'\[.*\]', raw, re.DOTALL)
+ if json_match:
+ locs = json.loads(json_match.group(0))
+ else:
+ log.warning("Could not parse JSON from vision response")
+ locs = []
+ except Exception as e:
+ log.warning(f"Error parsing vision response: {e}")
+ locs = []
+
+ else:
+ # Fall back to text-only mode
+ prompt += "\n\nTEXT:\n" + combined_text
+ locs = generate_json_with_retry(
+ model,
+ prompt,
+ debug_dir=debug_dir,
+ tag="locate",
+ )
  except Exception as exc: # pragma: no cover
  log.warning("identify_evolution_locations(): %s", exc)

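Note: the core of the new vision path, distilled. This assumes (as the hunk does) that google-generativeai accepts a mixed list of strings and PIL images as a single prompt; the helper name locate_with_vision is hypothetical:

    import io
    import PIL.Image

    def locate_with_vision(model, prompt: str, combined_text: str, manuscript_images: list) -> str:
        """Send caption text plus rendered manuscript pages in one multimodal call."""
        parts = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
        parts.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
        parts.extend(PIL.Image.open(io.BytesIO(b)) for b in manuscript_images)
        return model.generate_content(parts).text
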
@@ -1288,7 +1382,7 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:

  return False

- def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
+ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 50000, validate_sequences: bool = False) -> str:
  """Extract text around identified locations."""
  if not locations:
  return text
@@ -1461,10 +1555,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

  # ---- 6.4 Public API -------------------------------------------------------

- def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
- """Extract text from a specific location (table, section, etc.) in the full text."""
+ def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+ """Extract text from a specific location (table, section, etc.) in the full text.
+
+ Args:
+ full_text: The full text to search in
+ location: The location identifier (e.g., "Table S1")
+ location_type: Type of location ("table", "figure", "section")
+ caption_hint: Optional full caption text for fuzzy matching
+ """
  import re

+ # If caption hint is provided, try fuzzy matching first
+ if caption_hint and len(caption_hint) > 20:
+ log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+ # Normalize texts for better matching (similar to reaction_info_extractor)
+ def normalize_for_matching(text):
+ # Remove extra whitespace, normalize spaces around punctuation
+ text = ' '.join(text.split())
+ # Normalize different dash types
+ text = text.replace('–', '-').replace('—', '-')
+ return text
+
+ normalized_hint = normalize_for_matching(caption_hint[:150]) # Use first 150 chars
+ normalized_text = normalize_for_matching(full_text)
+
+ # Try to find ALL caption matches using character-based fuzzy matching
+ all_matches = []
+
+ # Slide through the text looking for all matches above threshold
+ hint_len = len(normalized_hint)
+ for i in range(len(normalized_text) - hint_len + 1):
+ snippet = normalized_text[i:i + hint_len]
+ # Simple character-based similarity
+ matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+ score = matches / hint_len
+
+ if score > 0.7: # 70% similarity threshold
+ all_matches.append({
+ 'norm_pos': i,
+ 'score': score
+ })
+
+ # If we found matches, extract from all of them
+ if all_matches:
+ log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+ # Collect all occurrences from fuzzy matches
+ all_occurrences = []
+ seen_positions = set()
+
+ for match_info in all_matches:
+ # Get the matched text from normalized version
+ matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+ # Find where this appears in the original text
+ best_original_pos = -1
+
+ # Search in the original text for this specific match
+ for i in range(len(full_text) - len(caption_hint) + 1):
+ if i in seen_positions:
+ continue
+
+ original_snippet = full_text[i:i + len(caption_hint)]
+ # Normalize and compare
+ normalized_snippet = normalize_for_matching(original_snippet)
+ if normalized_snippet[:hint_len] == matched_normalized:
+ # Found exact match after normalization
+ best_original_pos = i
+ seen_positions.add(i)
+ break
+
+ if best_original_pos >= 0:
+ # Extract generous context from this match position
+ start = max(0, best_original_pos - 1000)
+ end = min(len(full_text), best_original_pos + 10000)
+ context = full_text[start:end]
+
+ all_occurrences.append({
+ 'position': best_original_pos,
+ 'context': context,
+ 'score': match_info['score']
+ })
+ log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+ if all_occurrences:
+ # Sort by position to maintain document order
+ all_occurrences.sort(key=lambda x: x['position'])
+
+ # Combine all occurrences
+ combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+ for i, occurrence in enumerate(all_occurrences, 1):
+ combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+ combined_text += occurrence['context']
+ combined_text += "\n\n"
+
+ # Apply same limit as table extraction
+ if len(combined_text) > 150000:
+ combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+ log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+ return combined_text
+ else:
+ log.warning(f"Could not map any fuzzy matches back to original text")
+ else:
+ log.warning(f"No fuzzy matches found for caption above 70% threshold")
+
  if location_type == 'table':
  # Find ALL mentions of this table and combine them
  location_clean = location.strip()
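
Note: the fuzzy caption matcher above, distilled to its scoring core. The position-by-position character overlap is intentionally simple; difflib.SequenceMatcher.ratio() would be a reasonable drop-in if insertions and deletions also need to be tolerated:

    def find_fuzzy_matches(hint: str, text: str, threshold: float = 0.7) -> list:
        """Slide the normalized hint across the text; score by per-character agreement."""
        n = len(hint)
        matches = []
        for i in range(len(text) - n + 1):
            window = text[i:i + n]
            score = sum(a == b for a, b in zip(hint, window)) / n
            if score > threshold:
                matches.append((i, score))
        return matches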
@@ -1506,6 +1704,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->

  log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")

+ # Sort occurrences by position to maintain document order
+ all_occurrences.sort(key=lambda x: x['position'])
+
  # Combine all occurrences into one text for Gemini to analyze
  combined_text = f"=== All occurrences of {location_clean} ===\n\n"

@@ -1515,8 +1716,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
  combined_text += "\n\n"

  # Limit total length to avoid overwhelming the model
- if len(combined_text) > 50000:
- combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+ # Increased limit to ensure actual table content is included
+ if len(combined_text) > 150000:
+ combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"

  return combined_text

@@ -1600,6 +1802,8 @@ def get_lineage(
  *,
  pdf_paths: Optional[List[Path]] = None,
  debug_dir: str | Path | None = None,
+ manuscript_text: Optional[str] = None,
+ si_text: Optional[str] = None,
  ) -> Tuple[List[Variant], List[Campaign]]:
  """
  High-level wrapper used by the pipeline.
@@ -1667,54 +1871,72 @@ def get_lineage(
  for loc in locations:
  log.info(f" - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")

- # Try to extract from the best location
+ # Sort locations by confidence and use the highest confidence one
+ locations_sorted = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+ log.info(f"Using highest confidence location: {locations_sorted[0]['location']} (confidence: {locations_sorted[0]['confidence']})")
+
+ # Use the highest confidence location as primary location
+ primary_location = locations_sorted[0]
+
+ # Extract location details
+ location_str = primary_location.get('location', '')
+ location_type = primary_location.get('type', '')
+ confidence = primary_location.get('confidence', 0)
+ caption_text = primary_location.get('caption', '')
+
+ # Initialize extracted variants list
  extracted_variants = []
- for location in locations:
- if extracted_variants:
- break # Already got variants
-
- location_str = location.get('location', '')
- location_type = location.get('type', '')
- confidence = location.get('confidence', 0)
+
+ # Try figure extraction for high-confidence figures
+ if location_type == 'figure' and confidence >= 70 and pdf_paths:
+ log.info(f"Attempting to extract figure: {location_str}")

- # Try figure extraction for high-confidence figures
- if location_type == 'figure' and confidence >= 70 and pdf_paths:
- log.info(f"Attempting to extract figure: {location_str}")
-
- figure_bytes = None
- for pdf_path in pdf_paths:
- figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
- if figure_bytes:
- log.info(f"Successfully extracted figure from {pdf_path.name}")
- break
-
+ figure_bytes = None
+ for pdf_path in pdf_paths:
+ figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text=caption_text)
  if figure_bytes:
- # Save figure if debug enabled
- if debug_dir:
- debug_path = Path(debug_dir)
- debug_path.mkdir(parents=True, exist_ok=True)
- figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
- _dump(figure_bytes, figure_file)
- log.info(f"Saved figure to: {figure_file}")
-
- # Extract lineage from figure
- variants = extract_lineage_from_figure(
- figure_bytes, model,
- debug_dir=debug_dir,
- campaign_id=campaign.campaign_id,
- campaign_info=campaign
- )
- if variants:
- log.info(f"Extracted {len(variants)} variants from figure")
- extracted_variants = variants
- continue
+ log.info(f"Successfully extracted figure from {pdf_path.name}")
+ break

- # Try table/text extraction
- if location_type in ['table', 'text', 'section'] and not extracted_variants:
+ if figure_bytes:
+ # Save figure if debug enabled
+ if debug_dir:
+ debug_path = Path(debug_dir)
+ debug_path.mkdir(parents=True, exist_ok=True)
+ figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+ _dump(figure_bytes, figure_file)
+ log.info(f"Saved figure to: {figure_file}")
+
+ # Extract lineage from figure
+ variants = extract_lineage_from_figure(
+ figure_bytes, model,
+ debug_dir=debug_dir,
+ campaign_id=campaign.campaign_id,
+ campaign_info=campaign
+ )
+ if variants:
+ log.info(f"Extracted {len(variants)} variants from figure")
+ extracted_variants = variants
+
+ # Try table/text extraction if no figure extraction or if not a figure
+ if not extracted_variants and location_type in ['table', 'text', 'section']:
  log.info(f"Attempting text extraction for {location_type}: {location_str}")

- # Extract the specific section/table from full text
- section_text = _extract_location_text(full_text, location_str, location_type)
+ # Determine which text to use based on source
+ location_source = location.get('source', 'manuscript')
+ if location_source == 'si' and si_text:
+ text_to_search = si_text
+ log.info(f"Using SI text for location {location_str}")
+ elif location_source == 'manuscript' and manuscript_text:
+ text_to_search = manuscript_text
+ log.info(f"Using manuscript text for location {location_str}")
+ else:
+ text_to_search = full_text
+ log.info(f"Using combined text for location {location_str} (fallback)")
+
+ # Extract the specific section/table from appropriate text
+ caption_hint = location.get('caption', '')
+ section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
  if section_text:
  log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
  # Save extracted section if debug enabled
@@ -1940,8 +2162,9 @@ def get_lineage(

  # Try to extract the figure from available PDFs
  figure_bytes = None
+ # Note: This fallback path doesn't have the caption text
  for pdf_path in pdf_paths:
- figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+ figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text="")
  if figure_bytes:
  log.info("Successfully extracted figure from %s", pdf_path.name)
  break
@@ -1980,7 +2203,7 @@ def get_lineage(
  # Use text-based extraction (works for tables and text sections)
  # Extract from full text, not caption text - use only primary location
  # Use more context for tables since they often span multiple pages
- context_size = 15000 if location_type == 'table' else 5000
+ context_size = 75000 if location_type == 'table' else 50000
  focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
  log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
  len(full_text), len(focused_text),
@@ -2028,17 +2251,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.

  Look for table of contents entries or section listings that mention sequences.
  Return a JSON array where each element has:
- - "section": the section heading or description
+ - "section": the section heading or description EXACTLY as it appears
  - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+ - "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+ - "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)

  Focus on:
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
  - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
  - Prioritize sections that mention "protein" or "amino acid" sequences

- CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
- - Correct: "53", "S12", "147"
- - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+ CRITICAL:
+ 1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+ - Correct: "53", "S12", "147"
+ - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+ 2. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+ - Pages with "S" prefix (e.g., "S53") are typically in the SI
+ - Regular page numbers (e.g., "53") are typically in the main manuscript
+ - Use context clues from the document structure

  Return [] if no sequence sections are found.
  Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -2236,7 +2466,7 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
  # Fallback to text search if page extraction didn't work
  if not sample_text:
  sample_text = _extract_text_at_locations(
- text, [location], context_chars=2000, validate_sequences=False
+ text, [location], context_chars=20000, validate_sequences=False
  )

  samples.append({
@@ -2278,44 +2508,30 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:

  # --- 7.3 Main extraction prompt ---------------------------------------------
  _SEQ_EXTRACTION_PROMPT = """
- Extract EVERY distinct enzyme-variant sequence you can find in the text.
-
- IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
- - If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
- - Only extract dna_seq if NO amino acid sequence is available for that variant
- - This reduces redundancy since protein sequences are usually more relevant
-
- CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
- - Papers often use different naming conventions in different sections
- - DO NOT normalize or simplify variant IDs
- - Extract the variant_id exactly as written where the sequence appears
- - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
-
- SEQUENCE EXTRACTION RULES:
- - Copy sequences EXACTLY as they appear in the text
- - Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
- - Do NOT add, remove, or modify any amino acids, or nucleotides
- - Preserve the exact length and character sequence
- - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
- - Double-check that consecutive identical amino acids or nucleotides are copied correctly
-
- For each variant return:
- * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
- * aa_seq - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
- * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
-
- Respond ONLY with **minified JSON** that matches the schema below.
- NO markdown, no code fences, no commentary.
+ Extract ALL enzyme variant sequences from the text. Copy sequences EXACTLY as they appear - character by character.

- Schema:
- ```json
- {schema}
- ```
+ KEY RULES:
+ 1. EXHAUSTIVE SEARCH: If a variant appears multiple times, check ALL occurrences and extract the LONGEST sequence
+ 2. MULTI-PAGE: Sequences span pages. Skip page numbers (66, 67, etc.) that interrupt sequences
+ 3. MERGE IF NEEDED: If sequence continues after page break, combine the parts
+ 4. NO MODIFICATIONS: Copy exactly - no edits or improvements

- TEXT (may be truncated):
- ```
+ IMPORTANT: The same variant may appear multiple times with different sequence lengths. Always use the longest one.
+
+ SEQUENCE PRIORITY:
+ - If BOTH amino acid AND DNA exist → use amino acid ONLY
+ - For DNA: If mixed case, extract UPPERCASE only (lowercase=backbone)
+ - Return minified JSON only
+
+ ACCURACY:
+ - Extract ONLY what's written
+ - Never hallucinate
+ - Check entire document - complete sequences often appear later
+
+ Schema: {schema}
+
+ TEXT:
  {text}
- ```
  """.strip()

  def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
@@ -2390,7 +2606,7 @@ def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list,


  def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
- """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+ """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.

  Can exit early after 2 attempts if the responses match exactly.

@@ -2404,9 +2620,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
  The most common sequence JSON data or None if all attempts failed
  """
  responses = []
- max_attempts = 6
+ max_attempts = 5 # 5 attempts for better consensus

- # Try 6 times with early match detection
+ # Try 5 times with early match detection
  for attempt in range(max_attempts):
  try:
  log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2432,8 +2648,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s

  # Try to parse as JSON
  try:
- parsed = json.loads(raw)
- except json.JSONDecodeError:
+ # First clean the response - remove any BOM or invisible characters
+ raw_clean = raw.strip()
+ if raw_clean.startswith('\ufeff'): # Remove BOM if present
+ raw_clean = raw_clean[1:]
+ parsed = json.loads(raw_clean)
+ except json.JSONDecodeError as e:
+ log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
  # Look for JSON array or object in the response
  json_start = -1
  json_end = -1
@@ -2482,17 +2703,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
  responses.append(parsed)
  log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")

- # Early match detection after 2 attempts
- if attempt >= 1: # After 2nd attempt (0-indexed)
- valid_responses_so_far = [r for r in responses if r is not None]
- if len(valid_responses_so_far) >= 2:
- # Check if the last two valid responses match
- if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
- log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
- # Add the matching response 4 more times to simulate consensus
- for _ in range(max_attempts - attempt - 1):
- responses.append(valid_responses_so_far[-1])
- break
+ # If we got a good response with sequences, we can check for early termination
+ if isinstance(parsed, list) and len(parsed) > 0:
+ # Early match detection after 2 attempts
+ if attempt >= 1: # After 2nd attempt (0-indexed)
+ valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+ if len(valid_responses_so_far) >= 2:
+ # Check if the last two valid responses match
+ if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+ log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+ # Add the matching response to fill remaining attempts
+ for _ in range(max_attempts - attempt - 1):
+ responses.append(valid_responses_so_far[-1])
+ break
+ # If this is the first attempt and we got sequences, continue to validate with at least one more
+ elif attempt == 0 and len(parsed) > 5: # Got substantial sequences on first try
+ log.info("Got substantial sequences on first attempt, will validate with one more")

  except Exception as e:
  log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
@@ -2511,28 +2737,39 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
  if isinstance(resp, list):
  for seq in resp:
  if isinstance(seq, dict) and "variant_id" in seq:
- # Create a key for this sequence (variant_id + cleaned aa_seq)
+ # Create a key for this sequence (variant_id + cleaned sequence)
  variant_id = seq.get("variant_id", "")
  aa_seq = seq.get("aa_seq", "")
+ dna_seq = seq.get("dna_seq", "")
+
+ # Clean sequences for comparison
  if aa_seq:
  aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
- key = f"{variant_id}|{aa_seq}"
+ if dna_seq:
+ dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+
+ # Use whichever sequence is present for the key
+ seq_for_key = aa_seq if aa_seq else (dna_seq if dna_seq else "")
+ key = f"{variant_id}|{seq_for_key}"

  if key not in sequence_counts:
  sequence_counts[key] = {"count": 0, "data": seq}
  sequence_counts[key]["count"] += 1

- # Build result with sequences that appear in at least 3 attempts
+ # Build result with sequences that appear in at least 2 attempts
+ # Sort by count (descending) to prioritize sequences with higher consensus
  result = []
- for key, info in sequence_counts.items():
- if info["count"] >= 3: # Appears in at least 3/6 attempts
+ sorted_sequences = sorted(sequence_counts.items(), key=lambda x: x[1]["count"], reverse=True)
+
+ for key, info in sorted_sequences:
+ if info["count"] >= 2: # Appears in at least 2/5 attempts
  seq_data = info["data"].copy()
  seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
  result.append(seq_data)
  log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")

  if result:
- log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+ log.info(f"Extracted {len(result)} sequences with at least 2/{max_attempts} consensus")
  return result

  # If no sequences appear twice, return the most complete attempt
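
Note: the consensus rule above, distilled: key each extracted record by (variant_id, cleaned sequence) and keep anything seen in at least 2 of the 5 attempts, highest counts first. A minimal sketch under those assumptions, not the package's exact code:

    from collections import Counter

    def sequence_consensus(responses, min_votes=2):
        counts = Counter()
        by_key = {}
        for resp in responses:
            for seq in resp:
                raw = seq.get("aa_seq") or seq.get("dna_seq") or ""
                key = (seq.get("variant_id", ""), raw.replace(" ", "").replace("\n", "").upper())
                counts[key] += 1
                by_key[key] = seq
        # most_common() yields highest-consensus sequences first
        return [by_key[k] for k, c in counts.most_common() if c >= min_votes]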
@@ -2628,11 +2865,30 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
  if aa and len(aa) <= 50:
  log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
  aa = None
- if dna and len(dna) <= 150:
- log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
- dna = None
+
+ # Validate DNA sequences
+ if dna:
+ if len(dna) <= 150:
+ log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+ dna = None
+ # Check if DNA sequence length is divisible by 3
+ elif len(dna) % 3 != 0:
+ log.warning(f"Skipping DNA sequence for {vid}: length {len(dna)} not divisible by 3")
+ dna = None
+ else:
+ # Check for stop codons in the middle of the sequence
+ stop_codons = {'TAA', 'TAG', 'TGA'}
+ has_internal_stop = False
+ for i in range(0, len(dna) - 3, 3):
+ codon = dna[i:i+3]
+ if codon in stop_codons:
+ log.warning(f"Skipping DNA sequence for {vid}: internal stop codon {codon} at position {i}")
+ has_internal_stop = True
+ break
+ if has_internal_stop:
+ dna = None

- # Skip if both sequences are too short or missing
+ # Skip if both sequences are invalid or missing
  if not aa and not dna:
  continue

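Note: the new DNA sanity checks above, condensed into one predicate: reject short sequences, out-of-frame lengths, and internal stop codons (the range stops three bases short of the end, so a terminal stop codon is allowed). A sketch assuming an already-cleaned uppercase sequence:

    STOP_CODONS = {"TAA", "TAG", "TGA"}

    def dna_is_plausible(dna: str) -> bool:
        if len(dna) <= 150 or len(dna) % 3 != 0:
            return False
        # scan codon-by-codon, excluding the final (possibly terminal-stop) codon
        return all(dna[i:i + 3] not in STOP_CODONS for i in range(0, len(dna) - 3, 3))
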
@@ -2852,9 +3108,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
  focused_text = ""
  if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
  page_num = best_location['page']
- # Extract current page plus next 15 pages
+ # Extract current page plus next 5 pages (6 total) to prevent hallucination
  all_pages = []
- for i in range(16): # Current + next 15
+ for i in range(6): # Current + next 5 (6 pages total)
  if isinstance(page_num, str) and page_num.upper().startswith('S'):
  next_page = f"S{int(page_num[1:]) + i}"
  else:
@@ -2866,7 +3122,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
  break
  if all_pages:
  focused_text = "\n".join(all_pages)
- log.info("Extracted %d chars from pages %s through %d more pages",
+ log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
  len(focused_text), page_num, len(all_pages) - 1)

  # Fallback to text search if page extraction didn't work
@@ -2874,7 +3130,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
  log.info("Page extraction did not return text, falling back to text search")
  focused_text = _extract_text_at_locations(
  text, [best_location],
- context_chars=max(min_length, 30000),
+ context_chars=max(min_length, 50000),
  validate_sequences=True
  )

@@ -3152,6 +3408,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
  return {}


+ def _match_variant_ids_with_gemini(
+ lineage_variant_ids: List[str],
+ pdb_variant_ids: List[str],
+ model
+ ) -> Dict[str, str]:
+ """Use Gemini to match variant IDs that may have slight formatting differences.
+
+ Args:
+ lineage_variant_ids: List of variant IDs from the lineage
+ pdb_variant_ids: List of variant IDs from PDB matching
+ model: Gemini model for matching
+
+ Returns:
+ Dictionary mapping lineage_variant_id -> pdb_variant_id
+ """
+ if not lineage_variant_ids or not pdb_variant_ids or not model:
+ return {}
+
+ # If the lists are identical, return direct mapping
+ if set(lineage_variant_ids) == set(pdb_variant_ids):
+ return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+ # Use Gemini to match variant IDs that may have formatting differences
+ prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+ These represent the same enzyme variants but may be formatted differently.
+
+ Lineage variant IDs:
+ {json.dumps(lineage_variant_ids, indent=2)}
+
+ PDB variant IDs:
+ {json.dumps(pdb_variant_ids, indent=2)}
+
+ Match variants that represent the SAME enzyme variant, accounting for:
+ - Whitespace differences (extra spaces, tabs)
+ - Character encoding differences
+ - Minor formatting variations
+
+ Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+ Format: {{"lineage_id": "pdb_id", ...}}
+ Only include matches you are confident represent the same variant.
+ Return an empty object {{}} if no matches can be confidently made.
+ """
+
+ try:
+ response = model.generate_content(prompt)
+ text = _extract_text(response).strip()
+
+ # Parse JSON response
+ if text.startswith("```"):
+ text = text.split("```")[1].strip()
+ if text.startswith("json"):
+ text = text[4:].strip()
+
+ # Clean up the text
+ text = text.strip()
+ if not text or text == "{}":
+ return {}
+
+ matches = json.loads(text)
+ log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+ # Validate matches
+ valid_matches = {}
+ for lineage_id, pdb_id in matches.items():
+ if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+ valid_matches[lineage_id] = pdb_id
+ log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+ else:
+ log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+ return valid_matches
+
+ except Exception as e:
+ log.warning(f"Failed to match variant IDs with Gemini: {e}")
+ return {}
+
+
  def match_pdb_to_variants(
  pdb_sequences: Dict[str, str],
  variants: List[Variant],
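
Note: hypothetical usage of the new helper, reconciling lineage IDs with the IDs produced during PDB matching before chains are assigned (this mirrors the run_pipeline hunk further below; the example ID "ApePgb GLVRSQL" is taken from the prompt text quoted in the next hunk):

    gemini_mapping = _match_variant_ids_with_gemini(
        lineage_variant_ids=[v.variant_id for v in lineage],
        pdb_variant_ids=list(variant_to_chain.keys()),
        model=model,
    )
    # fall back to the Gemini-reconciled ID when a direct lookup misses
    chain_id = variant_to_chain.get(gemini_mapping.get("ApePgb GLVRSQL", ""))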
@@ -3235,24 +3568,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
  text = _extract_text(response).strip()

  # Parse JSON response (expecting a single string)
- if text.startswith("```"):
+ # Look for JSON code blocks first
+ if "```json" in text:
+ # Extract content between ```json and ```
+ import re
+ json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+ if json_match:
+ json_content = json_match.group(1).strip()
+ try:
+ # Parse as JSON and extract the string value
+ parsed = json.loads(json_content)
+ matched_variant = str(parsed).strip('"\'')
+ except:
+ # If JSON parsing fails, try to extract the quoted string
+ quoted_match = re.search(r'"([^"]+)"', json_content)
+ if quoted_match:
+ matched_variant = quoted_match.group(1)
+ else:
+ matched_variant = json_content.strip('"\'')
+ else:
+ matched_variant = text.strip('"\'')
+ elif text.startswith("```"):
+ # Handle other code blocks
  text = text.split("```")[1].strip()
  if text.startswith("json"):
  text = text[4:].strip()
+ matched_variant = text.strip('"\'')
+ else:
+ # Look for quoted strings in the response
+ import re
+ quoted_match = re.search(r'"([^"]+)"', text)
+ if quoted_match:
+ matched_variant = quoted_match.group(1)
+ else:
+ # Remove quotes if present
+ matched_variant = text.strip('"\'')

- # Remove quotes if present
- text = text.strip('"\'')
-
- matched_variant = text
+ log.info(f"Extracted variant name: '{matched_variant}' from response")
  log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")

  # Return mapping with all chains pointing to the same variant
  mapping = {}
- if matched_variant and any(v.variant_id == matched_variant for v in variants):
- for chain_id in pdb_sequences:
- mapping[matched_variant] = chain_id
- break # Only use the first chain
+ if matched_variant:
+ # Debug logging
+ variant_ids = [v.variant_id for v in variants]
+ log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+ # Check if the matched variant exists in the lineage
+ found_variant = any(v.variant_id == matched_variant for v in variants)
+ log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+ if found_variant:
+ for chain_id in pdb_sequences:
+ mapping[matched_variant] = chain_id
+ log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+ break # Only use the first chain
+ else:
+ log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+ # Try fuzzy matching
+ for variant in variants:
+ if variant.variant_id.strip() == matched_variant.strip():
+ log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+ for chain_id in pdb_sequences:
+ mapping[variant.variant_id] = chain_id
+ log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+ break
+ break
+ else:
+ log.warning("No matched variant extracted from response")

+ log.info(f"Final mapping result: {mapping}")
  return mapping

  except Exception as e:
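
Note: the fence-tolerant parsing above, distilled. It prefers a ```json block, then any fenced block, then the first quoted string; a sketch, not the verbatim package code:

    import json
    import re

    def extract_variant_id(text: str) -> str:
        m = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
        if m:
            block = m.group(1).strip()
            try:
                return str(json.loads(block)).strip('"\'')
            except json.JSONDecodeError:
                text = block
        elif text.startswith("```"):
            text = text.split("```")[1].strip().removeprefix("json").strip()
        quoted = re.search(r'"([^"]+)"', text)
        return quoted.group(1) if quoted else text.strip().strip('"\'')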
@@ -3634,14 +4019,28 @@ def run_pipeline(
  caption_text = limited_caption_concat(*pdf_paths)
  full_text = limited_concat(*pdf_paths)

+ # Also load separate texts for manuscript and SI
+ manuscript_text = limited_concat(manuscript) if manuscript else None
+ si_text = limited_concat(si_path) if si_path else None
+
  log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
  len(caption_text), len(full_text))
+ if manuscript_text:
+ log.info("Loaded %d chars from manuscript", len(manuscript_text))
+ if si_text:
+ log.info("Loaded %d chars from SI", len(si_text))

  # 2. Connect to Gemini -----------------------------------------------------
  model = get_model()

  # 3. Extract lineage (Section 6) ------------------------------------------
- lineage, campaigns = get_lineage(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+ lineage, campaigns = get_lineage(
+ caption_text, full_text, model,
+ pdf_paths=pdf_paths,
+ debug_dir=debug_dir,
+ manuscript_text=manuscript_text,
+ si_text=si_text
+ )

  if not lineage:
  raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3721,12 +4120,40 @@ def run_pipeline(
  pdb_sequences, lineage, full_text, model, pdb_id
  )

+ log.info(f"PDB matching result: {variant_to_chain}")
+ log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+ log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
  # Convert to SequenceBlock objects
  pdb_seq_blocks = []
- for variant in lineage:
- if variant.variant_id in variant_to_chain:
- chain_id = variant_to_chain[variant.variant_id]
- if chain_id in pdb_sequences:
+
+ # Use Gemini-based matching for robust variant ID comparison
+ if variant_to_chain and model:
+ # Create a mapping using Gemini for robust string matching
+ gemini_mapping = _match_variant_ids_with_gemini(
+ lineage_variant_ids=[v.variant_id for v in lineage],
+ pdb_variant_ids=list(variant_to_chain.keys()),
+ model=model
+ )
+
+ for variant in lineage:
+ log.info(f"Processing variant: {variant.variant_id}")
+
+ # Try direct match first
+ chain_id = variant_to_chain.get(variant.variant_id)
+ log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+ # If no direct match, try Gemini-based matching
+ if not chain_id:
+ matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+ log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+ if matched_pdb_variant:
+ chain_id = variant_to_chain.get(matched_pdb_variant)
+ log.info(f"Chain ID from Gemini match: {chain_id}")
+
+ if chain_id and chain_id in pdb_sequences:
+ seq_length = len(pdb_sequences[chain_id])
+ log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
  seq_block = SequenceBlock(
  variant_id=variant.variant_id,
  aa_seq=pdb_sequences[chain_id],
@@ -3737,6 +4164,26 @@ def run_pipeline(
  )
  pdb_seq_blocks.append(seq_block)
  log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+ else:
+ log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+ else:
+ # Fallback to direct matching if no model or no matches
+ for variant in lineage:
+ if variant.variant_id in variant_to_chain:
+ chain_id = variant_to_chain[variant.variant_id]
+ if chain_id in pdb_sequences:
+ seq_block = SequenceBlock(
+ variant_id=variant.variant_id,
+ aa_seq=pdb_sequences[chain_id],
+ dna_seq=None,
+ confidence=1.0, # High confidence for PDB sequences
+ truncated=False,
+ metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+ )
+ pdb_seq_blocks.append(seq_block)
+ log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+ log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")

  if pdb_seq_blocks:
  # Update the dataframe with PDB sequences
@@ -3746,8 +4193,13 @@ def run_pipeline(
  df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
  df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
  df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+ log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+ else:
+ log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
  log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
  break
+ else:
+ log.warning(f"No PDB sequence blocks were created for {pdb_id}")
  else:
  log.warning(f"No sequences found in PDB {pdb_id}")
  else: