debase 0.6.2__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {debase-0.6.2/src/debase.egg-info → debase-0.7.0}/PKG-INFO +1 -1
  2. {debase-0.6.2 → debase-0.7.0}/src/debase/_version.py +1 -1
  3. {debase-0.6.2 → debase-0.7.0}/src/debase/enzyme_lineage_extractor.py +278 -163
  4. {debase-0.6.2 → debase-0.7.0}/src/debase/lineage_format.py +11 -5
  5. {debase-0.6.2 → debase-0.7.0}/src/debase/reaction_info_extractor.py +209 -36
  6. {debase-0.6.2 → debase-0.7.0}/src/debase/substrate_scope_extractor.py +157 -56
  7. {debase-0.6.2 → debase-0.7.0/src/debase.egg-info}/PKG-INFO +1 -1
  8. {debase-0.6.2 → debase-0.7.0}/.gitignore +0 -0
  9. {debase-0.6.2 → debase-0.7.0}/LICENSE +0 -0
  10. {debase-0.6.2 → debase-0.7.0}/MANIFEST.in +0 -0
  11. {debase-0.6.2 → debase-0.7.0}/README.md +0 -0
  12. {debase-0.6.2 → debase-0.7.0}/environment.yml +0 -0
  13. {debase-0.6.2 → debase-0.7.0}/manuscript/DEBase_LLM_Validater.ipynb +0 -0
  14. {debase-0.6.2 → debase-0.7.0}/pyproject.toml +0 -0
  15. {debase-0.6.2 → debase-0.7.0}/setup.cfg +0 -0
  16. {debase-0.6.2 → debase-0.7.0}/setup.py +0 -0
  17. {debase-0.6.2 → debase-0.7.0}/src/__init__.py +0 -0
  18. {debase-0.6.2 → debase-0.7.0}/src/debase/__init__.py +0 -0
  19. {debase-0.6.2 → debase-0.7.0}/src/debase/__main__.py +0 -0
  20. {debase-0.6.2 → debase-0.7.0}/src/debase/build_db.py +0 -0
  21. {debase-0.6.2 → debase-0.7.0}/src/debase/campaign_utils.py +0 -0
  22. {debase-0.6.2 → debase-0.7.0}/src/debase/caption_pattern.py +0 -0
  23. {debase-0.6.2 → debase-0.7.0}/src/debase/cleanup_sequence.py +0 -0
  24. {debase-0.6.2 → debase-0.7.0}/src/debase/wrapper.py +0 -0
  25. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/SOURCES.txt +0 -0
  26. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/dependency_links.txt +0 -0
  27. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/entry_points.txt +0 -0
  28. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/requires.txt +0 -0
  29. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.6.2
+Version: 0.7.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.6.2"
+__version__ = "0.7.0"
@@ -336,7 +336,7 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
     return "\n".join(chunks)
 
 
-def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None) -> Optional[bytes]:
+def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None, caption_text: str = "") -> Optional[bytes]:
     """Extract a specific figure from a PDF by finding its caption.
 
     Returns the figure as PNG bytes if found, None otherwise.
@@ -345,64 +345,49 @@ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Option
     figure_bytes = None
 
     try:
-        # Search for the exact figure caption text
-        search_text = figure_id.strip()
+        # Use caption text if provided, otherwise use figure_id
+        if caption_text:
+            # Use first 50 chars of caption for searching (enough to be unique)
+            search_text = caption_text[:50].strip()
+            log.info(f"Searching for figure using caption: '{search_text}...'")
+        else:
+            search_text = figure_id.strip()
+            log.info(f"Searching for figure using ID: '{search_text}'")
 
         for page_num, page in enumerate(doc):
-            # Search for the caption text on this page
-            text_instances = page.search_for(search_text)
+            page_text = page.get_text()
 
-            if text_instances:
-                log.info(f"Found caption '{figure_id}' on page {page_num + 1}")
+            # Check if caption text appears on this page
+            if search_text in page_text:
+                log.info(f"Found caption on page {page_num + 1}")
 
-                # Get the position of the first instance
-                caption_rect = text_instances[0]
+                # Search for the exact text position
+                text_instances = page.search_for(search_text)
 
-                # Get all images on this page
-                image_list = page.get_images()
+                if text_instances:
+                    # Get the position of the caption
+                    caption_rect = text_instances[0]
 
-                if image_list:
-                    # Find the image closest to and above the caption
-                    best_img = None
-                    best_distance = float('inf')
-
-                    for img_index, img in enumerate(image_list):
-                        # Get image position
-                        xref = img[0]
-                        img_rects = page.get_image_rects(xref)
-
-                        if img_rects:
-                            img_rect = img_rects[0]
-
-                            # Check if image is above the caption and calculate distance
-                            if img_rect.y1 <= caption_rect.y0:  # Image bottom is above caption top
-                                distance = caption_rect.y0 - img_rect.y1
-                                if distance < best_distance and distance < 100:  # Within reasonable distance
-                                    best_distance = distance
-                                    best_img = xref
-
-                    if best_img is not None:
-                        # Extract the identified image
-                        pix = fitz.Pixmap(doc, best_img)
-
-                        if pix.n - pix.alpha < 4:  # GRAY or RGB
-                            figure_bytes = pix.tobytes("png")
-                        else:  # Convert CMYK to RGB
-                            pix2 = fitz.Pixmap(fitz.csRGB, pix)
-                            figure_bytes = pix2.tobytes("png")
-                            pix2 = None
-                        pix = None
-
-                        # Save to debug directory if provided
-                        if debug_dir and figure_bytes:
-                            debug_path = Path(debug_dir)
-                            debug_path.mkdir(parents=True, exist_ok=True)
-                            fig_file = debug_path / f"figure_{figure_id.replace(' ', '_').replace('.', '')}_{int(time.time())}.png"
-                            with open(fig_file, 'wb') as f:
-                                f.write(figure_bytes)
-                            log.info(f"Saved figure to: {fig_file}")
-
-                break
+                # Instead of trying to extract individual images,
+                # extract the ENTIRE PAGE as an image
+                # This ensures we get the complete figure with all panels
+                log.info(f"Extracting entire page {page_num + 1} containing figure {figure_id}")
+
+                # Use high resolution for clarity
+                mat = fitz.Matrix(3.0, 3.0)  # 3x zoom
+                pix = page.get_pixmap(matrix=mat)
+                figure_bytes = pix.tobytes("png")
+
+                # Save the extracted figure if debug is enabled
+                if debug_dir and figure_bytes:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    figure_file = debug_path / f"figure_{figure_id.replace(' ', '_')}_{int(time.time())}.png"
+                    with open(figure_file, 'wb') as f:
+                        f.write(figure_bytes)
+                    log.info(f"Saved figure to: {figure_file}")
+
+                break  # Found the figure, no need to continue
 
     finally:
         doc.close()
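In effect, the rewritten extract_figure no longer hunts for the embedded image object closest to the caption; it finds the page whose text contains the caption (preferring the full caption text over the bare figure ID) and rasterizes that entire page at high resolution. A minimal standalone sketch of the same idea, assuming PyMuPDF (fitz) is installed; the helper name and the 3x zoom factor are illustrative, not the package's API:

    from typing import Optional
    import fitz  # PyMuPDF

    def render_page_with_caption(pdf_path: str, caption_snippet: str) -> Optional[bytes]:
        """Rasterize the first page whose text contains caption_snippet; return PNG bytes."""
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                if caption_snippet in page.get_text():
                    # A whole-page render keeps multi-panel figures intact
                    pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))  # 3x zoom
                    return pix.tobytes("png")
        finally:
            doc.close()
        return None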
@@ -685,39 +670,39 @@ from typing import List, Dict, Any
 # ---- 6.0 Campaign identification prompts -----------------------------------
 
 _CAMPAIGN_IDENTIFICATION_PROMPT = """
-You are an expert reader of protein engineering manuscripts.
-Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.
-
-Each campaign represents a separate evolutionary lineage targeting different:
-- Model reactions (e.g., different chemical transformations)
-- Substrate scopes
-- Activities (e.g., different enzymatic reactions)
+Identify directed evolution LINEAGE campaigns in this manuscript.
 
+A campaign is a multi-round directed evolution effort that creates a FAMILY of variants through iterative cycles.
 Look for:
-1. Different model substrates/products mentioned (e.g., different substrate/product pairs)
-2. Distinct enzyme lineage names (e.g., different variant naming patterns)
-3. Separate evolution trees or lineage tables
-4. Different reaction schemes or transformations
+- Multiple rounds/generations of evolution (e.g., "8 rounds of evolution", "5 generations")
+- Lineage trees or variant families (e.g., "L1→L2→L3→L4", "WT→M1→M2→M3")
+- Progressive improvement through iterations
+- Parent-child relationships across multiple variants
+
+Do NOT include:
+- Single-point mutation studies or individual variant characterization
+- Simple site-saturation mutagenesis at one position
+
+IMPORTANT: Include previously evolved lineages IF they are the main focus of THIS paper (e.g., characterizing a previously evolved enzyme lineage with new substrates/conditions)
+
+Key phrases: "rounds of directed evolution", "iterative evolution", "evolutionary lineage", "variant lineage", "generations of evolution"
 
 Return a JSON array of campaigns:
 [
   {{
     "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
     "campaign_name": "descriptive name",
-    "description": "what this campaign evolved for",
+    "description": "what THIS STUDY evolved for",
     "model_substrate": "substrate name/id",
     "model_product": "product name/id",
     "substrate_id": "id from paper (e.g., 1a)",
     "product_id": "id from paper (e.g., 2a)",
     "data_locations": ["Table S1", "Figure 1"],
     "lineage_hint": "enzyme name pattern",
-    "notes": "additional context"
+    "notes": "evidence this was evolved in THIS study"
   }}
 ]
 
-IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
-Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
-
 TEXT:
 {text}
 """.strip()
@@ -757,10 +742,16 @@ lineage of enzyme variants (i.e. which variant came from which parent and what
 mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
 ensure the location you return are actually lineage location with variants and mutations.
 
+IMPORTANT SCORING CRITERIA:
+- Locations that explicitly mention "lineage" should be scored MUCH HIGHER (90-100)
+- Locations mentioning "evolutionary tree", "phylogenetic", "genealogy", or "ancestry" should also score high (85-95)
+- Locations that only mention "variants" without lineage context should score lower (60-80)
+- Generic tables of variants without parent-child relationships should score lowest (40-60)
+
 Respond with a JSON array of objects, each containing:
 - "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
 - "type": one of "table", "figure", "section"
-- "confidence": your confidence score (0-100) that this location contains lineage data
+- "confidence": your confidence score (0-100) that this location contains lineage data (PRIORITIZE "lineage" mentions!)
 - "reason": brief explanation of why this location likely contains lineage
 - "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
 - "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
@@ -777,17 +768,20 @@ CRITICAL INSTRUCTIONS:
 - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
 - If uncertain, use context clues from the text
 
-Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than figures showing complete variant lineages.
-Sections are used when no suitable tables/figures exist.
+Order by confidence score (highest first), with special priority for:
+1. Tables/figures explicitly mentioning "lineage" or "evolutionary tree" (score 90-100)
+2. Tables showing complete parent-child relationships with mutations (score 80-95)
+3. Figures showing evolutionary/phylogenetic trees (score 75-90)
+4. Tables listing variants with parent information (score 70-85)
+5. Generic variant tables without clear lineage information (score 40-70)
 
 Don't include oligonucleotide results or result from only one round.
 
 Example output:
 [
-  {{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
+  {{"location": "Table S1.", "type": "table", "confidence": 98, "reason": "Complete enzyme lineage table with parent-child relationships", "source": "si", "caption": "Table S1. Complete lineage of enzyme variants showing the evolutionary progression from wild-type through eight rounds of directed evolution. Each variant is listed with its parent and accumulated mutations..."{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 92, "reason": "Evolutionary tree explicitly showing lineage", "source": "manuscript", "caption": "Figure 2B Evolutionary lineage tree depicting the complete genealogy of engineered variants. Branches show parent-child relationships with mutations annotated..."{campaign_example}}},
+  {{"location": "Table 2", "type": "table", "confidence": 75, "reason": "Variant table with parent information", "source": "manuscript", "caption": "Table 2. Summary of enzyme variants generated in this study. Parent templates and mutations are indicated for each variant..."{campaign_example}}}
 ]
 """.strip()
 
@@ -919,6 +913,9 @@ def identify_evolution_locations(
     pdf_paths: Optional[List[Path]] = None,
 ) -> List[dict]:
     """Ask Gemini where in the paper the lineage is probably described."""
+    # Extract manuscript pages as images (in addition to text)
+    manuscript_images = []
+
     # Extract table of contents from PDFs if available
     toc_text = ""
     if pdf_paths:
@@ -949,6 +946,27 @@ def identify_evolution_locations(
 
         if toc_sections:
             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+        # Extract manuscript pages as images
+        if len(pdf_paths) >= 1:
+            manuscript_pdf = pdf_paths[0]
+            log.info(f"Extracting manuscript pages as images from: {manuscript_pdf.name}")
+
+            doc = _open_doc(manuscript_pdf)
+            try:
+                # Extract up to 10 pages as images
+                for page_num in range(min(10, len(doc))):
+                    page = doc[page_num]
+                    # Render page as image
+                    mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat)
+                    img_bytes = pix.tobytes("png")
+                    manuscript_images.append(img_bytes)
+                    log.debug(f"Extracted page {page_num + 1} as image ({len(img_bytes)} bytes)")
+            finally:
+                doc.close()
+
+            log.info(f"Extracted {len(manuscript_images)} manuscript pages as images")
 
     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
@@ -990,15 +1008,80 @@ def identify_evolution_locations(
         campaign_specific=campaign_specific,
         campaign_field=campaign_field,
         campaign_example=campaign_example
-    ) + "\n\nTEXT:\n" + combined_text
+    )
+
     locs: List[dict] = []
     try:
-        locs = generate_json_with_retry(
-            model,
-            prompt,
-            debug_dir=debug_dir,
-            tag="locate",
-        )
+        if manuscript_images:
+            # Use vision API with manuscript images and SI text
+            log.info("Using vision API with %d manuscript page images and SI text", len(manuscript_images))
+
+            # Convert images to PIL format for Gemini
+            import PIL.Image
+            import io
+
+            pil_images = []
+            for img_bytes in manuscript_images:
+                image = PIL.Image.open(io.BytesIO(img_bytes))
+                pil_images.append(image)
+
+            # Build multimodal prompt with caption text AND manuscript images
+            multimodal_prompt = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
+
+            # Add manuscript page images
+            multimodal_prompt.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
+            multimodal_prompt.extend(pil_images)
+
+            # Save debug info if requested
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+
+                # Save prompt
+                prompt_file = debug_path / f"locate_vision_prompt_{int(time.time())}.txt"
+                _dump(f"=== VISION PROMPT FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nManuscript pages: {len(manuscript_images)}\nText length: {len(combined_text)} chars\n{'='*80}\n\n{prompt}\n\nTEXT (Captions and sections):\n{combined_text[:2000]}...(truncated)\n\n[{len(manuscript_images)} manuscript page images]",
+                      prompt_file)
+
+                # Save manuscript page samples
+                for i, img_bytes in enumerate(manuscript_images[:3]):  # Save first 3 pages
+                    img_file = debug_path / f"locate_manuscript_page_{i+1}_{int(time.time())}.png"
+                    _dump(img_bytes, img_file)
+
+            # Generate content with vision
+            response = model.generate_content(multimodal_prompt)
+            raw = response.text
+
+            # Parse JSON from response
+            try:
+                # Save raw response if debug enabled
+                if debug_dir:
+                    response_file = Path(debug_dir) / f"locate_vision_response_{int(time.time())}.txt"
+                    _dump(f"=== VISION RESPONSE FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}", response_file)
+
+                # Try to parse JSON
+                try:
+                    locs = json.loads(raw)
+                except json.JSONDecodeError:
+                    # Try to extract JSON from response
+                    json_match = re.search(r'\[.*\]', raw, re.DOTALL)
+                    if json_match:
+                        locs = json.loads(json_match.group(0))
+                    else:
+                        log.warning("Could not parse JSON from vision response")
+                        locs = []
+            except Exception as e:
+                log.warning(f"Error parsing vision response: {e}")
+                locs = []
+
+        else:
+            # Fall back to text-only mode
+            prompt += "\n\nTEXT:\n" + combined_text
+            locs = generate_json_with_retry(
+                model,
+                prompt,
+                debug_dir=debug_dir,
+                tag="locate",
+            )
     except Exception as exc:  # pragma: no cover
         log.warning("identify_evolution_locations(): %s", exc)
 
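The vision path above reduces to handing generate_content a single list that mixes the locate prompt, the caption/SI text, and PIL images of the rendered manuscript pages, then parsing the JSON array in the reply. A minimal sketch, assuming google-generativeai and Pillow are installed and genai.configure(api_key=...) has already been called; the model name below is a placeholder, not necessarily the one the package configures:

    import io
    import json
    import google.generativeai as genai
    from PIL import Image

    def locate_with_vision(prompt: str, combined_text: str, page_pngs: list) -> list:
        """Send the locate prompt plus rendered manuscript pages to Gemini and parse the JSON it returns."""
        model = genai.GenerativeModel("gemini-1.5-flash")  # placeholder model name
        parts = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
        parts.extend(Image.open(io.BytesIO(png)) for png in page_pngs)  # images become extra prompt parts
        response = model.generate_content(parts)
        try:
            return json.loads(response.text)
        except json.JSONDecodeError:
            return []  # caller falls back to the text-only path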
@@ -1299,7 +1382,7 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
 
     return False
 
-def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
+def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 50000, validate_sequences: bool = False) -> str:
     """Extract text around identified locations."""
     if not locations:
         return text
@@ -1788,50 +1871,55 @@ def get_lineage(
         for loc in locations:
             log.info(f" - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
 
-        # Try to extract from the best location
+        # Sort locations by confidence and use the highest confidence one
+        locations_sorted = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+        log.info(f"Using highest confidence location: {locations_sorted[0]['location']} (confidence: {locations_sorted[0]['confidence']})")
+
+        # Use the highest confidence location as primary location
+        primary_location = locations_sorted[0]
+
+        # Extract location details
+        location_str = primary_location.get('location', '')
+        location_type = primary_location.get('type', '')
+        confidence = primary_location.get('confidence', 0)
+        caption_text = primary_location.get('caption', '')
+
+        # Initialize extracted variants list
         extracted_variants = []
-        for location in locations:
-            if extracted_variants:
-                break  # Already got variants
-
-            location_str = location.get('location', '')
-            location_type = location.get('type', '')
-            confidence = location.get('confidence', 0)
+
+        # Try figure extraction for high-confidence figures
+        if location_type == 'figure' and confidence >= 70 and pdf_paths:
+            log.info(f"Attempting to extract figure: {location_str}")
 
-            # Try figure extraction for high-confidence figures
-            if location_type == 'figure' and confidence >= 70 and pdf_paths:
-                log.info(f"Attempting to extract figure: {location_str}")
-
-                figure_bytes = None
-                for pdf_path in pdf_paths:
-                    figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
-                    if figure_bytes:
-                        log.info(f"Successfully extracted figure from {pdf_path.name}")
-                        break
-
+            figure_bytes = None
+            for pdf_path in pdf_paths:
+                figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text=caption_text)
                 if figure_bytes:
-                    # Save figure if debug enabled
-                    if debug_dir:
-                        debug_path = Path(debug_dir)
-                        debug_path.mkdir(parents=True, exist_ok=True)
-                        figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
-                        _dump(figure_bytes, figure_file)
-                        log.info(f"Saved figure to: {figure_file}")
-
-                    # Extract lineage from figure
-                    variants = extract_lineage_from_figure(
-                        figure_bytes, model,
-                        debug_dir=debug_dir,
-                        campaign_id=campaign.campaign_id,
-                        campaign_info=campaign
-                    )
-                    if variants:
-                        log.info(f"Extracted {len(variants)} variants from figure")
-                        extracted_variants = variants
-                        continue
+                    log.info(f"Successfully extracted figure from {pdf_path.name}")
+                    break
 
-            # Try table/text extraction
-            if location_type in ['table', 'text', 'section'] and not extracted_variants:
+            if figure_bytes:
+                # Save figure if debug enabled
+                if debug_dir:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                    _dump(figure_bytes, figure_file)
+                    log.info(f"Saved figure to: {figure_file}")
+
+                # Extract lineage from figure
+                variants = extract_lineage_from_figure(
+                    figure_bytes, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign
+                )
+                if variants:
+                    log.info(f"Extracted {len(variants)} variants from figure")
+                    extracted_variants = variants
+
+        # Try table/text extraction if no figure extraction or if not a figure
+        if not extracted_variants and location_type in ['table', 'text', 'section']:
             log.info(f"Attempting text extraction for {location_type}: {location_str}")
 
             # Determine which text to use based on source
@@ -2074,8 +2162,9 @@ def get_lineage(
 
         # Try to extract the figure from available PDFs
         figure_bytes = None
+        # Note: This fallback path doesn't have the caption text
         for pdf_path in pdf_paths:
-            figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+            figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text="")
             if figure_bytes:
                 log.info("Successfully extracted figure from %s", pdf_path.name)
                 break
@@ -2114,7 +2203,7 @@ def get_lineage(
         # Use text-based extraction (works for tables and text sections)
         # Extract from full text, not caption text - use only primary location
         # Use more context for tables since they often span multiple pages
-        context_size = 15000 if location_type == 'table' else 5000
+        context_size = 75000 if location_type == 'table' else 50000
         focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
         log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                  len(full_text), len(focused_text),
@@ -2377,7 +2466,7 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
             # Fallback to text search if page extraction didn't work
             if not sample_text:
                 sample_text = _extract_text_at_locations(
-                    text, [location], context_chars=2000, validate_sequences=False
+                    text, [location], context_chars=20000, validate_sequences=False
                 )
 
             samples.append({
@@ -2419,29 +2508,25 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
 
 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract ALL enzyme variant sequences from the text.
-
-Rules:
-1. Use EXACT variant IDs as they appear with each sequence
-2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
-3. For each variant:
-   - If amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
-   - If ONLY DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
-   - NEVER include both aa_seq and dna_seq for the same variant
-   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
-4. Return ONLY minified JSON, no markdown or commentary
-
-CRITICAL SEQUENCE PRIORITY RULE:
-- If you find BOTH amino acid sequence AND DNA sequence for the same variant, ONLY return the amino acid sequence
-- Set dna_seq to null when aa_seq is available, even if DNA sequence is present in the text
-- Only return dna_seq when NO amino acid sequence exists for that variant
-
-CRITICAL ACCURACY REQUIREMENTS:
-- Extract ONLY sequences that are explicitly present in the provided text
-- DO NOT generate, infer, or hallucinate any sequences
-- Every character in the sequence must be directly copied from the text
-- If a sequence appears truncated or incomplete in the text, extract only what is shown
-- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+Extract ALL enzyme variant sequences from the text. Copy sequences EXACTLY as they appear - character by character.
+
+KEY RULES:
+1. EXHAUSTIVE SEARCH: If a variant appears multiple times, check ALL occurrences and extract the LONGEST sequence
+2. MULTI-PAGE: Sequences span pages. Skip page numbers (66, 67, etc.) that interrupt sequences
+3. MERGE IF NEEDED: If sequence continues after page break, combine the parts
+4. NO MODIFICATIONS: Copy exactly - no edits or improvements
+
+IMPORTANT: The same variant may appear multiple times with different sequence lengths. Always use the longest one.
+
+SEQUENCE PRIORITY:
+- If BOTH amino acid AND DNA exist → use amino acid ONLY
+- For DNA: If mixed case, extract UPPERCASE only (lowercase=backbone)
+- Return minified JSON only
+
+ACCURACY:
+- Extract ONLY what's written
+- Never hallucinate
+- Check entire document - complete sequences often appear later
 
 Schema: {schema}
 
@@ -2535,9 +2620,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 3  # Reduced from 6 to 3 for performance
+    max_attempts = 5  # 5 attempts for better consensus
 
-    # Try 3 times with early match detection
+    # Try 5 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2652,28 +2737,39 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         if isinstance(resp, list):
             for seq in resp:
                 if isinstance(seq, dict) and "variant_id" in seq:
-                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    # Create a key for this sequence (variant_id + cleaned sequence)
                     variant_id = seq.get("variant_id", "")
                     aa_seq = seq.get("aa_seq", "")
+                    dna_seq = seq.get("dna_seq", "")
+
+                    # Clean sequences for comparison
                     if aa_seq:
                         aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
-                        key = f"{variant_id}|{aa_seq}"
+                    if dna_seq:
+                        dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+
+                    # Use whichever sequence is present for the key
+                    seq_for_key = aa_seq if aa_seq else (dna_seq if dna_seq else "")
+                    key = f"{variant_id}|{seq_for_key}"
 
                     if key not in sequence_counts:
                         sequence_counts[key] = {"count": 0, "data": seq}
                     sequence_counts[key]["count"] += 1
 
-    # Build result with sequences that appear in at least 3 attempts
+    # Build result with sequences that appear in at least 2 attempts
+    # Sort by count (descending) to prioritize sequences with higher consensus
     result = []
-    for key, info in sequence_counts.items():
-        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+    sorted_sequences = sorted(sequence_counts.items(), key=lambda x: x[1]["count"], reverse=True)
+
+    for key, info in sorted_sequences:
+        if info["count"] >= 2:  # Appears in at least 2/5 attempts
             seq_data = info["data"].copy()
             seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
             result.append(seq_data)
             log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
 
     if result:
-        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        log.info(f"Extracted {len(result)} sequences with at least 2/{max_attempts} consensus")
         return result
 
     # If no sequences appear twice, return the most complete attempt
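The consensus change above is a plain majority-style vote across extraction attempts: key each result by variant ID plus cleaned sequence, count how many attempts produced that exact key, and keep anything seen at least twice, highest count first. A minimal sketch of that tally (a hypothetical helper, not the package's function):

    from collections import Counter

    def consensus_sequences(attempts, min_votes=2):
        """Keep sequence records that at least min_votes attempts agree on, highest count first."""
        counts = Counter()
        first_seen = {}
        for attempt in attempts:  # attempts: list of lists of sequence dicts
            for seq in attempt:
                cleaned = (seq.get("aa_seq") or seq.get("dna_seq") or "").replace(" ", "").upper()
                key = f"{seq.get('variant_id', '')}|{cleaned}"
                counts[key] += 1
                first_seen.setdefault(key, seq)
        return [first_seen[key] for key, n in counts.most_common() if n >= min_votes]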
@@ -2769,11 +2865,30 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         if aa and len(aa) <= 50:
             log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
             aa = None
-        if dna and len(dna) <= 150:
-            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
-            dna = None
+
+        # Validate DNA sequences
+        if dna:
+            if len(dna) <= 150:
+                log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+                dna = None
+            # Check if DNA sequence length is divisible by 3
+            elif len(dna) % 3 != 0:
+                log.warning(f"Skipping DNA sequence for {vid}: length {len(dna)} not divisible by 3")
+                dna = None
+            else:
+                # Check for stop codons in the middle of the sequence
+                stop_codons = {'TAA', 'TAG', 'TGA'}
+                has_internal_stop = False
+                for i in range(0, len(dna) - 3, 3):
+                    codon = dna[i:i+3]
+                    if codon in stop_codons:
+                        log.warning(f"Skipping DNA sequence for {vid}: internal stop codon {codon} at position {i}")
+                        has_internal_stop = True
+                        break
+                if has_internal_stop:
+                    dna = None
 
-        # Skip if both sequences are too short or missing
+        # Skip if both sequences are invalid or missing
        if not aa and not dna:
            continue
 
@@ -3015,7 +3130,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             log.info("Page extraction did not return text, falling back to text search")
             focused_text = _extract_text_at_locations(
                 text, [best_location],
-                context_chars=max(min_length, 30000),
+                context_chars=max(min_length, 50000),
                 validate_sequences=True
             )