debase 0.1.11__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -589,17 +589,28 @@ TEXT:
  {text}
  """.strip()

- _CAMPAIGN_MAPPING_PROMPT = """
- Given these identified campaigns and the lineage data location, determine which campaign this data belongs to:
+ _CAMPAIGN_BEST_LOCATION_PROMPT = """
+ Given this specific campaign and the available data locations, select the BEST location to extract the complete lineage data for this campaign.

- Campaigns:
- {campaigns}
+ Campaign:
+ - ID: {campaign_id}
+ - Name: {campaign_name}
+ - Description: {description}
+ - Lineage identifiers: {identifiers}

- Data location: {location}
- Caption/context: {context}
+ Available locations with context:
+ {locations_with_context}

- Based on the caption, enzyme names, or reaction details, which campaign does this data belong to?
- Return ONLY the campaign_id as a string.
+ Select the location that most likely contains the COMPLETE lineage data (all variants, mutations, and parent relationships) for THIS SPECIFIC campaign.
+
+ Consider:
+ 1. Tables are usually more structured and complete than figures
+ 2. Look for locations that mention this campaign's specific identifiers or enzyme names
+ 3. Some locations may contain data for multiple campaigns - that's fine, we can filter later
+ 4. Prioritize completeness over visual clarity
+
+ Return a JSON object with:
+ {{"location": "selected location identifier", "confidence": 0-100, "reason": "explanation"}}
  """.strip()

  # ---- 6.1 Prompt templates -------------------------------------------------
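Note: the doubled braces in the new template are how str.format() emits literal JSON braces, so the model is shown the exact reply shape it should produce. A minimal standalone sketch (the template string here is a stand-in, not the module's):

    import json

    # Stand-in template: {campaign_id} is substituted, {{...}} survives as literal braces
    TEMPLATE = 'Campaign: {campaign_id}\nReturn a JSON object with:\n{{"location": "...", "confidence": 0-100, "reason": "..."}}'
    prompt = TEMPLATE.format(campaign_id="pgb_evolution")  # hypothetical campaign ID

    # A well-formed reply then parses directly:
    reply = '{"location": "Table S4", "confidence": 85, "reason": "lists all variants"}'
    data = json.loads(reply)
    assert 0 <= data["confidence"] <= 100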
@@ -756,9 +767,43 @@ def identify_evolution_locations(
      max_results: int = 5,
      debug_dir: str | Path | None = None,
      campaigns: Optional[List[Campaign]] = None,
+     pdf_paths: Optional[List[Path]] = None,
  ) -> List[dict]:
      """Ask Gemini where in the paper the lineage is probably described."""
-     prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
+     # Extract table of contents from PDFs if available
+     toc_text = ""
+     if pdf_paths:
+         toc_sections = []
+         for pdf_path in pdf_paths:
+             # Extract first few pages looking for TOC
+             doc = _open_doc(pdf_path)
+             try:
+                 for page_num in range(min(5, len(doc))):
+                     page_text = doc[page_num].get_text()
+                     if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                         # Found TOC page
+                         lines = page_text.split('\n')
+                         toc_lines = []
+                         for line in lines:
+                             line = line.strip()
+                             # TOC entries typically have page numbers
+                             if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                 re.search(r'\s{2,}S?\d+\s*$', line) or
+                                 re.match(r'^\d+\.\s+\w+', line)):
+                                 toc_lines.append(line)
+                         if toc_lines:
+                             pdf_name = pdf_path.name
+                             toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                             break
+             finally:
+                 doc.close()
+
+         if toc_sections:
+             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+     # Include TOC before the main text
+     combined_text = toc_text + text if toc_text else text
+     prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
      locs: List[dict] = []
      try:
          locs = generate_json_with_retry(
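Note: the three regexes above cover the common TOC line shapes — dot leaders, wide spacing before a trailing (possibly S-prefixed) page number, and numbered section headings. A standalone check with made-up lines:

    import re

    samples = [
        "Table S1. Primer sequences......S3",  # dot leader + SI page number
        "General procedures      S2",          # two or more spaces before the page number
        "3. Results and discussion",           # numbered section heading
        "This sentence is ordinary prose.",    # should not match
    ]
    for line in samples:
        is_toc = bool(re.search(r'\.{2,}\s*S?\d+\s*$', line)
                      or re.search(r'\s{2,}S?\d+\s*$', line)
                      or re.match(r'^\d+\.\s+\w+', line))
        print(f"{is_toc!s:5} {line}")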
@@ -770,69 +815,7 @@ def identify_evolution_locations(
      except Exception as exc: # pragma: no cover
          log.warning("identify_evolution_locations(): %s", exc)

-     # If we have campaigns, try to map locations to campaigns
-     if campaigns and locs:
-         for loc in locs:
-             # Extract more context around the location
-             location_str = loc.get('location', '')
-             context = loc.get('reason', '')
-
-             # Ask Gemini to map this location to a campaign
-             if campaigns:
-                 try:
-                     campaigns_json = json.dumps([{
-                         "campaign_id": c.campaign_id,
-                         "campaign_name": c.campaign_name,
-                         "lineage_hint": c.notes
-                     } for c in campaigns])
-
-                     mapping_prompt = _CAMPAIGN_MAPPING_PROMPT.format(
-                         campaigns=campaigns_json,
-                         location=location_str,
-                         context=context
-                     )
-
-                     # Save mapping prompt to debug if provided
-                     if debug_dir:
-                         debug_path = Path(debug_dir)
-                         debug_path.mkdir(parents=True, exist_ok=True)
-                         mapping_file = debug_path / f"campaign_mapping_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                         _dump(f"=== CAMPAIGN MAPPING PROMPT ===\nLocation: {location_str}\n{'='*80}\n\n{mapping_prompt}", mapping_file)
-
-                     response = model.generate_content(mapping_prompt)
-                     response_text = _extract_text(response).strip()
-
-                     # Extract just the campaign_id from the response
-                     # Look for the campaign_id pattern in the response
-                     campaign_id = None
-                     for campaign in campaigns:
-                         if hasattr(campaign, 'campaign_id') and campaign.campaign_id in response_text:
-                             campaign_id = campaign.campaign_id
-                             break
-
-                     # If not found, try to extract the last line or quoted string
-                     if not campaign_id:
-                         # Try to find quoted string
-                         quoted_match = re.search(r'"([^"]+)"', response_text)
-                         if quoted_match:
-                             campaign_id = quoted_match.group(1)
-                         else:
-                             # Take the last non-empty line
-                             lines = [line.strip() for line in response_text.split('\n') if line.strip()]
-                             if lines:
-                                 campaign_id = lines[-1].strip('"')
-
-                     # Save mapping response to debug if provided
-                     if debug_dir:
-                         response_file = debug_path / f"campaign_mapping_response_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
-                         _dump(f"=== CAMPAIGN MAPPING RESPONSE ===\nLocation: {location_str}\nFull response:\n{response_text}\nExtracted campaign_id: {campaign_id}\n{'='*80}", response_file)
-
-                     # Add campaign_id to location
-                     if campaign_id:
-                         loc['campaign_id'] = campaign_id
-                         log.info(f"Mapped {location_str} to campaign: {campaign_id}")
-                 except Exception as exc:
-                     log.warning(f"Failed to map location to campaign: {exc}")
+     # No longer mapping locations to campaigns here - we'll ask for best location per campaign instead

      return locs if isinstance(locs, list) else []

@@ -878,6 +861,7 @@ def extract_complete_lineage(
      debug_dir: str | Path | None = None,
      campaign_id: Optional[str] = None,
      campaign_info: Optional[Campaign] = None,
+     pdf_paths: Optional[List[Path]] = None,
  ) -> List[Variant]:
      """Prompt Gemini for the full lineage and return a list[Variant]."""
      # Build campaign context
@@ -899,10 +883,44 @@ IMPORTANT:
  4. Include parent variants only if they are direct ancestors in this campaign's lineage.
  """

+     # Extract table of contents from PDFs if available
+     toc_text = ""
+     if pdf_paths:
+         toc_sections = []
+         for pdf_path in pdf_paths:
+             # Extract first few pages looking for TOC
+             doc = _open_doc(pdf_path)
+             try:
+                 for page_num in range(min(5, len(doc))):
+                     page_text = doc[page_num].get_text()
+                     if any(indicator in page_text.lower() for indicator in ['table of contents', 'contents', 'summary']):
+                         # Found TOC page
+                         lines = page_text.split('\n')
+                         toc_lines = []
+                         for line in lines:
+                             line = line.strip()
+                             # TOC entries typically have page numbers
+                             if (re.search(r'\.{2,}\s*S?\d+\s*$', line) or
+                                 re.search(r'\s{2,}S?\d+\s*$', line) or
+                                 re.match(r'^\d+\.\s+\w+', line)):
+                                 toc_lines.append(line)
+                         if toc_lines:
+                             pdf_name = pdf_path.name
+                             toc_sections.append(f"\n--- Table of Contents from {pdf_name} ---\n" + '\n'.join(toc_lines))
+                             break
+             finally:
+                 doc.close()
+
+         if toc_sections:
+             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+     # Include TOC in the prompt text
+     combined_text = toc_text + text if toc_text else text
+
      prompt = _LINEAGE_EXTRACT_PROMPT.format(
          campaign_context=campaign_context,
          schema=_LINEAGE_SCHEMA_HINT,
-         text=text[:MAX_CHARS],
+         text=combined_text[:MAX_CHARS],
      )
      raw = generate_json_with_retry(
          model,
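Note: this TOC-scanning block is now duplicated verbatim in identify_evolution_locations and extract_complete_lineage; a shared helper would keep the two in sync. A sketch under the assumption that _open_doc (the module's own PDF opener) returns a PyMuPDF-style document; the helper name is hypothetical:

    import re
    from pathlib import Path
    from typing import List

    def _extract_toc_text(pdf_paths: List[Path], max_pages: int = 5) -> str:
        """Hypothetical shared helper: collect TOC-looking lines from the
        first pages of each PDF, mirroring the duplicated block above."""
        toc_sections = []
        for pdf_path in pdf_paths:
            doc = _open_doc(pdf_path)  # assumed: the module's PyMuPDF opener
            try:
                for page_num in range(min(max_pages, len(doc))):
                    page_text = doc[page_num].get_text()
                    if any(ind in page_text.lower() for ind in ('table of contents', 'contents', 'summary')):
                        toc_lines = [
                            ln.strip() for ln in page_text.split('\n')
                            if re.search(r'\.{2,}\s*S?\d+\s*$', ln.strip())
                            or re.search(r'\s{2,}S?\d+\s*$', ln.strip())
                            or re.match(r'^\d+\.\s+\w+', ln.strip())
                        ]
                        if toc_lines:
                            toc_sections.append(f"\n--- Table of Contents from {pdf_path.name} ---\n" + '\n'.join(toc_lines))
                            break
            finally:
                doc.close()
        return "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n" if toc_sections else ""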
@@ -1044,15 +1062,27 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
      # 1. Line contains dots (...) followed by page number
      # 2. Line ends with just a page number
      # 3. Line has "Table S12:" or similar followed by title and page
-     if '...' in line or re.search(r'\.\s*\d+\s*$', line) or re.search(r':\s*[^:]+\s+\d+\s*$', line):
+     # 4. Pattern appears at start of line followed by description and page number
+     if ('...' in line or
+         re.search(r'\.\s*\d+\s*$', line) or
+         re.search(r':\s*[^:]+\s+\d+\s*$', line) or
+         (line.strip().startswith(pattern) and re.search(r'\s+\d+\s*$', line))):
          return True

      # Check if this is in a contents/TOC section
-     # Look backwards up to 500 chars for "Contents" or "Table of Contents"
-     context_start = max(0, position - 500)
+     # Look backwards up to 1000 chars for "Contents" or "Table of Contents"
+     context_start = max(0, position - 1000)
      context = text[context_start:position].lower()
      if 'contents' in context or 'table of contents' in context:
          return True
+
+     # Check if we're in the first ~5000 chars of the document (likely TOC area)
+     # This helps catch TOC entries that don't have obvious formatting
+     if position < 5000:
+         # Be more strict for early document positions
+         # Check if line looks like a TOC entry (has page number at end)
+         if re.search(r'\s+\d+\s*$', line):
+             return True

      return False

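Note: the new position-based guard is deliberately aggressive — within the first ~5000 characters, a trailing bare number alone now classifies a line as a TOC entry. Isolated for illustration:

    import re

    def looks_like_early_toc(line: str, position: int) -> bool:
        # Sketch of the added guard: early in the document, a trailing
        # page number is enough to flag the line as a TOC entry.
        return position < 5000 and bool(re.search(r'\s+\d+\s*$', line))

    print(looks_like_early_toc("Table S3. Evolution lineage 12", 1800))   # True: early, trailing number
    print(looks_like_early_toc("Table S3. Evolution lineage 12", 60000))  # False: same line, deep in the text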
@@ -1185,13 +1215,39 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
                  log.warning("No sequences found in any of %d occurrences of '%s'",
                              len(all_positions), location_str)
          else:
-             # For lineage extraction, use the original logic
-             start = max(0, pos - context_chars)
-             end = min(len(text), pos + len(used_pattern) + context_chars)
-             section_text = text[start:end]
-             extracted_sections.append(section_text)
-             log.info("Found '%s' using pattern '%s' at position %d, extracted %d chars",
-                      location_str, used_pattern, pos, len(section_text))
+             # For lineage extraction, find ALL occurrences of the pattern
+             all_positions = []
+             search_pos = 0
+
+             # Find all occurrences of this pattern (not just the first)
+             while search_pos < len(text_lower):
+                 temp_pos = text_lower.find(used_pattern.lower(), search_pos)
+                 if temp_pos == -1:
+                     break
+
+                 # Check if this is a TOC entry
+                 if _is_toc_entry(text, temp_pos, used_pattern):
+                     log.debug("Skipping TOC entry for pattern '%s' at position %d", used_pattern, temp_pos)
+                     search_pos = temp_pos + len(used_pattern)
+                     continue
+
+                 all_positions.append(temp_pos)
+                 search_pos = temp_pos + len(used_pattern)
+
+                 if len(all_positions) >= 10: # Limit to 10 occurrences
+                     break
+
+             log.info("Found %d non-TOC occurrences of pattern '%s' for location '%s'",
+                      len(all_positions), used_pattern, location_str)
+
+             # Extract context around each occurrence
+             for idx, pos in enumerate(all_positions):
+                 start = max(0, pos - context_chars)
+                 end = min(len(text), pos + len(used_pattern) + context_chars)
+                 section_text = text[start:end]
+                 extracted_sections.append(section_text)
+                 log.info("Occurrence %d/%d: Found '%s' at position %d, extracted %d chars",
+                          idx + 1, len(all_positions), location_str, pos, len(section_text))
      else:
          log.warning("Location '%s' not found in text (tried %d patterns)", location_str, len(page_patterns))

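Note: the scan is a standard str.find() cursor walk capped at ten non-TOC hits; the same logic reads naturally as a generator. A sketch that assumes the module's _is_toc_entry is in scope:

    from typing import Iterator

    def iter_occurrences(text: str, pattern: str, limit: int = 10) -> Iterator[int]:
        """Yield start offsets of case-insensitive, non-TOC matches, capped at `limit`."""
        text_lower, needle = text.lower(), pattern.lower()
        pos, found = 0, 0
        while found < limit:
            pos = text_lower.find(needle, pos)
            if pos == -1:
                break
            if not _is_toc_entry(text, pos, pattern):  # module's own TOC filter
                yield pos
                found += 1
            pos += len(needle)  # advance past this hit either way

    # e.g. positions = list(iter_occurrences(full_text, "Table S4"))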
@@ -1229,41 +1285,113 @@ def get_lineage(
          log.info(f" - {camp.campaign_name}: {camp.description}")

      # Use captions for identification - they're concise and focused
-     locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=campaigns)
+     locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)

      all_variants = []

-     if locations:
+     if locations and campaigns:
          # Log location information
          location_summary = []
          for loc in locations[:5]:
              if isinstance(loc, dict):
-                 campaign_info = f", campaign: {loc.get('campaign_id', 'unknown')}" if 'campaign_id' in loc else ""
-                 location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)}{campaign_info})")
+                 location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
              else:
                  location_summary.append(str(loc))
          log.info("Gemini identified %d potential lineage locations: %s",
                   len(locations), ", ".join(location_summary))

-         # Group locations by campaign
-         locations_by_campaign = {}
+         # Extract context around each location for better decision making
+         locations_with_context = []
          for loc in locations:
-             campaign_id = loc.get('campaign_id', 'default') if isinstance(loc, dict) else 'default'
-             if campaign_id not in locations_by_campaign:
-                 locations_by_campaign[campaign_id] = []
-             locations_by_campaign[campaign_id].append(loc)
+             location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+             # Extract 1000 chars of context around the location
+             context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
+             locations_with_context.append({
+                 'location': loc,
+                 'context': context_text[:1000] # First 1000 chars of extracted context
+             })

-         # Process each campaign's locations
-         for campaign_id, campaign_locations in locations_by_campaign.items():
-             log.info(f"Processing campaign: {campaign_id}")
+         # For each campaign, ask Gemini to select the best location
+         for campaign in campaigns:
+             log.info(f"Processing campaign: {campaign.campaign_id}")

-             # Sort locations by confidence to get the highest confidence one
-             sorted_locations = sorted(campaign_locations,
-                                       key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                       reverse=True)
+             # Build locations context string
+             locations_str = ""
+             for i, loc_ctx in enumerate(locations_with_context):
+                 loc = loc_ctx['location']
+                 context = loc_ctx['context']
+                 location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                 location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
+                 confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
+                 reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+
+                 locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
+                 locations_str += f" Reason: {reason}\n"
+                 locations_str += f" Context (first 500 chars):\n {context[:500]}...\n"

-             # Use only the highest confidence location to avoid duplicates
-             primary_location = sorted_locations[0] if sorted_locations else None
+             # Ask Gemini to select best location for this campaign
+             best_location_prompt = _CAMPAIGN_BEST_LOCATION_PROMPT.format(
+                 campaign_id=campaign.campaign_id,
+                 campaign_name=campaign.campaign_name,
+                 description=campaign.description,
+                 identifiers=campaign.notes or "No specific identifiers provided",
+                 locations_with_context=locations_str
+             )
+
+             primary_location = None
+             try:
+                 # Save prompt to debug if provided
+                 if debug_dir:
+                     debug_path = Path(debug_dir)
+                     debug_path.mkdir(parents=True, exist_ok=True)
+                     prompt_file = debug_path / f"best_location_{campaign.campaign_id}_{int(time.time())}.txt"
+                     _dump(f"=== BEST LOCATION PROMPT ===\nCampaign: {campaign.campaign_id}\n{'='*80}\n\n{best_location_prompt}", prompt_file)
+
+                 response = model.generate_content(best_location_prompt)
+                 response_text = _extract_text(response).strip()
+
+                 # Parse JSON response
+                 if response_text.startswith("```"):
+                     response_text = response_text.split("```")[1].strip()
+                     if response_text.startswith("json"):
+                         response_text = response_text[4:].strip()
+
+                 best_loc_data = json.loads(response_text)
+                 selected_location = best_loc_data.get('location', '')
+                 confidence = best_loc_data.get('confidence', 0)
+                 reason = best_loc_data.get('reason', '')
+
+                 # Save response to debug if provided
+                 if debug_dir:
+                     response_file = debug_path / f"best_location_response_{campaign.campaign_id}_{int(time.time())}.txt"
+                     _dump(f"=== BEST LOCATION RESPONSE ===\nCampaign: {campaign.campaign_id}\nSelected: {selected_location}\nConfidence: {confidence}\nReason: {reason}\n{'='*80}", response_file)
+
+                 log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")
+
+                 # Find the actual location object
+                 for loc in locations:
+                     loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
+                     if loc_str == selected_location:
+                         primary_location = loc
+                         break
+
+                 if not primary_location:
+                     log.warning(f"Could not find selected location '{selected_location}' in locations list")
+                     # Fall back to highest confidence location
+                     primary_location = sorted(locations,
+                                               key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                               reverse=True)[0] if locations else None
+
+             except Exception as e:
+                 log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
+                 # Fall back to highest confidence location
+                 primary_location = sorted(locations,
+                                           key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
+                                           reverse=True)[0] if locations else None
+
+             if not primary_location:
+                 log.warning(f"No location found for campaign {campaign.campaign_id}")
+                 continue

              # Track if we successfully extracted from figure
              extracted_from_figure = False
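Note: the fence-stripping-then-json.loads sequence is the same pattern this module already uses for variant matching; pulled out on its own it is only a few lines (a sketch, with a hypothetical helper name):

    import json

    def parse_json_reply(reply: str) -> dict:
        """Sketch of the fence-tolerant parsing used above: strip a Markdown
        code fence and an optional 'json' language tag, then parse."""
        text = reply.strip()
        if text.startswith("```"):
            text = text.split("```")[1].strip()  # keeps the fenced body, drops both fences
            if text.startswith("json"):
                text = text[4:].strip()
        return json.loads(text)

    print(parse_json_reply('```json\n{"location": "Table S4", "confidence": 90, "reason": "complete"}\n```'))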
@@ -1297,12 +1425,11 @@ def get_lineage(
                      log.info("Saved lineage figure to: %s", figure_file)

                  # Extract lineage from the figure
-                 campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
                  variants = extract_lineage_from_figure(
                      figure_bytes, model,
                      debug_dir=debug_dir,
-                     campaign_id=campaign_id,
-                     campaign_info=campaign_obj
+                     campaign_id=campaign.campaign_id,
+                     campaign_info=campaign
                  )
                  if variants:
                      all_variants.extend(variants)
@@ -1327,22 +1454,22 @@ def get_lineage(
                  log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                           len(full_text), len(focused_text),
                           primary_location.get('location', 'Unknown') if isinstance(primary_location, dict) else 'Unknown',
-                          campaign_id)
+                          campaign.campaign_id)

-             # Find the campaign object
-             campaign_obj = next((c for c in campaigns if c.campaign_id == campaign_id), None)
+             # Extract lineage for this campaign
              campaign_variants = extract_complete_lineage(
                  focused_text, model,
                  debug_dir=debug_dir,
-                 campaign_id=campaign_id,
-                 campaign_info=campaign_obj
+                 campaign_id=campaign.campaign_id,
+                 campaign_info=campaign,
+                 pdf_paths=pdf_paths
              )
              all_variants.extend(campaign_variants)

          return all_variants, campaigns
      else:
          log.info("Gemini did not identify specific lineage locations")
-         variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir)
+         variants = extract_complete_lineage(full_text, model, debug_dir=debug_dir, pdf_paths=pdf_paths)
          return variants, campaigns

  # === 7. SEQUENCE EXTRACTION === ----------------------------------------------
@@ -1398,18 +1525,31 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
      return []

  # --- 7.2 Page-based extraction helper ---------------------------------------
- def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int]) -> str:
-     """Extract text from a specific page number in the PDFs."""
+ def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
+     """Extract text from a specific page number in the PDFs.
+
+     Args:
+         pdf_paths: List of PDF paths
+         page_num: Page number (can be "S1", "S2", etc for SI pages)
+         skip_si_toc: If True, skip first 2 pages of SI to avoid TOC
+     """
      # Convert page number to int and handle S-prefix
      page_str = str(page_num).strip().upper()
      if page_str.startswith('S'):
          # Supplementary page - look in the SI PDF (second PDF)
          actual_page = int(page_str[1:]) - 1 # 0-indexed
          pdf_index = 1 if len(pdf_paths) > 1 else 0
+         is_si_page = True
      else:
          # Regular page - look in the main PDF
          actual_page = int(page_str) - 1 # 0-indexed
          pdf_index = 0
+         is_si_page = False
+
+     # Skip first 2 pages of SI to avoid table of contents
+     if skip_si_toc and is_si_page and actual_page < 2:
+         log.info("Skipping SI page %s (first 2 pages are typically TOC)", page_str)
+         return ""

      if pdf_index >= len(pdf_paths):
          log.warning("Page %s requested but not enough PDFs provided", page_str)
@@ -1543,8 +1683,14 @@ IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
  - Only extract dna_seq if NO amino acid sequence is available for that variant
  - This reduces redundancy since protein sequences are usually more relevant

+ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
+ - Papers often use different naming conventions in different sections
+ - DO NOT normalize or simplify variant IDs
+ - Extract the variant_id exactly as written where the sequence appears
+ - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
+
  For each variant return:
- * variant_id - the label used in the paper (e.g. "R4-10")
+ * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
  * aa_seq - amino-acid sequence (uppercase), or null
  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)

@@ -1584,7 +1730,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
      return _parse_sequences(data)

  # --- 7.4 JSON -> dataclass helpers -------------------------------------------
- _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")
+ _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*") # Include * for stop codon
  _VALID_DNA = set("ACGT")

  def _contains_sequence(text: str, min_length: int = 50) -> bool:
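Note: with '*' admitted to _VALID_AA, validating a protein sequence stays a one-line set comparison. A minimal sketch:

    _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # as above: 20 amino acids plus stop

    def is_valid_aa_seq(seq: str) -> bool:
        """Sketch: accept uppercase protein sequences, including a stop codon '*'."""
        return bool(seq) and set(seq.upper()) <= _VALID_AA

    print(is_valid_aa_seq("MSTNPKPQRKTKRNTNRRPQDVK*"))  # True, trailing stop allowed
    print(is_valid_aa_seq("MST1NPK"))                   # False, digits rejected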
@@ -1974,43 +2120,53 @@ def _merge_lineage_and_sequences(
          }
          for s in seqs
      ])
+
+     # Log sequence data info
+     if len(df_seq) > 0:
+         seq_with_aa = (~df_seq['aa_seq'].isna()).sum()
+         seq_with_dna = (~df_seq['dna_seq'].isna()).sum()
+         log.info(f"Sequence data: {len(df_seq)} entries, {seq_with_aa} with aa_seq, {seq_with_dna} with dna_seq")

-     # 2. Outer merge keeps every lineage entry and adds sequence cols when present
+     # 2. First try direct merge
      df = pd.merge(df_lin, df_seq, on="variant_id", how="left")
-
-     # 2a. If we have unmatched sequences and a model, use Gemini to match them
-     log.info(f"Model available: {model is not None}, Sequences found: {len(df_seq)}")
-     if model and len(df_seq) > 0:
-         # Log initial state
-         log.info(f"Merge attempt: {len(df_lin)} lineage entries, {len(df_seq)} sequences")
-         log.info(f"Lineage IDs: {df_lin['variant_id'].tolist()[:5]}...")
-         log.info(f"Sequence IDs: {df_seq['variant_id'].tolist()[:5]}...")
-
-         # Find lineage entries without sequences
+
+     # Log merge results
+     merged_aa = (~df['aa_seq'].isna()).sum()
+     merged_dna = (~df['dna_seq'].isna()).sum()
+     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
+
+     # 3. If we have unmatched sequences and a model, use Gemini to match
+     if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
          missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
-         unmatched_lineage = df[missing_seq]['variant_id'].tolist()
+         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()

-         # Find sequences that weren't matched
+         # Find unmatched sequences
          matched_seq_ids = df[~missing_seq]['variant_id'].tolist()
          unmatched_seqs = df_seq[~df_seq['variant_id'].isin(matched_seq_ids)]

-         if len(unmatched_lineage) > 0 and len(unmatched_seqs) > 0:
-             log.info(f"Found {len(unmatched_lineage)} lineage entries without sequences and {len(unmatched_seqs)} unmatched sequences")
-             log.info(f"Using Gemini to match variants")
+         if unmatched_lineage_ids and len(unmatched_seqs) > 0:
+             log.info(f"Found {len(unmatched_lineage_ids)} lineage entries without sequences")
+             log.info(f"Found {len(unmatched_seqs)} unmatched sequences")
+             log.info("Using Gemini to match variants")

-             # Build prompt for Gemini to match variants
-             prompt = f"""Match enzyme variant IDs between two lists. The same variant may be labeled differently in different parts of the paper.
+             # Build prompt for Gemini
+             prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+
+ Papers often use different naming conventions for the same variant:
+ - Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
+ - Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
+
+ Match variants by analyzing generation numbers, prefixes, and patterns.

  Lineage variant IDs (need sequences):
- {json.dumps(unmatched_lineage)}
+ {json.dumps(unmatched_lineage_ids)}

  Sequence variant IDs (have sequences):
  {json.dumps(unmatched_seqs['variant_id'].tolist())}

- These lists contain variant identifiers from the same paper but may use different naming conventions.
- Match each lineage variant ID to its corresponding sequence variant ID based on any patterns or relationships you can identify.
-
- Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. Include only matches you are confident about.
+ Return ONLY a JSON object mapping lineage IDs to sequence IDs.
+ Format: {{"lineage_id": "sequence_id", ...}}
  """

              try:
@@ -2024,85 +2180,82 @@ Return ONLY a JSON object mapping lineage variant IDs to sequence variant IDs. I
                      text = text[4:].strip()

                  matches = json.loads(text)
-                 log.info(f"Gemini returned matches: {matches}")
+                 log.info(f"Gemini returned {len(matches)} matches")

-                 # Debug: Log what sequences we actually have
-                 log.info(f"Available sequence variant IDs: {unmatched_seqs['variant_id'].tolist()}")
+                 # Create a mapping of sequence IDs to their data for efficient lookup
+                 seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}

-                 # Apply the matches
+                 # Apply matches and update variant IDs
                  for lineage_id, seq_id in matches.items():
-                     if lineage_id in unmatched_lineage:
-                         # Find the sequence data - be flexible with matching
-                         seq_data = None
+                     if lineage_id in unmatched_lineage_ids and seq_id in seq_data_map:
+                         # Get the sequence data
+                         seq_data = seq_data_map[seq_id]

-                         # First try exact match
-                         seq_matches = unmatched_seqs[unmatched_seqs['variant_id'] == seq_id]
-                         if len(seq_matches) > 0:
-                             seq_data = seq_matches.iloc[0]
-                         else:
-                             # Try to find by checking various matching strategies
-                             for idx, row in unmatched_seqs.iterrows():
-                                 variant_id = row['variant_id']
-                                 # Check if one is contained in the other
-                                 if seq_id in variant_id or variant_id in seq_id:
-                                     seq_data = row
-                                     break
-                                 # Check if they share the same core identifier (e.g., G0, G1, etc.)
-                                 seq_id_parts = re.findall(r'G\d+(?:-\d+)?', seq_id)
-                                 variant_id_parts = re.findall(r'G\d+(?:-\d+)?', variant_id)
-                                 if seq_id_parts and variant_id_parts and seq_id_parts[0] == variant_id_parts[0]:
-                                     seq_data = row
-                                     break
-
-                         if seq_data is not None:
-                             # Update the dataframe
-                             mask = df['variant_id'] == lineage_id
-                             if mask.any():
-                                 # Log before update
-                                 log.debug(f"Before update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0]}")
-
-                                 df.loc[mask, 'aa_seq'] = seq_data['aa_seq']
-                                 df.loc[mask, 'dna_seq'] = seq_data['dna_seq']
-                                 df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
-                                 df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+                         # Update the row with the matched sequence ID and data
+                         mask = df['variant_id'] == lineage_id
+                         if mask.any():
+                             # Update variant_id to use the sequence variant name
+                             df.loc[mask, 'variant_id'] = seq_id
+
+                             # Update parent_id if it matches any of the mapped lineage IDs
+                             parent_mask = df['parent_id'] == lineage_id
+                             if parent_mask.any():
+                                 df.loc[parent_mask, 'parent_id'] = seq_id
+
+                             # Update sequence data
+                             # For pandas Series from iterrows(), use proper indexing
+                             aa_seq_val = seq_data['aa_seq'] if 'aa_seq' in seq_data else None
+                             dna_seq_val = seq_data['dna_seq'] if 'dna_seq' in seq_data else None
+
+                             # Always update sequence fields to preserve DNA even when aa_seq is null
+                             df.loc[mask, 'aa_seq'] = aa_seq_val
+                             df.loc[mask, 'dna_seq'] = dna_seq_val

-                                 # Log after update
-                                 log.debug(f"After update: df row for {lineage_id} has aa_seq={df.loc[mask, 'aa_seq'].iloc[0][:50] if df.loc[mask, 'aa_seq'].iloc[0] else 'None'}")
-                                 log.info(f"Matched {lineage_id} -> {seq_id} using Gemini (populated sequence of length {len(seq_data['aa_seq']) if seq_data['aa_seq'] else 0})")
-                             else:
-                                 log.warning(f"No rows found in dataframe for lineage_id {lineage_id}")
-                         else:
-                             log.warning(f"Could not find sequence data for {seq_id} in unmatched sequences")
+                             df.loc[mask, 'seq_confidence'] = seq_data.get('seq_confidence', None)
+                             df.loc[mask, 'truncated'] = seq_data.get('truncated', False)
+
+                             # Log sequence info - check both aa_seq and dna_seq
+                             aa_len = len(seq_data['aa_seq']) if pd.notna(seq_data.get('aa_seq')) and seq_data.get('aa_seq') else 0
+                             dna_len = len(seq_data['dna_seq']) if pd.notna(seq_data.get('dna_seq')) and seq_data.get('dna_seq') else 0
+                             log.info(f"Matched {lineage_id} -> {seq_id} (aa_seq: {aa_len} chars, dna_seq: {dna_len} chars)")
+
+                 # Update any remaining parent_id references to matched variants
+                 for lineage_id, seq_id in matches.items():
+                     parent_mask = df['parent_id'] == lineage_id
+                     if parent_mask.any():
+                         df.loc[parent_mask, 'parent_id'] = seq_id

-                 # Log the final state after all matches
-                 matched_count = (~df['aa_seq'].isna()).sum()
-                 log.info(f"After Gemini matching: {matched_count}/{len(df)} variants have sequences")
+                 # Log final state - count variants with any sequence (aa or dna)
+                 aa_count = (~df['aa_seq'].isna()).sum()
+                 dna_count = (~df['dna_seq'].isna()).sum()
+                 any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+                 log.info(f"After Gemini matching: {any_seq_count}/{len(df)} variants have sequences (aa: {aa_count}, dna: {dna_count})")

              except Exception as e:
                  log.warning(f"Failed to match variants using Gemini: {e}")

-     # 3. If generation missing after user input, try inference
+     # 4. If generation missing, try inference
      if df["generation"].isna().any():
-         _infer_generations(lineage) # mutates in place
-         df = df.drop(columns=["generation"]).merge(
-             pd.DataFrame(
-                 {"variant_id": [v.variant_id for v in lineage], "generation": [v.generation for v in lineage]}
-             ),
-             on="variant_id",
-             how="left",
-         )
-
-     # 4. Attach DOI column for provenance
+         _infer_generations(lineage)
+         # Need to update the generations based on the potentially updated variant IDs
+         gen_map = {v.variant_id: v.generation for v in lineage}
+         # Also create a map for any variant IDs that were replaced
+         for idx, row in df.iterrows():
+             variant_id = row['variant_id']
+             if variant_id in gen_map:
+                 df.at[idx, 'generation'] = gen_map[variant_id]
+
+     # 5. Attach DOI column
      df["doi"] = doi

-     # 5. Sort rows: primary by generation, then by variant_id
+     # 6. Sort by generation, then variant_id
      df = df.sort_values(["generation", "variant_id"], kind="mergesort")

-     # Debug: Log final merge state
-     seq_count = (~df['aa_seq'].isna()).sum()
-     log.info(f"_merge_lineage_and_sequences returning: {len(df)} variants, {seq_count} with sequences")
-     if seq_count > 0:
-         log.info(f"Sample variant with sequence: {df[~df['aa_seq'].isna()].iloc[0]['variant_id']}")
+     # 7. Log final state
+     aa_count = (~df['aa_seq'].isna()).sum()
+     dna_count = (~df['dna_seq'].isna()).sum()
+     any_seq_count = (~(df['aa_seq'].isna() & df['dna_seq'].isna())).sum()
+     log.info(f"Final result: {len(df)} variants, {any_seq_count} with sequences (aa: {aa_count}, dna: {dna_count})")

      return df

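Note: because matched rows are renamed to the sequence-section IDs, parent_id references must be rewritten as well, or the lineage graph would point at stale names. A toy pandas illustration (all IDs invented):

    import pandas as pd

    # Toy frame: lineage uses numeric IDs; the model matched them to descriptive names.
    df = pd.DataFrame({
        "variant_id": ["5295", "5308"],
        "parent_id":  [None, "5295"],
    })
    matches = {"5295": "ApPgb-G0", "5308": "ApPgb-G1"}  # hypothetical mapping

    for lineage_id, seq_id in matches.items():
        df.loc[df["variant_id"] == lineage_id, "variant_id"] = seq_id
        df.loc[df["parent_id"] == lineage_id, "parent_id"] = seq_id  # keep edges consistent

    print(df)  # parent of ApPgb-G1 is now ApPgb-G0, not the stale "5295"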
@@ -2114,28 +2267,27 @@ def merge_and_score(
      doi: Optional[str] = None,
      model=None,
  ) -> pd.DataFrame:
-     """User-facing helper imported by the pipeline orchestrator.
-
-     * Ensures lineage + sequence lists are non-empty.
-     * Performs a shallow validation.
-     * Returns a ready-to-export pandas DataFrame.
+     """Merge lineage and sequence data into a single DataFrame.
+
+     Args:
+         lineage: List of Variant objects from lineage extraction
+         seqs: List of SequenceBlock objects from sequence extraction
+         doi: DOI of the paper for provenance
+         model: Gemini model for smart matching (optional)
+
+     Returns:
+         DataFrame with merged lineage and sequence data
      """
-
      if not lineage:
          raise ValueError("merge_and_score(): `lineage` list is empty; nothing to merge")

-     # If no sequences found, still build a DataFrame so caller can decide what to do.
      df = _merge_lineage_and_sequences(lineage, seqs, doi, model)

-     # Basic sanity: warn if many missing sequences
+     # Warn if many sequences are missing
      missing_rate = df["aa_seq"].isna().mean() if "aa_seq" in df else 1.0
      if missing_rate > 0.5:
          log.warning(">50%% of variants lack sequences (%d / %d)", df["aa_seq"].isna().sum(), len(df))

-     # Debug log before returning
-     seq_count = (~df['aa_seq'].isna()).sum() if 'aa_seq' in df else 0
-     log.info(f"merge_and_score returning: {len(df)} variants, {seq_count} with sequences")
-
      return df

  # -------------------------------------------------------------------- end 8 ---
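Note: a usage sketch of the documented contract — lineage and seqs are the lists produced upstream by the lineage and sequence extraction steps; the DOI value is illustrative:

    try:
        df = merge_and_score(lineage, seqs, doi="10.1000/xyz123", model=None)  # model=None skips Gemini matching
    except ValueError:
        df = None  # lineage extraction produced nothing to merge
    if df is not None and df["aa_seq"].isna().mean() > 0.5:
        pass  # the >50% missing-sequence warning has already been logged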
@@ -2320,18 +2472,17 @@ def run_pipeline(
      # Save final data with sequences using same filename (overwrites lineage-only)
      sequence_path = output_csv_path.parent / "enzyme_lineage_data.csv"

-     # Debug: Log what we're about to save
-     seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
-     log.info(f"About to save CSV: {len(df_final)} variants, {seq_count} with sequences")
-     if seq_count > 0 and 'aa_seq' in df_final:
-         with_seq = df_final[~df_final['aa_seq'].isna()]
-         log.info(f"First variant with sequence: {with_seq.iloc[0]['variant_id']} has {len(with_seq.iloc[0]['aa_seq'])} AA")
-
+     # Save the final CSV
      df_final.to_csv(sequence_path, index=False)
+
+     # Log summary statistics
+     seq_count = (~df_final['aa_seq'].isna()).sum() if 'aa_seq' in df_final else 0
      log.info(
-         "Overwrote with final results -> %s (%.1f kB)",
+         "Saved final CSV -> %s (%.1f kB, %d variants, %d with sequences)",
          sequence_path,
          sequence_path.stat().st_size / 1024,
+         len(df_final),
+         seq_count
      )

      log.info(