debase 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.5.1"
3
+ __version__ = "0.6.1"
@@ -0,0 +1,146 @@
1
+ """Utilities for handling campaign information across extractors.
2
+
3
+ This module provides functions to load and use campaign information
4
+ to improve extraction accuracy by providing context about model substrates,
5
+ products, and data locations.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import List, Dict, Optional, Any
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def load_campaigns_from_file(campaign_file: Path) -> List[Dict[str, Any]]:
    """Load campaign information from a JSON file.

    Args:
        campaign_file: Path to campaigns.json file

    Returns:
        List of campaign dictionaries. Returns an empty list when the file
        is missing, unreadable, contains invalid JSON, or its JSON root is
        not a list.
    """
    if not campaign_file.exists():
        logger.warning(f"Campaign file not found: {campaign_file}")
        return []

    try:
        # Explicit encoding so behavior does not depend on the platform default.
        with open(campaign_file, 'r', encoding='utf-8') as f:
            campaigns = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        # Only I/O and parse failures are expected here; anything else
        # (e.g. KeyboardInterrupt) should propagate instead of being swallowed.
        logger.error(f"Failed to load campaigns from {campaign_file}: {e}")
        return []

    if not isinstance(campaigns, list):
        # Guard the declared return type: callers iterate a list of dicts.
        logger.error(f"Campaign file {campaign_file} does not contain a JSON list")
        return []

    logger.info(f"Loaded {len(campaigns)} campaigns from {campaign_file}")
    return campaigns
37
+
38
+
39
def find_campaign_by_id(campaigns: List[Dict[str, Any]], campaign_id: str) -> Optional[Dict[str, Any]]:
    """Find a specific campaign by ID.

    Args:
        campaigns: List of campaign dictionaries
        campaign_id: Campaign ID to search for

    Returns:
        Campaign dictionary if found, None otherwise
    """
    # next() over a generator returns the first matching entry,
    # or the supplied default (None) when no entry matches.
    return next(
        (entry for entry in campaigns if entry.get('campaign_id') == campaign_id),
        None,
    )
53
+
54
+
55
def get_campaign_context(campaign: Dict[str, Any]) -> str:
    """Generate context string for prompts from campaign information.

    Args:
        campaign: Campaign dictionary

    Returns:
        Formatted context string for inclusion in prompts
    """
    # The name/description header is always emitted; every other line is
    # included only when the corresponding campaign field is truthy.
    lines = [
        f"Campaign: {campaign.get('campaign_name', 'Unknown')}",
        f"Description: {campaign.get('description', '')}",
    ]

    substrate = campaign.get('model_substrate')
    if substrate:
        lines.append(f"Model Substrate: {substrate} (ID: {campaign.get('substrate_id', 'unknown')})")

    product = campaign.get('model_product')
    if product:
        lines.append(f"Model Product: {product} (ID: {campaign.get('product_id', 'unknown')})")

    data_locations = campaign.get('data_locations')
    if data_locations:
        lines.append(f"Key Data Locations: {', '.join(data_locations)}")

    lineage_hint = campaign.get('lineage_hint')
    if lineage_hint:
        lines.append(f"Evolution Pathway: {lineage_hint}")

    notes = campaign.get('notes')
    if notes:
        lines.append(f"Notes: {notes}")

    return '\n'.join(lines)
90
+
91
+
92
def get_location_hints_for_campaign(campaign: Dict[str, Any]) -> List[str]:
    """Extract specific location hints from campaign data.

    Args:
        campaign: Campaign dictionary

    Returns:
        List of location strings (e.g., ["Figure 2a", "Table S4"])
    """
    # EAFP lookup: fall back to an empty list when the key is absent.
    try:
        return campaign['data_locations']
    except KeyError:
        return []
102
+
103
+
104
def enhance_prompt_with_campaign(prompt: str, campaign: Optional[Dict[str, Any]],
                                 section_name: str = "CAMPAIGN CONTEXT") -> str:
    """Enhance a prompt with campaign context information.

    Args:
        prompt: Original prompt
        campaign: Campaign dictionary (optional)
        section_name: Section header for the campaign context

    Returns:
        Enhanced prompt with campaign context
    """
    if not campaign:
        return prompt

    divider = '-' * 50
    pieces = [f"\n\n{section_name}:\n{divider}\n{get_campaign_context(campaign)}"]
    hints = get_location_hints_for_campaign(campaign)
    if hints:
        pieces.append(f"\n\nIMPORTANT: Focus particularly on these locations: {', '.join(hints)}")
    pieces.append(f"\n{divider}\n")
    section = ''.join(pieces)

    # Pick an insertion point past the opening instructions: the first line
    # after index 5 that is blank or starts a new instruction block.
    prompt_lines = prompt.split('\n')
    insert_at = next(
        (idx for idx, line in enumerate(prompt_lines)
         if idx > 5 and (not line.strip() or line.startswith('Given') or line.startswith('You'))),
        0,
    )

    if insert_at == 0:
        # No suitable spot found (e.g. a very short prompt): prepend instead.
        return section + prompt

    prompt_lines.insert(insert_at, section)
    return '\n'.join(prompt_lines)
@@ -0,0 +1,39 @@
1
+ """Universal caption pattern for all DEBase extractors.
2
+
3
+ This module provides a consistent caption pattern that handles various
4
+ formats found in scientific papers, including:
5
+ - Standard formats: Figure 1, Fig. 1, Table 1
6
+ - Supplementary formats: Supplementary Figure 1, Supp. Table 1
7
+ - Extended data: Extended Data Figure 1, ED Fig. 1
8
+ - Other types: Scheme 1, Chart 1
9
+ - Page headers: S14 Table 5
10
+ - Various punctuation: Figure 1. Figure 1: Figure 1 |
11
+ """
12
+
13
+ import re
14
+
15
# Universal caption pattern that handles all common formats.
# NOTE: the (?![A-Za-z]) lookahead on the roman-numeral branch fixes a
# false-positive bug: with re.I, [IVX]+ alone matched a single "i"/"v"/"x"
# inside ordinary words (e.g. "Chart" + "i" inside "charting").
UNIVERSAL_CAPTION_PATTERN = re.compile(
    r"""
    ^                                           # Start of line
    [^\n]{0,20}?                                # Up to 20 chars of any content (page headers, etc.)
    (                                           # Start capture group
        (?:Extended\s+Data\s+)?                 # Optional "Extended Data" prefix
        (?:ED\s+)?                              # Optional "ED" prefix
        (?:Supplementary|Supp\.?|Suppl\.?)?\s*  # Optional supplementary prefixes
        (?:Table|Fig(?:ure)?|Scheme|Chart)      # Main caption types
    )                                           # End capture group
    (?:                                         # Non-capturing group for what follows
        \s*                                     # Optional whitespace
        (?:S?\d+[A-Za-z]?                       # Number (with optional S prefix)
          |[IVX]+(?![A-Za-z]))                  # ... or roman numeral not embedded in a word
        (?:[.:|]|\s+\|)?                        # Optional punctuation (. : or |)
        |                                       # OR
        \.                                      # Just a period (for "Fig." without number)
    )
    """,
    re.I | re.X | re.M
)
36
+
37
def get_universal_caption_pattern() -> "re.Pattern[str]":
    """Get the universal caption pattern for use in extractors.

    Returns:
        The module-level compiled regex (case-insensitive, verbose,
        multiline) that matches Figure/Table/Scheme/Chart caption prefixes,
        including supplementary and Extended Data variants.
    """
    return UNIVERSAL_CAPTION_PATTERN
@@ -28,6 +28,13 @@ import fitz
28
28
  import re
29
29
  import json
30
30
  import time
31
+
32
+ # Import universal caption pattern
33
+ try:
34
+ from .caption_pattern import get_universal_caption_pattern
35
+ except ImportError:
36
+ # Fallback if running as standalone script
37
+ from caption_pattern import get_universal_caption_pattern
31
38
  import logging
32
39
  from pathlib import Path
33
40
  from dataclasses import dataclass, field
@@ -113,17 +120,8 @@ _DOI_REGEX = re.compile(r"10\.[0-9]{4,9}/[-._;()/:A-Z0-9]+", re.I)
113
120
  # PDB ID regex - matches 4-character PDB codes
114
121
  _PDB_REGEX = re.compile(r"\b[1-9][A-Z0-9]{3}\b")
115
122
 
116
- # Improved caption prefix regex - captures most journal variants
117
- _CAPTION_PREFIX_RE = re.compile(
118
- r"""
119
- ^\s*
120
- (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
121
- Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table)) # label part
122
- \s*(?:S?\d+[A-Za-z]?|[IVX]+) # figure number
123
- [.:]?\s* # trailing punctuation/space
124
- """,
125
- re.I | re.X,
126
- )
123
+ # Use universal caption pattern
124
+ _CAPTION_PREFIX_RE = get_universal_caption_pattern()
127
125
 
128
126
 
129
127
  def _open_doc(pdf_path: str | Path | bytes):
@@ -956,6 +954,9 @@ def identify_evolution_locations(
956
954
  campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
957
955
  if hasattr(camp, 'notes') and camp.notes:
958
956
  campaign_context += f"- Key identifiers: {camp.notes}\n"
957
+ if hasattr(camp, 'data_locations') and camp.data_locations:
958
+ campaign_context += f"- KNOWN DATA LOCATIONS: {', '.join(camp.data_locations)}\n"
959
+ campaign_context += " IMPORTANT: Prioritize these known locations highly!\n"
959
960
  campaign_specific = f" for the '{camp.campaign_name}' campaign"
960
961
  campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
961
962
  campaign_example = f', "campaign_id": "{camp.campaign_id}"'
@@ -964,7 +965,10 @@ def identify_evolution_locations(
964
965
  campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
965
966
  for camp in campaigns:
966
967
  campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
968
+ if hasattr(camp, 'data_locations') and camp.data_locations:
969
+ campaign_context += f" Known locations: {', '.join(camp.data_locations)}\n"
967
970
  campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
971
+ campaign_context += "IMPORTANT: Prioritize the known locations listed above!\n"
968
972
  campaign_specific = " for any of the identified campaigns"
969
973
  campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
970
974
  campaign_example = ', "campaign_id": "campaign_id_here"'
@@ -1041,6 +1045,7 @@ def extract_complete_lineage(
1041
1045
  campaign_id: Optional[str] = None,
1042
1046
  campaign_info: Optional[Campaign] = None,
1043
1047
  pdf_paths: Optional[List[Path]] = None,
1048
+ location_str: Optional[str] = None,
1044
1049
  ) -> List[Variant]:
1045
1050
  """Prompt Gemini for the full lineage and return a list[Variant]."""
1046
1051
  # Build campaign context
@@ -1060,6 +1065,21 @@ IMPORTANT:
1060
1065
  2. Include "campaign_id": "{campaign_info.campaign_id}" for each variant in your response.
1061
1066
  3. Use the lineage hint pattern above to identify which variants belong to this campaign.
1062
1067
  4. Include parent variants only if they are direct ancestors in this campaign's lineage.
1068
+ """
1069
+
1070
+ # Add location context if provided
1071
+ location_context = ""
1072
+ if location_str:
1073
+ location_context = f"""
1074
+
1075
+ LOCATION CONTEXT:
1076
+ You are extracting data SPECIFICALLY from: {location_str}
1077
+
1078
+ CRITICAL INSTRUCTIONS:
1079
+ - ONLY extract enzyme variants that appear in {location_str}
1080
+ - DO NOT include variants from other figures, tables, or sections
1081
+ - If {location_str} references variants from other locations, DO NOT include those unless they are explicitly shown in {location_str}
1082
+ - Focus strictly on the data presented within the boundaries of {location_str}
1063
1083
  """
1064
1084
 
1065
1085
  # Extract table of contents from PDFs if available
@@ -1096,8 +1116,11 @@ IMPORTANT:
1096
1116
  # Include TOC in the prompt text
1097
1117
  combined_text = toc_text + text if toc_text else text
1098
1118
 
1119
+ # Combine campaign and location context
1120
+ full_context = campaign_context + location_context
1121
+
1099
1122
  prompt = _LINEAGE_EXTRACT_PROMPT.format(
1100
- campaign_context=campaign_context,
1123
+ campaign_context=full_context,
1101
1124
  schema=_LINEAGE_SCHEMA_HINT,
1102
1125
  text=combined_text[:MAX_CHARS],
1103
1126
  )
@@ -1705,7 +1728,8 @@ def get_lineage(
1705
1728
  debug_dir=debug_dir,
1706
1729
  campaign_id=campaign.campaign_id,
1707
1730
  campaign_info=campaign,
1708
- pdf_paths=pdf_paths
1731
+ pdf_paths=pdf_paths,
1732
+ location_str=location_str
1709
1733
  )
1710
1734
  if variants:
1711
1735
  log.info(f"Extracted {len(variants)} variants from {location_type}")
@@ -3364,6 +3388,9 @@ Only match variants that represent the SAME enzyme, accounting for different nam
3364
3388
  Return ONLY a JSON object mapping lineage IDs to sequence IDs.
3365
3389
  Format: {{"lineage_id": "sequence_id", ...}}
3366
3390
  Only include matches you are confident represent the same variant.
3391
+
3392
+ DO NOT include any explanation, reasoning, or text other than the JSON object.
3393
+ Response must be valid JSON that starts with {{ and ends with }}
3367
3394
  """
3368
3395
 
3369
3396
  try:
@@ -3406,17 +3433,28 @@ Only include matches you are confident represent the same variant.
3406
3433
  log.error(f"Full cleaned text: {text}")
3407
3434
  # Try to extract JSON from within the response
3408
3435
  import re
3409
- json_match = re.search(r'\{.*\}', text, re.DOTALL)
3410
- if json_match:
3436
+ # First try to find JSON in code blocks
3437
+ code_block_match = re.search(r'```json\s*(\{[^`]*\})\s*```', text, re.DOTALL)
3438
+ if code_block_match:
3411
3439
  try:
3412
- matches = json.loads(json_match.group(0))
3413
- log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
3440
+ matches = json.loads(code_block_match.group(1))
3441
+ log.info(f"Successfully extracted JSON from code block: {len(matches)} matches")
3414
3442
  except json.JSONDecodeError:
3415
- log.error("Failed to extract JSON from response")
3443
+ log.error("Failed to parse JSON from code block")
3416
3444
  matches = {}
3417
3445
  else:
3418
- log.error("No JSON object found in response")
3419
- matches = {}
3446
+ # Try to find standalone JSON object (non-greedy, looking for balanced braces)
3447
+ json_match = re.search(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', text)
3448
+ if json_match:
3449
+ try:
3450
+ matches = json.loads(json_match.group(1))
3451
+ log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
3452
+ except json.JSONDecodeError:
3453
+ log.error("Failed to extract JSON from response")
3454
+ matches = {}
3455
+ else:
3456
+ log.error("No JSON object found in response")
3457
+ matches = {}
3420
3458
 
3421
3459
  # Create a mapping of sequence IDs to their data for efficient lookup
3422
3460
  seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
debase/lineage_format.py CHANGED
@@ -35,7 +35,6 @@ import logging
35
35
  import os
36
36
  import pickle
37
37
  import re
38
- import sqlite3
39
38
  import sys
40
39
  import time
41
40
  from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -137,8 +136,7 @@ SUBSTRATE_CACHE_FILE: Path = CACHE_DIR / "substrate_smiles_cache.pkl"
137
136
  CANONICAL_CACHE_FILE: Path = CACHE_DIR / "canonical_smiles_cache.pkl"
138
137
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
139
138
 
140
- # Local PubChem DB (optional) --------------------------------------------------------
141
- PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
139
+ # API endpoints for IUPAC to SMILES conversion --------------------------------------
142
140
 
143
141
  # Gemini API configuration -----------------------------------------------------------
144
142
  GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -323,37 +321,7 @@ SUBSTRATE_CACHE: Dict[str, str] = _load_pickle(SUBSTRATE_CACHE_FILE)
323
321
  CANONICAL_CACHE: Dict[str, str] = _load_pickle(CANONICAL_CACHE_FILE)
324
322
 
325
323
 
326
- # --- Database lookup ---------------------------------------------------------------
327
- class PubChemDB:
328
- """Very thin wrapper around a local SQLite mapping IUPAC -> SMILES."""
329
-
330
- def __init__(self, path: Path | str) -> None:
331
- self.path = Path(path)
332
- self._conn: Optional[sqlite3.Connection] = None
333
- if not self.path.exists():
334
- log.warning("Local PubChem DB not found at %s", self.path)
335
-
336
- def _connect(self) -> sqlite3.Connection:
337
- if self._conn is None:
338
- self._conn = sqlite3.connect(str(self.path))
339
- return self._conn
340
-
341
- def lookup(self, name: str) -> Optional[str]:
342
- if not self.path.exists():
343
- return None
344
- sql = "SELECT smiles FROM x WHERE name = ? LIMIT 1"
345
- try:
346
- # Create a new connection for thread safety
347
- conn = sqlite3.connect(str(self.path))
348
- cur = conn.execute(sql, (name.lower(),))
349
- row = cur.fetchone()
350
- conn.close()
351
- return row[0] if row else None
352
- except Exception: # pragma: no cover
353
- return None
354
-
355
-
356
- PC_DB = PubChemDB(PUBCHEM_DB_PATH)
324
+ # --- Removed local database - using only online APIs -------------------------------
357
325
 
358
326
 
359
327
  # === 5. SEQUENCE / MUTATION HELPERS ================================================
@@ -481,12 +449,7 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
481
449
  if not name or name.lower() in ['nan', 'none', 'null', 'n/a', 'na', '']:
482
450
  return ""
483
451
 
484
- # 1. Local DB (fast, offline)
485
- db_smiles = PC_DB.lookup(name)
486
- if db_smiles:
487
- return db_smiles
488
-
489
- # 2. OPSIN (if installed) ---------------------------------------------------
452
+ # 1. OPSIN (if installed) - fast and reliable for IUPAC names
490
453
  try:
491
454
  import subprocess
492
455
 
@@ -503,12 +466,7 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
503
466
  except FileNotFoundError:
504
467
  pass # OPSIN not installed
505
468
 
506
- # 3. Gemini search (for complex compounds) ---------------------------------
507
- gemini_smiles = search_smiles_with_gemini(name)
508
- if gemini_smiles:
509
- return gemini_smiles
510
-
511
- # 4. PubChem PUG REST (online) ---------------------------------------------
469
+ # 2. PubChem PUG REST API (online) - comprehensive database
512
470
  try:
513
471
  import requests
514
472
 
@@ -521,6 +479,11 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
521
479
  return pug_smiles
522
480
  except Exception: # pragma: no cover
523
481
  pass
482
+
483
+ # 3. Gemini search (for complex compounds) - AI fallback
484
+ gemini_smiles = search_smiles_with_gemini(name)
485
+ if gemini_smiles:
486
+ return gemini_smiles
524
487
 
525
488
  # Return empty string if all methods fail
526
489
  return ""