PyPI - debase - Versions diffs - 0.5.0__tar.gz → 0.6.0__tar.gz - Mend

debase 0.5.0.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{debase-0.5.0/src/debase.egg-info → debase-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.5.0
+Version: 0.6.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.5.0 → debase-0.6.0}/src/debase/_version.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.5.0"
+__version__ = "0.6.0"

{debase-0.5.0 → debase-0.6.0}/src/debase/lineage_format.py RENAMED Viewed

@@ -35,7 +35,6 @@ import logging
 import os
 import pickle
 import re
-import sqlite3
 import sys
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -137,8 +136,7 @@ SUBSTRATE_CACHE_FILE: Path = CACHE_DIR / "substrate_smiles_cache.pkl"
 CANONICAL_CACHE_FILE: Path = CACHE_DIR / "canonical_smiles_cache.pkl"
 CACHE_DIR.mkdir(parents=True, exist_ok=True)
-# Local PubChem DB (optional) --------------------------------------------------------
-PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
+# API endpoints for IUPAC to SMILES conversion --------------------------------------
 # Gemini API configuration -----------------------------------------------------------
 GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -323,37 +321,7 @@ SUBSTRATE_CACHE: Dict[str, str] = _load_pickle(SUBSTRATE_CACHE_FILE)
 CANONICAL_CACHE: Dict[str, str] = _load_pickle(CANONICAL_CACHE_FILE)
-# --- Database lookup ---------------------------------------------------------------
-class PubChemDB:
-    """Very thin wrapper around a local SQLite mapping IUPAC -> SMILES."""
-    def __init__(self, path: Path | str) -> None:
-        self.path = Path(path)
-        self._conn: Optional[sqlite3.Connection] = None
-        if not self.path.exists():
-            log.warning("Local PubChem DB not found at %s", self.path)
-    def _connect(self) -> sqlite3.Connection:
-        if self._conn is None:
-            self._conn = sqlite3.connect(str(self.path))
-        return self._conn
-    def lookup(self, name: str) -> Optional[str]:
-        if not self.path.exists():
-            return None
-        sql = "SELECT smiles FROM x WHERE name = ? LIMIT 1"
-        try:
-            # Create a new connection for thread safety
-            conn = sqlite3.connect(str(self.path))
-            cur = conn.execute(sql, (name.lower(),))
-            row = cur.fetchone()
-            conn.close()
-            return row[0] if row else None
-        except Exception:  # pragma: no cover
-            return None
-PC_DB = PubChemDB(PUBCHEM_DB_PATH)
+# --- Removed local database - using only online APIs -------------------------------
 # === 5. SEQUENCE / MUTATION HELPERS ================================================
@@ -481,12 +449,7 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
     if not name or name.lower() in ['nan', 'none', 'null', 'n/a', 'na', '']:
         return ""
-    # 1. Local DB (fast, offline)
-    db_smiles = PC_DB.lookup(name)
-    if db_smiles:
-        return db_smiles
-    # 2. OPSIN (if installed) ---------------------------------------------------
+    # 1. OPSIN (if installed) - fast and reliable for IUPAC names
     try:
         import subprocess
@@ -503,12 +466,7 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
     except FileNotFoundError:
         pass  # OPSIN not installed
-    # 3. Gemini search (for complex compounds) ---------------------------------
-    gemini_smiles = search_smiles_with_gemini(name)
-    if gemini_smiles:
-        return gemini_smiles
-    # 4. PubChem PUG REST (online) ---------------------------------------------
+    # 2. PubChem PUG REST API (online) - comprehensive database
     try:
         import requests
@@ -521,6 +479,11 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
             return pug_smiles
     except Exception:  # pragma: no cover
         pass
+    # 3. Gemini search (for complex compounds) - AI fallback
+    gemini_smiles = search_smiles_with_gemini(name)
+    if gemini_smiles:
+        return gemini_smiles
     # Return empty string if all methods fail
     return ""

{debase-0.5.0 → debase-0.6.0}/src/debase/reaction_info_extractor.py RENAMED Viewed

@@ -1195,7 +1195,8 @@ class ReactionExtractor:
                 # Create a flexible pattern that handles various spacing and formatting
                 # This pattern looks for "Figure" (case insensitive) followed by optional spaces
                 # then the figure number, then any of: period, colon, space+capital letter, or end of line
-                flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
+                # Also match at the beginning of a line to catch captions
+                flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
                 LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
                            main_figure_num, flexible_pattern)
@@ -1231,11 +1232,17 @@ class ReactionExtractor:
                         continue
                     # Check if the remaining text looks like a caption (contains descriptive words)
+                    # Expanded list of caption keywords to be more inclusive
                     first_words = remaining_text[:50].lower()
-                    if not any(word in first_words for word in ['detailed', 'representative', 'shows', 'comparison',
-                                                                 'illustrates', 'demonstrates', 'results', 'data',
-                                                                 'chromatogram', 'spectra', 'analysis', 'site-directed',
-                                                                 'mutagenesis', 'mutants']):
+                    caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
+                                      'illustrates', 'demonstrates', 'results', 'data',
+                                      'chromatogram', 'spectra', 'analysis', 'site-directed',
+                                      'mutagenesis', 'mutants', 'evolution', 'directed',
+                                      'screening', 'reaction', 'variant', 'enzyme', 'protein',
+                                      'activity', 'performance', 'yield', 'selectivity',
+                                      'characterization', 'optimization', 'development',
+                                      'structure', 'domain', 'crystal', 'model']
+                    if not any(word in first_words for word in caption_keywords):
                         LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
                         continue
@@ -1322,6 +1329,39 @@ class ReactionExtractor:
                     self._figure_cache.put(cache_key, result)
                     return result
+        # Fallback: If no caption found, try to find any page that mentions this figure
+        LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
+        for doc_idx, doc in enumerate(docs):
+            doc_name = "MS" if doc_idx == 0 else "SI"
+            for page_number in range(doc.page_count):
+                page = doc.load_page(page_number)
+                page_text = page.get_text()
+                # Look for any mention of the figure reference
+                if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
+                    LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
+                               ref, page_number + 1, doc_name)
+                    # Extract the entire page as the figure might be on this page
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat)
+                    pix = self._ensure_rgb_pixmap(pix)
+                    img_bytes = pix.tobytes("png")
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved fallback page image to: %s", png_file)
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
         LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
         return None

{debase-0.5.0 → debase-0.6.0/src/debase.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.5.0
+Version: 0.6.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team