debase 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.5.0/src/debase.egg-info → debase-0.6.0}/PKG-INFO +1 -1
- {debase-0.5.0 → debase-0.6.0}/src/debase/_version.py +1 -1
- {debase-0.5.0 → debase-0.6.0}/src/debase/lineage_format.py +9 -46
- {debase-0.5.0 → debase-0.6.0}/src/debase/reaction_info_extractor.py +45 -5
- {debase-0.5.0 → debase-0.6.0/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.5.0 → debase-0.6.0}/.gitignore +0 -0
- {debase-0.5.0 → debase-0.6.0}/LICENSE +0 -0
- {debase-0.5.0 → debase-0.6.0}/MANIFEST.in +0 -0
- {debase-0.5.0 → debase-0.6.0}/README.md +0 -0
- {debase-0.5.0 → debase-0.6.0}/environment.yml +0 -0
- {debase-0.5.0 → debase-0.6.0}/pyproject.toml +0 -0
- {debase-0.5.0 → debase-0.6.0}/setup.cfg +0 -0
- {debase-0.5.0 → debase-0.6.0}/setup.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/__init__.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/__init__.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/__main__.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/build_db.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/cleanup_sequence.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/enzyme_lineage_extractor.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/substrate_scope_extractor.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase/wrapper.py +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase.egg-info/SOURCES.txt +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.5.0 → debase-0.6.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -35,7 +35,6 @@ import logging
|
|
35
35
|
import os
|
36
36
|
import pickle
|
37
37
|
import re
|
38
|
-
import sqlite3
|
39
38
|
import sys
|
40
39
|
import time
|
41
40
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
@@ -137,8 +136,7 @@ SUBSTRATE_CACHE_FILE: Path = CACHE_DIR / "substrate_smiles_cache.pkl"
|
|
137
136
|
CANONICAL_CACHE_FILE: Path = CACHE_DIR / "canonical_smiles_cache.pkl"
|
138
137
|
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
139
138
|
|
140
|
-
#
|
141
|
-
PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
|
139
|
+
# API endpoints for IUPAC to SMILES conversion --------------------------------------
|
142
140
|
|
143
141
|
# Gemini API configuration -----------------------------------------------------------
|
144
142
|
GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
|
@@ -323,37 +321,7 @@ SUBSTRATE_CACHE: Dict[str, str] = _load_pickle(SUBSTRATE_CACHE_FILE)
|
|
323
321
|
CANONICAL_CACHE: Dict[str, str] = _load_pickle(CANONICAL_CACHE_FILE)
|
324
322
|
|
325
323
|
|
326
|
-
# ---
|
327
|
-
class PubChemDB:
|
328
|
-
"""Very thin wrapper around a local SQLite mapping IUPAC -> SMILES."""
|
329
|
-
|
330
|
-
def __init__(self, path: Path | str) -> None:
|
331
|
-
self.path = Path(path)
|
332
|
-
self._conn: Optional[sqlite3.Connection] = None
|
333
|
-
if not self.path.exists():
|
334
|
-
log.warning("Local PubChem DB not found at %s", self.path)
|
335
|
-
|
336
|
-
def _connect(self) -> sqlite3.Connection:
|
337
|
-
if self._conn is None:
|
338
|
-
self._conn = sqlite3.connect(str(self.path))
|
339
|
-
return self._conn
|
340
|
-
|
341
|
-
def lookup(self, name: str) -> Optional[str]:
|
342
|
-
if not self.path.exists():
|
343
|
-
return None
|
344
|
-
sql = "SELECT smiles FROM x WHERE name = ? LIMIT 1"
|
345
|
-
try:
|
346
|
-
# Create a new connection for thread safety
|
347
|
-
conn = sqlite3.connect(str(self.path))
|
348
|
-
cur = conn.execute(sql, (name.lower(),))
|
349
|
-
row = cur.fetchone()
|
350
|
-
conn.close()
|
351
|
-
return row[0] if row else None
|
352
|
-
except Exception: # pragma: no cover
|
353
|
-
return None
|
354
|
-
|
355
|
-
|
356
|
-
PC_DB = PubChemDB(PUBCHEM_DB_PATH)
|
324
|
+
# --- Removed local database - using only online APIs -------------------------------
|
357
325
|
|
358
326
|
|
359
327
|
# === 5. SEQUENCE / MUTATION HELPERS ================================================
|
@@ -481,12 +449,7 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
|
|
481
449
|
if not name or name.lower() in ['nan', 'none', 'null', 'n/a', 'na', '']:
|
482
450
|
return ""
|
483
451
|
|
484
|
-
# 1.
|
485
|
-
db_smiles = PC_DB.lookup(name)
|
486
|
-
if db_smiles:
|
487
|
-
return db_smiles
|
488
|
-
|
489
|
-
# 2. OPSIN (if installed) ---------------------------------------------------
|
452
|
+
# 1. OPSIN (if installed) - fast and reliable for IUPAC names
|
490
453
|
try:
|
491
454
|
import subprocess
|
492
455
|
|
@@ -503,12 +466,7 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
|
|
503
466
|
except FileNotFoundError:
|
504
467
|
pass # OPSIN not installed
|
505
468
|
|
506
|
-
#
|
507
|
-
gemini_smiles = search_smiles_with_gemini(name)
|
508
|
-
if gemini_smiles:
|
509
|
-
return gemini_smiles
|
510
|
-
|
511
|
-
# 4. PubChem PUG REST (online) ---------------------------------------------
|
469
|
+
# 2. PubChem PUG REST API (online) - comprehensive database
|
512
470
|
try:
|
513
471
|
import requests
|
514
472
|
|
@@ -521,6 +479,11 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
|
|
521
479
|
return pug_smiles
|
522
480
|
except Exception: # pragma: no cover
|
523
481
|
pass
|
482
|
+
|
483
|
+
# 3. Gemini search (for complex compounds) - AI fallback
|
484
|
+
gemini_smiles = search_smiles_with_gemini(name)
|
485
|
+
if gemini_smiles:
|
486
|
+
return gemini_smiles
|
524
487
|
|
525
488
|
# Return empty string if all methods fail
|
526
489
|
return ""
|
@@ -1195,7 +1195,8 @@ class ReactionExtractor:
|
|
1195
1195
|
# Create a flexible pattern that handles various spacing and formatting
|
1196
1196
|
# This pattern looks for "Figure" (case insensitive) followed by optional spaces
|
1197
1197
|
# then the figure number, then any of: period, colon, space+capital letter, or end of line
|
1198
|
-
|
1198
|
+
# Also match at the beginning of a line to catch captions
|
1199
|
+
flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
|
1199
1200
|
|
1200
1201
|
LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
|
1201
1202
|
main_figure_num, flexible_pattern)
|
@@ -1231,11 +1232,17 @@ class ReactionExtractor:
|
|
1231
1232
|
continue
|
1232
1233
|
|
1233
1234
|
# Check if the remaining text looks like a caption (contains descriptive words)
|
1235
|
+
# Expanded list of caption keywords to be more inclusive
|
1234
1236
|
first_words = remaining_text[:50].lower()
|
1235
|
-
|
1236
|
-
|
1237
|
-
|
1238
|
-
|
1237
|
+
caption_keywords = ['detailed', 'representative', 'shows', 'comparison',
|
1238
|
+
'illustrates', 'demonstrates', 'results', 'data',
|
1239
|
+
'chromatogram', 'spectra', 'analysis', 'site-directed',
|
1240
|
+
'mutagenesis', 'mutants', 'evolution', 'directed',
|
1241
|
+
'screening', 'reaction', 'variant', 'enzyme', 'protein',
|
1242
|
+
'activity', 'performance', 'yield', 'selectivity',
|
1243
|
+
'characterization', 'optimization', 'development',
|
1244
|
+
'structure', 'domain', 'crystal', 'model']
|
1245
|
+
if not any(word in first_words for word in caption_keywords):
|
1239
1246
|
LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
|
1240
1247
|
continue
|
1241
1248
|
|
@@ -1322,6 +1329,39 @@ class ReactionExtractor:
|
|
1322
1329
|
self._figure_cache.put(cache_key, result)
|
1323
1330
|
return result
|
1324
1331
|
|
1332
|
+
# Fallback: If no caption found, try to find any page that mentions this figure
|
1333
|
+
LOGGER.info("No figure caption found for '%s', trying fallback search", ref)
|
1334
|
+
|
1335
|
+
for doc_idx, doc in enumerate(docs):
|
1336
|
+
doc_name = "MS" if doc_idx == 0 else "SI"
|
1337
|
+
for page_number in range(doc.page_count):
|
1338
|
+
page = doc.load_page(page_number)
|
1339
|
+
page_text = page.get_text()
|
1340
|
+
|
1341
|
+
# Look for any mention of the figure reference
|
1342
|
+
if re.search(rf'\b{re.escape(ref)}\b', page_text, re.IGNORECASE):
|
1343
|
+
LOGGER.info("Found '%s' mentioned on page %d of %s document (fallback)",
|
1344
|
+
ref, page_number + 1, doc_name)
|
1345
|
+
|
1346
|
+
# Extract the entire page as the figure might be on this page
|
1347
|
+
mat = fitz.Matrix(5.0, 5.0) # 5x zoom for better quality
|
1348
|
+
pix = page.get_pixmap(matrix=mat)
|
1349
|
+
pix = self._ensure_rgb_pixmap(pix)
|
1350
|
+
img_bytes = pix.tobytes("png")
|
1351
|
+
|
1352
|
+
# Save PNG to debug directory if available
|
1353
|
+
if self.debug_dir:
|
1354
|
+
timestamp = int(time.time())
|
1355
|
+
png_file = self.debug_dir / f"fallback_{ref.replace(' ', '_')}_{timestamp}.png"
|
1356
|
+
with open(png_file, 'wb') as f:
|
1357
|
+
f.write(img_bytes)
|
1358
|
+
LOGGER.info("Saved fallback page image to: %s", png_file)
|
1359
|
+
|
1360
|
+
result = b64encode(img_bytes).decode()
|
1361
|
+
# Cache the result
|
1362
|
+
self._figure_cache.put(cache_key, result)
|
1363
|
+
return result
|
1364
|
+
|
1325
1365
|
LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
|
1326
1366
|
return None
|
1327
1367
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|