PyPI - debase - Versions diffs - 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl - Mend

debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15)

debase/_version.py +1 -1
debase/cleanup_sequence.py +512 -33
debase/enzyme_lineage_extractor.py +985 -100
debase/lineage_format.py +226 -13
debase/reaction_info_extractor.py +178 -34
debase/substrate_scope_extractor.py +52 -4
debase/wrapper.py +155 -151
debase-0.4.5.dist-info/METADATA +121 -0
debase-0.4.5.dist-info/RECORD +16 -0
debase-0.4.3.dist-info/METADATA +0 -296
debase-0.4.3.dist-info/RECORD +0 -16
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0

debase/reaction_info_extractor.py CHANGED Viewed

@@ -58,7 +58,7 @@ class Config:
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 4096
+    max_tokens: int = 12288  # Increased 3x from 4096
     pdf_cache_size: int = 8
     retries: int = 2
@@ -209,7 +209,7 @@ def _cached_gemini_call(
                 parts,
                 generation_config={
                     "temperature": temperature,
-                    "max_output_tokens": 8192,
+                    "max_output_tokens": 24576,  # Increased 3x from 8192
                 }
             )
             # Track token usage if available
@@ -450,7 +450,7 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
-Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
 Do not include too much sources, just return 2 or 3 sources.
 Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
 When returning confidence scores, be more accurate and avoid scores that are too close together.
@@ -703,6 +703,14 @@ CRITICAL - NO HALLUCINATION:
 - If no IUPAC name is found for a compound, return null for iupac_name
 - Include ALL compounds found or referenced
+IMPORTANT - ONE NAME PER COMPOUND:
+- Return ONLY ONE IUPAC name per compound identifier
+- If multiple names are found for the same compound, choose the one most likely to be the IUPAC name:
+  1. Names explicitly labeled as "IUPAC name:" in the text
+  2. Names in compound characterization sections
+  3. The most systematic/complete chemical name
+- Do NOT return multiple IUPAC names in a single iupac_name field
 Return as JSON:
 {
   "compound_mappings": [
@@ -722,8 +730,8 @@ Return as JSON:
 ###############################################################################
 class ReactionExtractor:
-    _FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
-    _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
+    _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
+    _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
                  campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
@@ -792,14 +800,24 @@ class ReactionExtractor:
                 context_start = context_start + last_period + 1
             # For tables, include much more content after the caption to show actual table data
-            # For figures, keep the original limit
-            is_table = match.group(1).lower() == 'table'
-            max_chars = 5000 if is_table else 3000
-            # Get up to max_chars or until double newline
-            caption_end = all_text.find("\n\n", caption_start)
-            if caption_end == -1 or caption_end - caption_start > max_chars:
+            # For figures, include more content to ensure complete captions
+            is_table = 'table' in match.group(1).lower()
+            max_chars = 8000 if is_table else 5000
+            # Get up to max_chars or until double newline (but ensure we get complete caption)
+            # First, try to find the end of the caption sentence
+            caption_end = caption_start
+            period_pos = all_text.find('. ', caption_start)
+            if period_pos != -1 and period_pos < caption_start + 1000:
+                # Include at least to the end of the caption sentence
+                caption_end = period_pos + 1
+            # Then extend to include more context or until double newline
+            double_newline_pos = all_text.find("\n\n", caption_end)
+            if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
                 caption_end = caption_start + max_chars
+            else:
+                caption_end = double_newline_pos
             # Include the context and full caption with table content
             full_caption = all_text[context_start:caption_end].strip()
@@ -1082,6 +1100,7 @@ class ReactionExtractor:
         If extract_figure_only=True, extracts just the figure above the caption.
         If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""
+        LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
         # Check cache first
         cache_key = f"{ref}_{extract_figure_only}"
@@ -1099,10 +1118,18 @@ class ReactionExtractor:
             return None
         # For figure extraction, search both documents for actual figure captions
-        for doc in filter(None, [self.ms_doc, self.si_doc]):
+        docs = list(filter(None, [self.ms_doc, self.si_doc]))
+        LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
+        for doc_idx, doc in enumerate(docs):
+            doc_name = "MS" if doc_idx == 0 else "SI"
+            LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+                LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
+                           page_number + 1, doc_name, len(page_text))
                 # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
                 # For subfigures like "Figure 1C", extract the main figure "Figure 1"
@@ -1150,6 +1177,14 @@ class ReactionExtractor:
                     if figure_mentions:
                         LOGGER.debug("Page %d has figure mentions but no caption match: %s",
                                    page_number, figure_mentions[:3])
+                    # For supplementary figures, also check for "supplementary" mentions
+                    if 'supplementary' in ref.lower():
+                        supp_mentions = [line.strip() for line in page_text.split('\n')
+                                       if 'supplementary' in line.lower() and 'figure' in line.lower()]
+                        if supp_mentions:
+                            LOGGER.warning("Found supplementary figure mentions on page %d but no caption match. First 3: %s",
+                                         page_number + 1, supp_mentions[:3])
                     continue
                 if extract_figure_only:
@@ -1207,6 +1242,8 @@ class ReactionExtractor:
                     # Cache the result
                     self._figure_cache.put(cache_key, result)
                     return result
+        LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
         return None
     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
@@ -1437,6 +1474,7 @@ class ReactionExtractor:
     def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
         """Extract performance metrics for multiple enzymes from the identified location in batch."""
+        LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
         ref_lc = ref.lower()
         image_b64: Optional[str] = None
@@ -1458,11 +1496,15 @@ class ReactionExtractor:
                 snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
             # For figures, extract just the figure image (same logic as compound mapping)
+            LOGGER.debug("Attempting to extract figure image for '%s'", ref)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
-                LOGGER.debug("No figure image found for %s - using caption text", ref)
+                LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+                LOGGER.debug("Caption extraction result: %s",
+                           f"'{snippet[:100]}...'" if snippet else "empty")
             else:
+                LOGGER.info("Successfully extracted figure image for '%s'", ref)
                 # If figure is found, ignore text information - use image only
                 snippet = ""
         else:
@@ -1907,8 +1949,14 @@ TEXT FROM MANUSCRIPT:
                     f.write(prompt)
                 LOGGER.info("Full prompt saved to: %s", prompt_file)
-            # Make multimodal API call
-            response = self.model.generate_content(content_parts)
+            # Make multimodal API call with increased token limit
+            response = self.model.generate_content(
+                content_parts,
+                generation_config={
+                    "temperature": 0.0,
+                    "max_output_tokens": 24576,  # Increased 3x for compound mapping
+                }
+            )
             # Track token usage if available
             try:
@@ -1971,6 +2019,7 @@ TEXT FROM MANUSCRIPT:
         compound_ids: List[str],
         initial_sections: List[str] = None,
         campaign_filter: Optional[str] = None,
+        iupac_location_hint: Optional[Dict[str, Any]] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
@@ -2002,14 +2051,57 @@ TEXT FROM MANUSCRIPT:
         LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
                    len(uncached_compound_ids), sorted(uncached_compound_ids))
-        # Tier 1: Standard sections (manuscript + initial SI sections)
-        initial_sections = initial_sections or [
-            "General procedure", "Compound characterization",
-            "Synthesis", "Experimental", "Materials and methods"
-        ]
-        # Include manuscript pages (first 10) for model reaction context
-        manuscript_text = "\n\n".join(self.ms_pages[:10])
+        # Tier 1: Use IUPAC location hint if provided, otherwise standard sections
+        if iupac_location_hint and iupac_location_hint.get('location'):
+            LOGGER.info("Tier 1: Using IUPAC location hint: %s", iupac_location_hint.get('location'))
+            if iupac_location_hint.get('compound_section_hint'):
+                LOGGER.info("Tier 1: Compound section hint: %s", iupac_location_hint.get('compound_section_hint'))
+            # Extract text from the specific IUPAC location
+            iupac_text = self._get_extended_text_around_location(
+                iupac_location_hint['location'],
+                before=2000,
+                after=10000
+            )
+            # Also check for compound-specific hints
+            compound_hint = iupac_location_hint.get('compound_section_hint', '')
+            if compound_hint and iupac_text:
+                # Search for the specific compound section
+                hint_pattern = re.escape(compound_hint)
+                match = re.search(hint_pattern, iupac_text, re.IGNORECASE)
+                if match:
+                    # Extract more focused text around the compound hint
+                    start = max(0, match.start() - 500)
+                    end = min(len(iupac_text), match.end() + 2000)
+                    iupac_text = iupac_text[start:end]
+                    LOGGER.info("Found compound hint '%s' in IUPAC section", compound_hint)
+            extraction_text = iupac_text or ""
+            if extraction_text:
+                LOGGER.info("Tier 1: Extracted %d chars from IUPAC location hint", len(extraction_text))
+            else:
+                LOGGER.warning("Tier 1: No text found at IUPAC location hint")
+            # Add some manuscript context
+            manuscript_text = "\n\n".join(self.ms_pages[:5])
+        else:
+            # Fallback to standard sections
+            initial_sections = initial_sections or [
+                "General procedure", "Compound characterization",
+                "Synthesis", "Experimental", "Materials and methods"
+            ]
+            # Extract from initial sections - search in all pages (manuscript + SI)
+            extraction_text = self._extract_sections_by_title(initial_sections)
+            # If no sections found by title, include first few SI pages which often have compound data
+            if not extraction_text and self.si_pages:
+                # SI often starts with compound characterization after TOC
+                si_compound_pages = "\n\n".join(self.si_pages[2:10])  # Skip first 2 pages (usually TOC)
+                extraction_text = si_compound_pages
+            # Include manuscript pages (first 10) for model reaction context
+            manuscript_text = "\n\n".join(self.ms_pages[:10])
         # Add campaign context if provided
         campaign_context = ""
@@ -2033,8 +2125,7 @@ Do NOT include compound information from other campaigns.
 """
-        # Extract from initial sections
-        extraction_text = self._extract_sections_by_title(initial_sections)
+        # Combine manuscript text, campaign context, and extraction text
         if extraction_text:
             extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
@@ -2083,11 +2174,11 @@ Do NOT include compound information from other campaigns.
                         figure_images[ref] = img_b64
                         LOGGER.info("Extracted %s for compound mapping", ref)
-            # Full text search including all pages
-            full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+            # Full text search including ALL pages (manuscript + SI)
+            full_text = "\n\n".join(self.all_pages)  # Send everything
             final_mappings = self._extract_compound_mappings_with_figures(
-                full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+                full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
             )
             # Merge final mappings with better compound ID matching
@@ -2261,7 +2352,13 @@ Do NOT include compound information from other campaigns.
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
+            # Pass the IUPAC location hint if we have it
+            iupac_hint = locations.get("iupac_location") if locations else None
+            compound_mappings = self._extract_compound_mappings_adaptive(
+                compound_ids,
+                campaign_filter=self.campaign_filter,
+                iupac_location_hint=iupac_hint
+            )
             # Add the mapped IUPAC names to the context for better extraction
             if compound_mappings:
@@ -2404,6 +2501,34 @@ Different campaigns may use different model reactions and substrates.
                     LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
                                list(compound_mappings.keys()))
+                    # First, populate IUPAC lists directly from compound mappings based on compound_type
+                    substrate_iupacs_from_mappings = []
+                    product_iupacs_from_mappings = []
+                    for mapping in compound_mappings.values():
+                        if mapping.iupac_name and mapping.compound_type:
+                            if mapping.compound_type.lower() == "substrate":
+                                substrate_iupacs_from_mappings.append(mapping.iupac_name)
+                                LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
+                            elif mapping.compound_type.lower() == "product":
+                                product_iupacs_from_mappings.append(mapping.iupac_name)
+                                LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
+                    # Initialize or update the IUPAC lists with mapped compounds
+                    if substrate_iupacs_from_mappings:
+                        existing_substrates = data.get("substrate_iupac_list", []) or []
+                        if isinstance(existing_substrates, list):
+                            data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
+                        else:
+                            data["substrate_iupac_list"] = substrate_iupacs_from_mappings
+                    if product_iupacs_from_mappings:
+                        existing_products = data.get("product_iupac_list", []) or []
+                        if isinstance(existing_products, list):
+                            data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
+                        else:
+                            data["product_iupac_list"] = product_iupacs_from_mappings
                     # Try to map substrate/product lists through compound IDs
                     substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                     if isinstance(substrate_list, list):
@@ -3021,7 +3146,14 @@ def main() -> None:
                 campaign_filter = all_campaigns[0]
                 LOGGER.info("Detected single campaign: %s", campaign_filter)
-                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                # Create campaign-specific debug directory even for single campaign
+                campaign_debug_dir = None
+                if args.debug_dir:
+                    campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign_filter}"
+                    campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                    LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                             campaign_filter=campaign_filter, all_campaigns=all_campaigns)
                 df_metrics = extractor.run(enzyme_df)
@@ -3041,8 +3173,14 @@ def main() -> None:
                         LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
                         continue
-                    # Create extractor for this campaign
-                    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                    # Create extractor for this campaign with campaign-specific debug directory
+                    campaign_debug_dir = None
+                    if args.debug_dir:
+                        campaign_debug_dir = Path(args.debug_dir) / f"campaign_{campaign}"
+                        campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                        LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
+                    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                                 campaign_filter=campaign, all_campaigns=all_campaigns)
                     # Run extraction for this campaign
@@ -3088,7 +3226,13 @@ def main() -> None:
                     df_metrics = pd.DataFrame()
         else:
             # No campaign information, process all enzymes together
-            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+            campaign_debug_dir = None
+            if args.debug_dir:
+                campaign_debug_dir = Path(args.debug_dir) / "no_campaign"
+                campaign_debug_dir.mkdir(parents=True, exist_ok=True)
+                LOGGER.info("Debug directory (no campaign): %s", campaign_debug_dir)
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
                                         campaign_filter=campaign_filter, all_campaigns=all_campaigns)
             df_metrics = extractor.run(enzyme_df)

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -28,6 +28,7 @@ import re
 import json
 import time
 import logging
+import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import List, Optional, Dict, Any, Union
@@ -103,6 +104,52 @@ class CompoundMapping:
     compound_type: str = "unknown"
     source_location: Optional[str] = None
+def is_valid_iupac_name_with_opsin(name: str) -> bool:
+    """Check if a name is a valid IUPAC name using the local OPSIN command."""
+    if not name or len(name.strip()) < 3:
+        return False
+    try:
+        # Use local OPSIN command to check if name can be converted to SMILES
+        process = subprocess.run(
+            ['opsin', '-o', 'smi'],
+            input=name.strip(),
+            text=True,
+            capture_output=True,
+            timeout=30
+        )
+        # If OPSIN successfully converts to SMILES, the name is valid IUPAC
+        if process.returncode == 0 and process.stdout.strip():
+            output = process.stdout.strip()
+            # Check if output looks like a valid SMILES (contains common SMILES characters)
+            if any(char in output for char in 'CNOS()=[]#+-'):
+                return True
+        return False
+    except Exception as e:
+        log.debug(f"OPSIN check failed for '{name}': {e}")
+        return False
+def _get_iupac_name(compound) -> str:
+    """Get IUPAC name for a compound, checking if the common name is already IUPAC."""
+    if not compound:
+        return ''
+    # If we already have an IUPAC name, use it
+    if compound.iupac_name:
+        return compound.iupac_name
+    # If no IUPAC name but we have a common name, check if it's already IUPAC
+    if compound.name:
+        # Check with OPSIN if the name is a valid IUPAC name
+        if is_valid_iupac_name_with_opsin(compound.name):
+            log.info(f"'{compound.name}' is already a valid IUPAC name, using it directly")
+            return compound.name
+    return ''
 # === 3. LOGGING HELPERS ===
 # --- Debug dump helper ----------------------------------------------------
@@ -2496,7 +2543,8 @@ def merge_with_lineage(
                     data = lineage_map[matched_name]
                     entry.parent_id = data['parent_id']
                     entry.mutations = data['mutations']
-                    entry.generation = data['generation']
+                    # Skip generation - to be filled by lineage_format
+                    # entry.generation = data['generation']
                     entry.aa_seq = data['aa_seq']
                     entry.dna_seq = data['dna_seq']
                     entry.confidence = data['confidence']
@@ -2524,7 +2572,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'enzyme_id': entry.enzyme_id,
             'parent_enzyme_id': entry.parent_id or '',
             'mutations': entry.mutations or '',
-            'generation': entry.generation if entry.generation is not None else '',
+            'generation': '',  # Empty generation - to be filled by lineage_format
             'campaign_id': entry.campaign_id or '',
             'protein_sequence': entry.aa_seq or '',
             'nucleotide_sequence': entry.dna_seq or '',
@@ -2532,9 +2580,9 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'flag': '',
             'substrate_list': '; '.join(s.name for s in entry.substrates if s.name),
-            'substrate_iupac_list': '; '.join(s.iupac_name or '' for s in entry.substrates),
+            'substrate_iupac_list': '; '.join(_get_iupac_name(s) for s in entry.substrates),
             'product_list': '; '.join(p.name for p in entry.products if p.name),
-            'product_iupac_list': '; '.join(p.iupac_name or '' for p in entry.products),
+            'product_iupac_list': '; '.join(_get_iupac_name(p) for p in entry.products),
             'cofactor_list': '; '.join(c.name for c in entry.cofactors if c.name),
             'cofactor_iupac_list': '; '.join(c.iupac_name or '' for c in entry.cofactors),

debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl