debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +623 -234
- debase/lineage_format.py +113 -11
- debase/reaction_info_extractor.py +21 -7
- debase/substrate_scope_extractor.py +516 -67
- debase/wrapper.py +301 -67
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA +1 -1
- debase-0.1.17.dist-info/RECORD +17 -0
- debase-0.1.11.dist-info/RECORD +0 -17
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -188,11 +188,17 @@ class VariantRecord:
     # Reaction-related -------------------------------------------------------------
     def substrate_iupac(self) -> List[str]:
         raw = str(self.row.get("substrate_iupac_list", "")).strip()
-        return _split_list(raw)
+        result = _split_list(raw)
+        if not result and raw and raw.lower() != 'nan':
+            log.debug(f"substrate_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+        return result
 
     def product_iupac(self) -> List[str]:
         raw = str(self.row.get("product_iupac_list", "")).strip()
-        return _split_list(raw)
+        result = _split_list(raw)
+        if not result and raw and raw.lower() != 'nan':
+            log.debug(f"product_iupac_list for {self.eid}: raw='{raw}', parsed={result}")
+        return result
 
 
     def ttn_or_yield(self) -> Optional[float]:
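The rewritten accessors now leave a debug trace whenever a non-empty raw value parses to an empty list. A minimal standalone sketch of that pattern, with a toy row, an assumed empty `_split_list` result, and `print` standing in for `log.debug`:

row = {"substrate_iupac_list": "; ;"}  # toy value, assumed unparseable
raw = str(row.get("substrate_iupac_list", "")).strip()
result = []  # what _split_list is assumed to return for this input
if not result and raw and raw.lower() != 'nan':
    print(f"substrate_iupac_list: raw='{raw}', parsed={result}")  # stands in for log.debug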
@@ -377,6 +383,53 @@ def _nt_mut(parent_aa: str, child_aa: str, parent_nt: str = "", child_nt: str =
 
 # === 6. SMILES CONVERSION HELPERS ==================================================
 
+def search_smiles_with_gemini(compound_name: str, model=None) -> Optional[str]:
+    """
+    Use Gemini to search for SMILES strings of complex compounds.
+    Returns SMILES string if found, None otherwise.
+    """
+    if not compound_name or compound_name.lower() in ['nan', 'none', '']:
+        return None
+
+    if not model:
+        try:
+            # Import get_model from enzyme_lineage_extractor
+            import sys
+            from pathlib import Path
+            sys.path.append(str(Path(__file__).parent))
+            from enzyme_lineage_extractor import get_model
+            model = get_model()
+        except Exception as e:
+            log.warning(f"Could not load Gemini model: {e}")
+            return None
+
+    prompt = f"""Search for the SMILES string representation of this chemical compound:
+"{compound_name}"
+
+IMPORTANT:
+- Do NOT generate or create a SMILES string
+- Only provide SMILES that you can find in chemical databases or literature
+- For deuterated compounds, search for the specific isotope-labeled SMILES
+- If you cannot find the exact SMILES, say "NOT FOUND"
+
+Return ONLY the SMILES string if found, or "NOT FOUND" if not found.
+No explanation or additional text."""
+
+    try:
+        response = model.generate_content(prompt)
+        result = response.text.strip()
+
+        if result and result != "NOT FOUND" and not result.startswith("I"):
+            # Basic validation that it looks like SMILES
+            if any(c in result for c in ['C', 'c', 'N', 'O', 'S', 'P', '[', ']', '(', ')']):
+                log.info(f"Gemini found SMILES for '{compound_name}': {result}")
+                return result
+        return None
+    except Exception as e:
+        log.debug(f"Gemini SMILES search failed for '{compound_name}': {e}")
+        return None
+
+
 def _split_list(raw: str) -> List[str]:
     if not raw or str(raw).lower() == 'nan':
         return []
@@ -429,7 +482,12 @@ def _name_to_smiles(name: str, is_substrate: bool) -> str:
     except FileNotFoundError:
         pass  # OPSIN not installed
 
-    # 3. PubChem PUG REST (online) ---------------------------------------------
+    # 3. Gemini search (for complex compounds) ---------------------------------
+    gemini_smiles = search_smiles_with_gemini(name)
+    if gemini_smiles:
+        return gemini_smiles
+
+    # 4. PubChem PUG REST (online) ---------------------------------------------
     try:
         import requests
 
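With this hunk the name-to-SMILES fallback chain gains a Gemini lookup between OPSIN and PubChem. A hedged usage sketch of the new helper, reusing one model across lookups instead of letting each call re-import it (the compound names are invented examples, and `get_model()` assumes Gemini credentials are already configured):

from debase.enzyme_lineage_extractor import get_model
from debase.lineage_format import search_smiles_with_gemini

model = get_model()  # load Gemini once and share it across lookups
for name in ["2,2,2-trifluoroethyl acetate", "toluene-d8"]:  # invented examples
    smiles = search_smiles_with_gemini(name, model=model)
    print(name, "->", smiles or "unresolved")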
@@ -538,13 +596,23 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
 
 def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
     """Infer lineage roots using generation numbers and simple sequence similarity."""
-    idmap = {str(r["enzyme_id"]): r for _, r in df.iterrows()}
+    # Create idmap, handling missing enzyme_id gracefully
+    idmap: Dict[str, Dict[str, str]] = {}
+    for _, r in df.iterrows():
+        eid = r.get("enzyme_id")
+        if pd.isna(eid) or str(eid).strip() == "":
+            continue
+        idmap[str(eid)] = r
     roots: Dict[str, str] = {}
     # Look for generation 0 as the root
-    gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "0"}
+    gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+            if str(r.get("generation", "")).strip() == "0"
+            and not pd.isna(r.get("enzyme_id"))}
     # If no gen0 found, fall back to gen1
     if not gen0:
-        gen0 = {r["enzyme_id"] for _, r in df.iterrows() if str(r.get("generation", "")).strip() == "1"}
+        gen0 = {r["enzyme_id"] for _, r in df.iterrows()
+                if str(r.get("generation", "")).strip() == "1"
+                and not pd.isna(r.get("enzyme_id"))}
 
     def _seq_sim(a: str, b: str) -> float:
         if not a or not b:
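For reference, `_seq_sim` (its tail is visible in the context lines) scores identity against the longer of the two sequences. A worked sketch, assuming the match count is positionwise over zipped characters:

def _seq_sim(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    matches = sum(x == y for x, y in zip(a, b))  # assumed positionwise match count
    return matches / max(len(a), len(b))

print(_seq_sim("MKLV", "MKIV"))  # 3 of 4 positions match -> 0.75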
@@ -553,7 +621,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
         return matches / max(len(a), len(b))
 
     for _, row in df.iterrows():
-        eid = row["enzyme_id"]
+        eid = row.get("enzyme_id")
+        if pd.isna(eid) or str(eid).strip() == "":
+            continue
         if eid in gen0:
             roots[eid] = eid
             continue
@@ -593,6 +663,9 @@ def _generate_lineage_roots(df: pd.DataFrame) -> Dict[str, str]:
 
 def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Main public API: returns a DataFrame in the flat output format."""
+    log.info(f"Starting flatten_dataframe with {len(df)} input rows")
+    log.info(f"Input columns: {list(df.columns)}")
+
     # Apply column aliases to the dataframe
     for alias, canonical in COLUMN_ALIASES.items():
         if alias in df.columns and canonical not in df.columns:
@@ -621,8 +694,29 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
 
     # 3. Flatten rows ---------------------------------------------------------
-    idmap = {str(r["enzyme_id"]): r.to_dict() for _, r in df.iterrows()}
+    # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
+    idmap = {}
+    for _, r in df.iterrows():
+        eid = str(r["enzyme_id"])
+        if eid in idmap:
+            log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
+        idmap[eid] = r.to_dict()
+
+    # Check for duplicate enzyme_ids
+    enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
+    unique_ids = set(enzyme_ids)
+    if len(enzyme_ids) != len(unique_ids):
+        log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
+        from collections import Counter
+        id_counts = Counter(enzyme_ids)
+        duplicates = {k: v for k, v in id_counts.items() if v > 1}
+        log.warning(f"Duplicate enzyme_ids: {duplicates}")
+        log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
+
     output_rows: List[Dict[str, str]] = []
+    skipped_count = 0
+    processed_count = 0
+
     for idx, (_, row) in enumerate(df.iterrows()):
         rec = VariantRecord(row.to_dict())
         eid = rec.eid
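A self-contained illustration (invented rows) of the "last occurrence wins" behavior the new logging warns about: keying a dict on enzyme_id silently shadows earlier duplicate rows in parent lookups.

rows = [{"enzyme_id": "VAR-1", "generation": "1"},
        {"enzyme_id": "VAR-1", "generation": "2"}]  # duplicate id
idmap = {}
for r in rows:
    idmap[r["enzyme_id"]] = r  # the second assignment overwrites the first
print(idmap["VAR-1"]["generation"])  # "2" -- only the last row survives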
@@ -632,13 +726,19 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         prods = rec.product_iupac()
         data_type = rec.row.get("data_type", "")
 
-        if not subs or not prods:
-            # Skip entries without IUPAC names
+        if not prods:
+            # Skip entries without product info unless it's marked as lineage only
             if data_type == "lineage":
                 subs, prods = [""], [""]  # placeholders
             else:
-                log.debug(f"Skipping {eid}: missing substrate/product data")
+                log.info(f"Skipping enzyme_id={eid} (row {idx}) due to missing product data. prods={prods}, data_type={data_type}")
+                skipped_count += 1
                 continue
+
+        # If no substrates but we have products, use empty substrate list
+        if not subs:
+            log.debug(f"Empty substrate list for enzyme_id={eid}, using empty placeholder")
+            subs = [""]
 
         sub_smiles = [sub_cache.get(s, "") for s in subs]
         prod_smiles = [prod_cache.get(p, "") for p in prods]
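A self-contained sketch (toy records) of the reworked skip logic: rows with no products are kept with placeholders only when flagged as lineage-only, and a missing substrate list no longer forces a skip.

records = [
    {"eid": "A", "subs": ["styrene"], "prods": ["styrene oxide"], "data_type": ""},
    {"eid": "B", "subs": [], "prods": [], "data_type": "lineage"},  # kept via placeholders
    {"eid": "C", "subs": [], "prods": [], "data_type": ""},         # skipped
]
kept, skipped = [], 0
for r in records:
    subs, prods = r["subs"], r["prods"]
    if not prods:
        if r["data_type"] == "lineage":
            subs, prods = [""], [""]  # placeholders, as in the hunk above
        else:
            skipped += 1
            continue
    if not subs:
        subs = [""]  # empty-substrate placeholder
    kept.append(r["eid"])
print(kept, skipped)  # ['A', 'B'] 1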
@@ -712,7 +812,9 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
             additional_information=additional_information,
         )
         output_rows.append(flat.as_dict())
+        processed_count += 1
 
+    log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
     return out_df
 
debase/reaction_info_extractor.py
CHANGED
@@ -761,6 +761,15 @@ Ignore locations that contain data for other campaigns.
                 return line
         return page[:800]
 
+    def _ensure_rgb_pixmap(self, pix: fitz.Pixmap) -> fitz.Pixmap:
+        """Ensure pixmap is in RGB colorspace for PIL compatibility."""
+        if pix.alpha:  # RGBA -> RGB
+            pix = fitz.Pixmap(fitz.csRGB, pix)
+        elif pix.colorspace and pix.colorspace.name not in ["DeviceRGB", "DeviceGray"]:
+            # Convert unsupported colorspaces (CMYK, LAB, etc.) to RGB
+            pix = fitz.Pixmap(fitz.csRGB, pix)
+        return pix
+
     # ---- NEW: Page image helper for both figures and tables ----
     def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
         """Export the page containing the reference as PNG.
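A standalone sketch of the same normalization with public PyMuPDF APIs, outside the class ("paper.pdf" is a placeholder path):

import io
import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("paper.pdf")  # placeholder path
pix = doc.load_page(0).get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
if pix.alpha or (pix.colorspace and pix.colorspace.name not in ["DeviceRGB", "DeviceGray"]):
    pix = fitz.Pixmap(fitz.csRGB, pix)  # normalize to an RGB-based pixmap
img = Image.open(io.BytesIO(pix.tobytes("png")))  # now loads cleanly in PIL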
@@ -802,14 +811,14 @@ Ignore locations that contain data for other campaigns.
             if img_rect.y1 < cap_rect.y0:  # fully above caption
                 # Extract image bytes
                 pix = fitz.Pixmap(doc, xref)
-
-                pix = fitz.Pixmap(fitz.csRGB, pix)
+                pix = self._ensure_rgb_pixmap(pix)
                 img_bytes = pix.tobytes("png")
                 return b64encode(img_bytes).decode()
         else:
             # Extract the entire page as an image
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
             pix = page.get_pixmap(matrix=mat)
+            pix = self._ensure_rgb_pixmap(pix)
             img_bytes = pix.tobytes("png")
             return b64encode(img_bytes).decode()
         return None
@@ -842,11 +851,13 @@ Ignore locations that contain data for other campaigns.
             # Add the current page
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
             pix = doc.load_page(page_num).get_pixmap(matrix=mat)
+            pix = self._ensure_rgb_pixmap(pix)
            all_images.append(pix)
 
             # If this is the last page with the reference, also add the next page
             if i == len(pages) - 1 and page_num + 1 < doc.page_count:
                 next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
+                next_pix = self._ensure_rgb_pixmap(next_pix)
                 all_images.append(next_pix)
                 LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
 
@@ -855,14 +866,16 @@ Ignore locations that contain data for other campaigns.
 
         # If only one page, return it directly
         if len(all_images) == 1:
-            return b64encode(all_images[0].tobytes("png")).decode()
+            pix = self._ensure_rgb_pixmap(all_images[0])
+            return b64encode(pix.tobytes("png")).decode()
 
         # Combine multiple pages vertically
         if not all_images:
             return None
 
         if len(all_images) == 1:
-            return b64encode(all_images[0].tobytes("png")).decode()
+            pix = self._ensure_rgb_pixmap(all_images[0])
+            return b64encode(pix.tobytes("png")).decode()
 
         # Calculate dimensions for combined image
         total_height = sum(pix.height for pix in all_images)
@@ -903,6 +916,7 @@ Ignore locations that contain data for other campaigns.
             # Convert the page to a pixmap
             mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
             combined_pix = page.get_pixmap(matrix=mat)
+            combined_pix = self._ensure_rgb_pixmap(combined_pix)
 
             # Convert to PNG and return
             img_bytes = combined_pix.tobytes("png")
@@ -2025,9 +2039,9 @@ TEXT FROM MANUSCRIPT:
         filtered = []
         for loc in locations:
             # Check caption and clues for campaign indicators
-            caption = loc.get('caption', '').lower()
-            campaign_clues = loc.get('campaign_clues', '').lower()
-            lineage_hint = loc.get('lineage_hint', '').lower()
+            caption = (loc.get('caption') or '').lower()
+            campaign_clues = (loc.get('campaign_clues') or '').lower()
+            lineage_hint = (loc.get('lineage_hint') or '').lower()
             combined_text = caption + ' ' + campaign_clues + ' ' + lineage_hint
 
             # Check if location is relevant to this campaign
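The guard matters because `dict.get(key, default)` only falls back when the key is absent; a stored None is returned as-is, so the old code's `.lower()` could raise. A minimal demonstration:

loc = {"caption": None}  # key present, value None
print(loc.get("caption", ""))    # None -- the default is ignored
print(loc.get("caption") or "")  # '' -- safe to call .lower() on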
|