PyPI - debase - Versions diffs - 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl - Mend

debase 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

debase/_version.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.4.2"
+__version__ = "0.4.3"

debase/cleanup_sequence.py CHANGED Viewed

@@ -11,6 +11,7 @@ Usage:
 import argparse
 import logging
+import os
 import re
 import sys
 from dataclasses import dataclass, field
@@ -19,11 +20,20 @@ from typing import Dict, List, Optional, Set, Tuple, Union
 import pandas as pd
+try:
+    import google.generativeai as genai  # type: ignore
+    GEMINI_OK = True
+except ImportError:  # pragma: no cover
+    GEMINI_OK = False
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
+# Gemini API configuration
+GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
 # Configure module logger
 log = logging.getLogger(__name__)
@@ -565,7 +575,136 @@ class SequenceGenerator:
         return None
-# === 7. MAIN PROCESSOR === ---------------------------------------------------
+# === 7. GEMINI PARENT IDENTIFICATION === ------------------------------------
+def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
+    """Use Gemini API to identify parent enzymes for entries with missing parent information."""
+    if not GEMINI_OK:
+        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
+        return df
+    if not GEMINI_API_KEY:
+        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
+        return df
+    try:
+        genai.configure(api_key=GEMINI_API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+    except Exception as e:
+        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
+        return df
+    # Find entries with empty sequences but missing parent information
+    entries_needing_parents = []
+    for idx, row in df.iterrows():
+        protein_seq = str(row.get("protein_sequence", "")).strip()
+        parent_id = str(row.get("parent_enzyme_id", "")).strip()
+        # Only process entries that have empty sequences AND no parent info
+        if (not protein_seq or protein_seq == "nan") and (not parent_id or parent_id == "nan"):
+            enzyme_id = str(row.get("enzyme_id", ""))
+            campaign_id = str(row.get("campaign_id", ""))
+            generation = str(row.get("generation", ""))
+            entries_needing_parents.append({
+                "idx": idx,
+                "enzyme_id": enzyme_id,
+                "campaign_id": campaign_id,
+                "generation": generation
+            })
+    if not entries_needing_parents:
+        log.info("No entries need parent identification from Gemini")
+        return df
+    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
+    # Create a lookup of all available enzyme IDs for context
+    available_enzymes = {}
+    for idx, row in df.iterrows():
+        enzyme_id = str(row.get("enzyme_id", ""))
+        campaign_id = str(row.get("campaign_id", ""))
+        protein_seq = str(row.get("protein_sequence", "")).strip()
+        generation = str(row.get("generation", ""))
+        if enzyme_id and enzyme_id != "nan":
+            available_enzymes[enzyme_id] = {
+                "campaign_id": campaign_id,
+                "has_sequence": bool(protein_seq and protein_seq != "nan"),
+                "generation": generation
+            }
+    identified_count = 0
+    for entry in entries_needing_parents:
+        enzyme_id = entry["enzyme_id"]
+        campaign_id = entry["campaign_id"]
+        generation = entry["generation"]
+        # Create context for Gemini
+        context_info = []
+        context_info.append(f"Enzyme ID: {enzyme_id}")
+        context_info.append(f"Campaign ID: {campaign_id}")
+        if generation:
+            context_info.append(f"Generation: {generation}")
+        # Add available enzymes from the same campaign for context
+        campaign_enzymes = []
+        for enz_id, enz_data in available_enzymes.items():
+            if enz_data["campaign_id"] == campaign_id:
+                status = "with sequence" if enz_data["has_sequence"] else "without sequence"
+                gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
+                campaign_enzymes.append(f"  - {enz_id} {status} {gen_info}")
+        if campaign_enzymes:
+            context_info.append("Available enzymes in same campaign:")
+            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context
+        context_text = "\n".join(context_info)
+        prompt = f"""
+Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
+{context_text}
+This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
+Please provide your response in this format:
+Parent: [parent_enzyme_id or "Unknown"]
+If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
+"""
+        try:
+            response = model.generate_content(prompt)
+            response_text = response.text.strip()
+            # Parse the response
+            parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
+            if parent_match:
+                parent = parent_match.group(1).strip()
+                if parent and parent != "Unknown" and parent != "No parent identified":
+                    # Verify the parent exists in our available enzymes
+                    if parent in available_enzymes:
+                        df.at[entry["idx"], "parent_enzyme_id"] = parent
+                        identified_count += 1
+                        log.info(f"Identified parent for {enzyme_id}: {parent}")
+                    else:
+                        log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
+        except Exception as e:
+            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
+            continue
+    if identified_count > 0:
+        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
+    else:
+        log.info("No parent enzymes were identified using Gemini API")
+    return df
+# === 8. MAIN PROCESSOR === ---------------------------------------------------
 class SequenceProcessor:
     """Main processor for handling the complete workflow."""
@@ -866,6 +1005,17 @@ class SequenceProcessor:
             self.process_remaining()
             self.backward_pass()
+            # Use Gemini to identify parent enzymes for entries with missing sequences
+            log.info(f"Identifying parents with Gemini for campaign: {campaign_id}")
+            self.df = identify_parents_with_gemini(self.df)
+            # Rebuild relationships after parent identification
+            self.generator = SequenceGenerator(self.df)
+            # Try to fill sequences again after parent identification
+            log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
+            self.process_remaining()
             # Update the original dataframe with results
             original_df.loc[campaign_mask, :] = self.df

debase/enzyme_lineage_extractor.py CHANGED Viewed

@@ -142,21 +142,36 @@ def extract_text(pdf_path: str | Path | bytes) -> str:
 def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
-    """Extract figure/table captions using the improved regex.
+    """Extract ALL figure/table captions with extensive surrounding context.
     The function scans every text line on every page and keeps lines whose first
     token matches `_CAPTION_PREFIX_RE`. This covers labels such as:
-      * Fig. 1, Figure 2A, Extended Data Fig 3
+      * Fig. 1, Figure 2A, Figure 2B, Figure 2C (ALL sub-captions)
       * Table S1, Table 4, Scheme 2, Chart 1B
-      * Supplementary Fig. S5, Supp Table 2
+      * Supplementary Fig. S5A, S5B, S5C (ALL variations)
+    For SI documents, includes extensive context since understanding what each
+    section contains is crucial for accurate location identification.
     """
     doc = _open_doc(pdf_path)
     captions: list[str] = []
     try:
-        for page in doc:
+        for page_num, page in enumerate(doc):
             page_dict = page.get_text("dict")
+            # Get all text blocks on this page for broader context
+            page_text_blocks = []
             for block in page_dict.get("blocks", []):
+                block_text = ""
+                for line in block.get("lines", []):
+                    text_line = "".join(span["text"] for span in line.get("spans", []))
+                    if text_line.strip():
+                        block_text += text_line.strip() + " "
+                if block_text.strip():
+                    page_text_blocks.append(block_text.strip())
+            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                 # Get all lines in this block
                 block_lines = []
                 for line in block.get("lines", []):
@@ -166,21 +181,94 @@ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -
                 # Check if any line starts with a caption prefix
                 for i, line in enumerate(block_lines):
                     if _CAPTION_PREFIX_RE.match(line):
-                        # Found a caption start - collect this line and subsequent lines
-                        # until we hit an empty line or the end of the block
+                        context_parts = []
+                        # Add page context for SI documents (more critical there)
+                        context_parts.append(f"Page {page_num + 1}")
+                        # Add extensive context before the caption (5-7 lines for SI context)
+                        context_before = []
+                        # First try to get context from current block
+                        for k in range(max(0, i-7), i):
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_before.append(block_lines[k])
+                        # If not enough context, look at previous text blocks on the page
+                        if len(context_before) < 3 and block_idx > 0:
+                            prev_block_text = page_text_blocks[block_idx - 1] if block_idx < len(page_text_blocks) else ""
+                            if prev_block_text:
+                                # Get last few sentences from previous block
+                                sentences = prev_block_text.split('. ')
+                                context_before = sentences[-2:] + context_before if len(sentences) > 1 else [prev_block_text] + context_before
+                        if context_before:
+                            # Include more extensive context for better understanding
+                            context_text = " ".join(context_before[-5:])  # Last 5 lines/sentences of context
+                            context_parts.append("Context: " + context_text)
+                        # Extract the COMPLETE caption including all sub-parts
                         caption_parts = [line]
-                        for j in range(i + 1, len(block_lines)):
+                        j = i + 1
+                        # Continue collecting caption text until we hit a clear break
+                        while j < len(block_lines):
                             next_line = block_lines[j]
-                            if not next_line:  # Empty line signals end of caption
-                                break
-                            # Check if next line is a new caption
+                            # Stop if we hit an empty line followed by non-caption text
+                            if not next_line:
+                                # Check if the line after empty is a new caption
+                                if j + 1 < len(block_lines) and _CAPTION_PREFIX_RE.match(block_lines[j + 1]):
+                                    break
+                                # If next non-empty line is not a caption, continue collecting
+                                elif j + 1 < len(block_lines):
+                                    j += 1
+                                    continue
+                                else:
+                                    break
+                            # Stop if we hit a new caption
                             if _CAPTION_PREFIX_RE.match(next_line):
                                 break
+                            # Include this line as part of the caption
                             caption_parts.append(next_line)
+                            j += 1
-                        # Join the parts with spaces
+                        # Join the caption parts
                         full_caption = " ".join(caption_parts)
-                        captions.append(full_caption)
+                        context_parts.append("Caption: " + full_caption)
+                        # Add extensive context after the caption (especially important for SI)
+                        context_after = []
+                        # Look for descriptive text following the caption
+                        for k in range(j, min(len(block_lines), j + 10)):  # Look ahead up to 10 lines
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_after.append(block_lines[k])
+                        # If not enough context, look at next text blocks
+                        if len(context_after) < 3 and block_idx + 1 < len(page_text_blocks):
+                            next_block_text = page_text_blocks[block_idx + 1]
+                            if next_block_text:
+                                # Get first few sentences from next block
+                                sentences = next_block_text.split('. ')
+                                context_after.extend(sentences[:3] if len(sentences) > 1 else [next_block_text])
+                        if context_after:
+                            # Include extensive following context
+                            following_text = " ".join(context_after[:7])  # First 7 lines of following context
+                            context_parts.append("Following: " + following_text)
+                        # For SI documents, add section context if this appears to be a section header
+                        if any(keyword in full_caption.lower() for keyword in ['supplementary', 'supporting', 'si ', 's1', 's2', 's3']):
+                            context_parts.append("SI_SECTION: This appears to be supplementary material content")
+                        # Combine all parts with proper separation
+                        full_caption_with_context = " | ".join(context_parts)
+                        captions.append(full_caption_with_context)
     finally:
         doc.close()

{debase-0.4.2.dist-info → debase-0.4.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.2
+Version: 0.4.3
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

{debase-0.4.2.dist-info → debase-0.4.3.dist-info}/RECORD RENAMED Viewed

@@ -1,16 +1,16 @@
 debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
 debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=WDlHvBix-yPlJxEgBTyb1Z8IXZmUbhOOQSbpA2mamvU,49
+debase/_version.py,sha256=r0b4fvQcrrvOScFMddjVgWAGNt17iQxCJH2xYW06jio,49
 debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=ngxb_tPekjCWvampAjyuFqK4wLk_meFSj_TwfvOxheQ,33978
-debase/enzyme_lineage_extractor.py,sha256=Gcb-AzmAs5uTR65ncQ83Ds2JrK3cuzrPyS70OyLniMo,124278
+debase/cleanup_sequence.py,sha256=4qZrSXInyJKEJqcgcONp4IX24ALEj5lf7E0XaOZVxZ0,40329
+debase/enzyme_lineage_extractor.py,sha256=tFyrcWkNKKr8T9xq0tIXUDNfcX0tbdWGrLhgo5m7lmA,129804
 debase/lineage_format.py,sha256=Q6kpqKPUxJsMYpb0Yt8IbVlp6VDYX2vkITuGhT9MEbw,47056
 debase/reaction_info_extractor.py,sha256=q-iHgfVLXP4r2Se8yA9I0AvtnAhHBltTztrXspl3EKU,151949
 debase/substrate_scope_extractor.py,sha256=eaVimhxmmaRj-9dRN6RKK4yStCmZAuX8xBaarsIsmUo,114212
 debase/wrapper.py,sha256=r0xxoiBvmMIktiGPOD4w9hne8m0SLzZ03WeWnBuDW0A,25236
-debase-0.4.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.4.2.dist-info/METADATA,sha256=__XUoQUlZPLGT3TXgb6EBfuUdJ07-g5gBQBOiEiGXLA,10789
-debase-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.4.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.4.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.4.2.dist-info/RECORD,,
+debase-0.4.3.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.4.3.dist-info/METADATA,sha256=UT8ymX3oothXvgA9ayr74_Bd-St7I0Pj7CoEg8LlKg8,10789
+debase-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.4.3.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.4.3.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.4.3.dist-info/RECORD,,

{debase-0.4.2.dist-info → debase-0.4.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.4.2.dist-info → debase-0.4.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.4.2.dist-info → debase-0.4.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.4.2.dist-info → debase-0.4.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

debase 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl