PyPI - debase - Versions diffs - 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl - Mend

debase 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

debase/_version.py +1 -1
debase/enzyme_lineage_extractor.py +11 -6
debase/lineage_format.py +22 -18
debase/reaction_info_extractor.py +45 -11
debase/substrate_scope_extractor.py +3 -2
{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/METADATA +1 -1
debase-0.4.5.dist-info/RECORD +16 -0
debase-0.4.4.dist-info/RECORD +0 -16
{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0

debase/_version.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """Version information."""
-__version__ = "0.4.4"
+__version__ = "0.4.5"

debase/enzyme_lineage_extractor.py CHANGED Viewed

@@ -2943,12 +2943,15 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                     validate_sequences=True
                 )
-            if focused_text and len(focused_text) < len(text):
-                log.info("Reduced text from %d to %d chars using validated location",
-                         len(text), len(focused_text))
-            else:
-                log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
-                           len(focused_text) if focused_text else 0, len(text))
+            # Use focused text if we got any content, regardless of size
+            if focused_text:
+                if len(focused_text) < len(text):
+                    log.info("Reduced text from %d to %d chars using validated location",
+                             len(text), len(focused_text))
+                else:
+                    log.info("Extracted focused text (%d chars) from validated location (full text: %d chars)",
+                             len(focused_text), len(text))
                 # Build lineage context if available
                 lineage_context = None
                 if lineage_variants:
@@ -2961,6 +2964,8 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                     lineage_context = "\n".join(variant_info)
                 return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
+            else:
+                log.warning("Failed to extract focused text from validated location, will use full text")
         else:
             log.warning("Location validation failed or returned invalid location: %s",
                        validation.get("reason", "Unknown"))

debase/lineage_format.py CHANGED Viewed

@@ -1183,15 +1183,33 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         if generation != "0":
             for cid, cmap in campaign_idmap.items():
                 if cid == campaign_id:
+                    # First try to find generation 0
                     for enzyme_id, enzyme_row in cmap.items():
                         enzyme_gen = str(enzyme_row.get("generation", "")).strip()
                         if enzyme_gen == "0" or enzyme_gen == "0.0":
                             reference_row = enzyme_row
                             log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
                             break
+                    # If no generation 0 found, find the earliest generation
+                    if not reference_row:
+                        earliest_gen = float('inf')
+                        earliest_enzyme = None
+                        for enzyme_id, enzyme_row in cmap.items():
+                            try:
+                                enzyme_gen = float(str(enzyme_row.get("generation", "")).strip())
+                                if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
+                                    earliest_gen = enzyme_gen
+                                    earliest_enzyme = enzyme_id
+                                    reference_row = enzyme_row
+                            except (ValueError, AttributeError):
+                                continue
+                        if reference_row:
+                            log.info(f"No generation 0 found in campaign {campaign_id}, using generation {earliest_gen} enzyme {earliest_enzyme} as reference for {eid}")
+                        else:
+                            log.warning(f"No suitable reference enzyme found in campaign {campaign_id} for {eid}")
                     break
-            if not reference_row:
-                log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
         reference_aa = ""
         reference_nt = ""
@@ -1332,24 +1350,10 @@ def run_pipeline(reaction_csv: str | Path | None = None,
     if not dfs:
         raise ValueError("At least one input CSV must be provided")
-    # Combine dataframes with deduplication
+    # Combine dataframes without deduplication
     if len(dfs) > 1:
         df_in = pd.concat(dfs, ignore_index=True)
-        log.info("Combined data: %d total entries (before deduplication)", len(df_in))
-        # Deduplicate based on unique combination of campaign, variant, fitness, and product
-        # Define the key columns that should be unique
-        unique_cols = ['campaign_id', 'enzyme_id', 'product_list']
-        # Check if we have these columns
-        available_cols = [col for col in unique_cols if col in df_in.columns]
-        if len(available_cols) >= 2:  # Need at least campaign_id and enzyme_id
-            # Keep the first occurrence of each unique combination
-            df_in = df_in.drop_duplicates(subset=available_cols, keep='first')
-            log.info("After deduplication on %s: %d entries", available_cols, len(df_in))
-        else:
-            log.warning("Could not deduplicate - missing required columns: %s", unique_cols)
+        log.info("Combined data: %d total entries", len(df_in))
     else:
         df_in = dfs[0]

debase/reaction_info_extractor.py CHANGED Viewed

@@ -730,8 +730,8 @@ Return as JSON:
 ###############################################################################
 class ReactionExtractor:
-    _FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
-    _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
+    _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
+    _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
                  campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
@@ -800,14 +800,24 @@ class ReactionExtractor:
                 context_start = context_start + last_period + 1
             # For tables, include much more content after the caption to show actual table data
-            # For figures, keep the original limit
-            is_table = match.group(1).lower() == 'table'
-            max_chars = 5000 if is_table else 3000
-            # Get up to max_chars or until double newline
-            caption_end = all_text.find("\n\n", caption_start)
-            if caption_end == -1 or caption_end - caption_start > max_chars:
+            # For figures, include more content to ensure complete captions
+            is_table = 'table' in match.group(1).lower()
+            max_chars = 8000 if is_table else 5000
+            # Get up to max_chars or until double newline (but ensure we get complete caption)
+            # First, try to find the end of the caption sentence
+            caption_end = caption_start
+            period_pos = all_text.find('. ', caption_start)
+            if period_pos != -1 and period_pos < caption_start + 1000:
+                # Include at least to the end of the caption sentence
+                caption_end = period_pos + 1
+            # Then extend to include more context or until double newline
+            double_newline_pos = all_text.find("\n\n", caption_end)
+            if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
                 caption_end = caption_start + max_chars
+            else:
+                caption_end = double_newline_pos
             # Include the context and full caption with table content
             full_caption = all_text[context_start:caption_end].strip()
@@ -1090,6 +1100,7 @@ class ReactionExtractor:
         If extract_figure_only=True, extracts just the figure above the caption.
         If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""
+        LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
         # Check cache first
         cache_key = f"{ref}_{extract_figure_only}"
@@ -1107,10 +1118,18 @@ class ReactionExtractor:
             return None
         # For figure extraction, search both documents for actual figure captions
-        for doc in filter(None, [self.ms_doc, self.si_doc]):
+        docs = list(filter(None, [self.ms_doc, self.si_doc]))
+        LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
+        for doc_idx, doc in enumerate(docs):
+            doc_name = "MS" if doc_idx == 0 else "SI"
+            LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+                LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
+                           page_number + 1, doc_name, len(page_text))
                 # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
                 # For subfigures like "Figure 1C", extract the main figure "Figure 1"
@@ -1158,6 +1177,14 @@ class ReactionExtractor:
                     if figure_mentions:
                         LOGGER.debug("Page %d has figure mentions but no caption match: %s",
                                    page_number, figure_mentions[:3])
+                    # For supplementary figures, also check for "supplementary" mentions
+                    if 'supplementary' in ref.lower():
+                        supp_mentions = [line.strip() for line in page_text.split('\n')
+                                       if 'supplementary' in line.lower() and 'figure' in line.lower()]
+                        if supp_mentions:
+                            LOGGER.warning("Found supplementary figure mentions on page %d but no caption match. First 3: %s",
+                                         page_number + 1, supp_mentions[:3])
                     continue
                 if extract_figure_only:
@@ -1215,6 +1242,8 @@ class ReactionExtractor:
                     # Cache the result
                     self._figure_cache.put(cache_key, result)
                     return result
+        LOGGER.warning("_extract_page_png returning None for '%s' - figure not found in any document", ref)
         return None
     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
@@ -1445,6 +1474,7 @@ class ReactionExtractor:
     def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
         """Extract performance metrics for multiple enzymes from the identified location in batch."""
+        LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
         ref_lc = ref.lower()
         image_b64: Optional[str] = None
@@ -1466,11 +1496,15 @@ class ReactionExtractor:
                 snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
             # For figures, extract just the figure image (same logic as compound mapping)
+            LOGGER.debug("Attempting to extract figure image for '%s'", ref)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
-                LOGGER.debug("No figure image found for %s - using caption text", ref)
+                LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+                LOGGER.debug("Caption extraction result: %s",
+                           f"'{snippet[:100]}...'" if snippet else "empty")
             else:
+                LOGGER.info("Successfully extracted figure image for '%s'", ref)
                 # If figure is found, ignore text information - use image only
                 snippet = ""
         else:

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -2543,7 +2543,8 @@ def merge_with_lineage(
                     data = lineage_map[matched_name]
                     entry.parent_id = data['parent_id']
                     entry.mutations = data['mutations']
-                    entry.generation = data['generation']
+                    # Skip generation - to be filled by lineage_format
+                    # entry.generation = data['generation']
                     entry.aa_seq = data['aa_seq']
                     entry.dna_seq = data['dna_seq']
                     entry.confidence = data['confidence']
@@ -2571,7 +2572,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'enzyme_id': entry.enzyme_id,
             'parent_enzyme_id': entry.parent_id or '',
             'mutations': entry.mutations or '',
-            'generation': entry.generation if entry.generation is not None else '',
+            'generation': '',  # Empty generation - to be filled by lineage_format
             'campaign_id': entry.campaign_id or '',
             'protein_sequence': entry.aa_seq or '',
             'nucleotide_sequence': entry.dna_seq or '',

{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.4
+Version: 0.4.5
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.4.5.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
+debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
+debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
+debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
+debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
+debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.4.5.dist-info/RECORD,,

debase-0.4.4.dist-info/RECORD DELETED Viewed

@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=Vtl1u7rFItRnkcTvBiUypIltuuzta9Uy3PxMO2NgNgc,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
-debase/enzyme_lineage_extractor.py,sha256=jWyDRfOY792zjY5SZCvhNfQxVcEOC1JjTGb9Wo2qZ4I,170543
-debase/lineage_format.py,sha256=ch5kyoUqD_4Hj7K0hJrRbKrN_FysqFrFXgbyDIgp2oA,57515
-debase/reaction_info_extractor.py,sha256=Gv1qgzInNWxdaEJdsWGlgyy5syL2qClVoKHFQpR_6q0,158498
-debase/substrate_scope_extractor.py,sha256=7JyTE3CiIQVDDetwfENCoiq5bLnHElsY3Db1ThVLEBE,115884
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.4.4.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.4.4.dist-info/METADATA,sha256=Gwx754a5Zr_0yp-HXQuRRLylgEp0hD15MhhMjSOVMHo,4047
-debase-0.4.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.4.4.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.4.4.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.4.4.dist-info/RECORD,,

{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.4.4.dist-info → debase-0.4.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

debase 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl