PyPI - debase - Versions diffs - 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl - Mend

debase 0.5.1-py3-none-any.whl → 0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

debase/_version.py +1 -1
debase/campaign_utils.py +146 -0
debase/caption_pattern.py +39 -0
debase/enzyme_lineage_extractor.py +58 -20
debase/lineage_format.py +9 -46
debase/reaction_info_extractor.py +407 -75
debase/substrate_scope_extractor.py +124 -49
debase/wrapper.py +3 -3
{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/METADATA +1 -1
debase-0.6.1.dist-info/RECORD +18 -0
debase-0.5.1.dist-info/RECORD +0 -16
{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/WHEEL +0 -0
{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/entry_points.txt +0 -0
{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/licenses/LICENSE +0 -0
{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/top_level.txt +0 -0

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -28,6 +28,13 @@ import re
 import json
 import time
 import logging
+# Import universal caption pattern
+try:
+    from .caption_pattern import get_universal_caption_pattern
+except ImportError:
+    # Fallback if running as standalone script
+    from caption_pattern import get_universal_caption_pattern
 import subprocess
 from pathlib import Path
 from dataclasses import dataclass, field
@@ -183,13 +190,8 @@ except ImportError as exc:  # pragma: no cover
 from base64 import b64encode
-# Improved caption prefix regex - captures most journal variants
-# Simplified pattern: match any line starting with Table, Figure, Scheme, Chart, etc.
-# This catches all variations including "Table S 2", "Figure.", etc.
-_CAPTION_PREFIX_RE = re.compile(
-    r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|Supplementary\s+(?:Table|Figure)).*",
-    re.I | re.M
-)
+# Use universal caption pattern
+_CAPTION_PREFIX_RE = get_universal_caption_pattern()
 def _open_doc(pdf_path: str | Path | bytes):
     if isinstance(pdf_path, (str, Path)):
@@ -485,61 +487,134 @@ def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str
     return None
-def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
-    """Extract text around a specific reference (e.g., 'Figure 3')."""
-    import re
-    extracted_sections = []
-    # Try to extract base figure/table reference
-    base_ref_match = re.match(r'((?:Figure|Fig|Table)\s*\d+)', ref, re.IGNORECASE)
-    base_ref = base_ref_match.group(1) if base_ref_match else ref
+def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
+    """Build an index of all captions for quick lookup."""
+    cap_pattern = get_universal_caption_pattern()
+    caption_index = {}
-    for pdf_path in pdf_paths:
+    for pdf_idx, pdf_path in enumerate(pdf_paths):
         doc = _open_doc(pdf_path)
+        source = "manuscript" if pdf_idx == 0 else "supplementary"
         try:
             for page_num in range(doc.page_count):
                 page = doc.load_page(page_num)
                 page_text = page.get_text()
-                # Try different variations of the reference
-                ref_variations = [
-                    ref,  # Original (e.g., "Figure 3(a)")
-                    base_ref,  # Base reference (e.g., "Figure 3")
-                    ref.replace("(", " ").replace(")", ""),  # "Figure 3 a"
-                    ref.replace("(", "").replace(")", ""),    # "Figure 3a"
-                    ref.replace("Figure", "Fig"),  # "Fig 3(a)"
-                    base_ref.replace("Figure", "Fig"),  # "Fig 3"
-                ]
-                # Find the reference in the page
-                found = False
-                pos = -1
-                used_ref = ref
-                for ref_var in ref_variations:
-                    if ref_var.lower() in page_text.lower():
-                        pos = page_text.lower().find(ref_var.lower())
-                        used_ref = ref_var
-                        found = True
-                        break
-                if found and pos >= 0:
-                    # Extract context around it
-                    start = max(0, pos - context_chars)
-                    end = min(len(page_text), pos + len(used_ref) + context_chars)
+                for match in cap_pattern.finditer(page_text):
+                    caption_text = match.group(0).strip()
+                    caption_lower = caption_text.lower()
-                    section = page_text[start:end]
-                    extracted_sections.append(
-                        f"\n=== Context around '{ref}' (found as '{used_ref}') in {pdf_path.name}, page {page_num + 1} ===\n{section}"
-                    )
-                    log.debug("Found '%s' as '%s' on page %d of %s", ref, used_ref, page_num + 1, pdf_path.name)
+                    # Store caption info
+                    caption_info = {
+                        'full_caption': caption_text,
+                        'page_text': page_text,
+                        'page_num': page_num + 1,
+                        'pdf_path': pdf_path,
+                        'source': source,
+                        'match_start': match.start(),
+                        'doc': doc  # Keep doc reference for page extraction
+                    }
+                    # Create multiple keys for flexible matching
+                    # Key 1: Full caption text (first 100 chars)
+                    key1 = caption_text[:100].lower().strip()
+                    caption_index[key1] = caption_info
+                    # Key 2: Simplified reference (e.g., "table 5", "figure s3")
+                    ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
+                    if ref_match:
+                        key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
+                        caption_index[key2] = caption_info
+                        # Also store with 's' prefix if in SI
+                        if source == "supplementary" and 's' not in key2:
+                            key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
+                            caption_index[key3] = caption_info
         finally:
             doc.close()
+    return caption_index
+def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
+    """Extract text around a specific reference using caption index."""
+    import re
+    # Build caption index if not already built
+    if not hasattr(_extract_text_around_reference, '_caption_index'):
+        _extract_text_around_reference._caption_index = _build_caption_index(pdf_paths)
+    caption_index = _extract_text_around_reference._caption_index
+    ref_lower = ref.lower().strip()
+    # Try multiple matching strategies
+    matches = []
+    # Strategy 1: Direct key lookup
+    if ref_lower in caption_index:
+        matches.append(caption_index[ref_lower])
+    # Strategy 2: Normalized reference lookup
+    ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
+    if ref_match:
+        ref_type, ref_num = ref_match.groups()
+        if ref_type == 'fig':
+            ref_type = 'figure'
+        # Try different key formats
+        keys_to_try = [
+            f"{ref_type} {ref_num}",
+            f"{ref_type} s{ref_num}",
+            f"table {ref_num}",
+            f"fig {ref_num}",
+            f"figure {ref_num}"
+        ]
+        for key in keys_to_try:
+            if key in caption_index and caption_index[key] not in matches:
+                matches.append(caption_index[key])
+    # Strategy 3: Fuzzy matching
+    if not matches and ref_match:
+        for key, info in caption_index.items():
+            if ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
+                if info not in matches:
+                    matches.append(info)
+    # Extract text from matches
+    extracted_sections = []
+    for match in matches:
+        page_text = match['page_text']
+        caption_start = match['match_start']
+        # Extract context around the caption
+        start = max(0, caption_start - context_chars // 2)
+        end = min(len(page_text), caption_start + context_chars)
+        section = page_text[start:end]
+        source_label = f"{match['source'].upper()} page {match['page_num']}"
+        extracted_sections.append(f"\n[From {source_label}]\n{section}")
     if not extracted_sections:
-        log.warning("Could not find reference '%s' or base reference '%s' in any PDF", ref, base_ref)
+        log.warning(f"No matches found for reference '{ref}'")
+        # Fallback to old approach
+        for pdf_path in pdf_paths:
+            doc = _open_doc(pdf_path)
+            try:
+                for page_num in range(doc.page_count):
+                    page = doc.load_page(page_num)
+                    page_text = page.get_text()
+                    if ref_lower in page_text.lower():
+                        pos = page_text.lower().find(ref_lower)
+                        start = max(0, pos - context_chars // 2)
+                        end = min(len(page_text), pos + context_chars)
+                        extracted_sections.append(page_text[start:end])
+                        break
+            finally:
+                doc.close()
-    return "\n".join(extracted_sections)
+    return "\n\n".join(extracted_sections)
 def _extract_sections_by_title(pdf_paths: List[Path], section_titles: List[str], max_chars_per_section: int = 10000) -> str:
     """Extract sections by their titles from PDFs."""

debase/wrapper.py CHANGED Viewed

@@ -7,10 +7,10 @@ Pipeline flow:
 2. cleanup_sequence.py - Clean and validate protein sequences
 3. reaction_info_extractor.py - Extract reaction performance metrics
 4. substrate_scope_extractor.py - Extract substrate scope data (runs independently)
-5. lineage_format_o3.py - Format and merge all data into final CSV
+5. lineage_format.py - Format and merge all data into final CSV
 The reaction_info and substrate_scope extractors run in parallel,
-then their outputs are combined in lineage_format_o3.
+then their outputs are combined in lineage_format.
 """
 import os
 import sys
@@ -579,7 +579,7 @@ Pipeline steps:
   2. cleanup_sequence - Validate and clean protein sequences
   3. reaction_info_extractor - Extract reaction performance metrics
   4. substrate_scope_extractor - Extract substrate scope data
-  5. lineage_format_o3 - Format and merge into final CSV
+  5. lineage_format - Format and merge into final CSV
 The pipeline automatically handles all steps sequentially.
         """

{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.5.1
+Version: 0.6.1
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.6.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,18 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
+debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
+debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
+debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
+debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
+debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
+debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
+debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
+debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
+debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.6.1.dist-info/RECORD,,

debase-0.5.1.dist-info/RECORD DELETED Viewed

@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=iDuv12GVbaAFXFufv3yqwn-3Hwv9Kua4nJZQ-gUNJXw,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
-debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
-debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
-debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
-debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.5.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.5.1.dist-info/METADATA,sha256=GoaBFl0kdh8dtrApBTMoLWH6fe5GYLiSYC5JrohbPcI,4047
-debase-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.5.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.5.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.5.1.dist-info/RECORD,,

{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.5.1.dist-info → debase-0.6.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

debase 0.5.1-py3-none-any.whl → 0.6.1-py3-none-any.whl