PyPI - debase - Versions diffs - 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

debase 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

debase/_version.py +1 -1
debase/cleanup_sequence.py +40 -8
debase/enzyme_lineage_extractor.py +153 -9
debase/reaction_info_extractor.py +1181 -493
debase/substrate_scope_extractor.py +83 -34
debase/wrapper.py +75 -0
{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/METADATA +1 -1
debase-0.4.0.dist-info/RECORD +16 -0
debase/PIPELINE_FLOW.md +0 -100
debase-0.1.18.dist-info/RECORD +0 -17
{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/WHEEL +0 -0
{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/entry_points.txt +0 -0
{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/licenses/LICENSE +0 -0
{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/top_level.txt +0 -0

debase/substrate_scope_extractor.py CHANGED Viewed

@@ -278,30 +278,30 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
                 # Check if this page contains the figure caption
                 found = False
-                caption_instances = None
+                caption_rect = None
-                # Look for figure caption
-                variations = [
-                    f"{base_figure_ref}.",  # "Figure 1." - most reliable
-                    f"{base_figure_ref} ",  # "Figure 1 "
-                    base_figure_ref,
+                # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
+                figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
+                # Look for actual figure captions using regex patterns
+                caption_patterns = [
+                    rf"^Figure\s+{re.escape(figure_num)}\.",  # "Figure 3." at start of line
+                    rf"^Figure\s+{re.escape(figure_num)}:",   # "Figure 3:" at start of line
+                    rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]",  # "Figure 3 Substrate scope"
+                    rf"Figure\s+{re.escape(figure_num)}\s*\.",  # "Figure 3." anywhere
+                    rf"Figure\s+{re.escape(figure_num)}\s*:",  # "Figure 3:" anywhere
                 ]
-                for variation in variations:
-                    caption_instances = page.search_for(variation, quads=False)
-                    if caption_instances:
-                        # Check if this is likely a caption (not a reference in text)
-                        for rect in caption_instances:
-                            # Get text around this location
-                            x0, y0, x1, y1 = rect
-                            text_around = page.get_textbox(fitz.Rect(x0-50, y0-5, x1+300, y1+20))
-                            # Check if it looks like a figure caption
-                            if any(keyword in text_around.lower() for keyword in
-                                   ['directed evolution', 'substrate scope', '(a)', '(b)', '(c)']):
-                                found = True
-                                caption_rect = rect
-                                break
-                        if found:
+                for pattern in caption_patterns:
+                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                    if matches:
+                        # Found actual figure caption, get its position
+                        caption_text = matches.group(0)
+                        caption_instances = page.search_for(caption_text, quads=False)
+                        if caption_instances:
+                            caption_rect = caption_instances[0]
+                            found = True
+                            log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
                             break
                 if not found:
@@ -309,34 +309,68 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
                 log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
-                # Extract a region of the page above the caption
-                # The figure should be between the top of the viewable area and the caption
+                # Extract multi-page region including the figure and content below
+                # The figure should be between the top of the viewable area and extend to subsequent pages
                 page_rect = page.rect
                 # Define the region to extract
-                # Extract everything above the caption
+                # Extract everything above the caption plus additional content from subsequent pages
                 top_margin = 0  # Start from the very top of the page
-                bottom_margin = 5  # Small margin above caption
+                additional_pages = 2  # Number of additional pages to include
                 left_margin = 0  # Use full page width
                 right_margin = 0
-                # Calculate the figure region - everything from top to caption
+                # Calculate the figure region for the first page
                 fig_top = top_margin
-                fig_bottom = caption_rect.y0 - bottom_margin
+                fig_bottom = max(caption_rect.y0 + 150, page_rect.height)  # At least 150px below caption or full page
                 fig_left = left_margin
                 fig_right = page_rect.width - right_margin
-                # Create the clip rectangle
-                clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
+                # Create list to store all page images
+                page_images = []
-                # Extract the region as an image
+                # Extract first page (from top to bottom)
+                clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
                 mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
                 pix = page.get_pixmap(clip=clip_rect, matrix=mat)
+                page_images.append(pix)
+                # Extract additional pages if they exist
+                for additional_page_offset in range(1, additional_pages + 1):
+                    next_page_num = page_num + additional_page_offset
+                    if next_page_num < doc.page_count:
+                        next_page = doc.load_page(next_page_num)
+                        next_page_rect = next_page.rect
+                        # Extract full page for additional pages
+                        next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
+                        next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
+                        page_images.append(next_pix)
+                        log.info("Added page %d to multi-page extraction", next_page_num + 1)
+                # Combine all page images vertically
+                if len(page_images) == 1:
+                    # Single page extraction
+                    combined_pix = page_images[0]
+                else:
+                    # Multi-page extraction - combine vertically
+                    total_width = max(pix.width for pix in page_images)
+                    total_height = sum(pix.height for pix in page_images)
+                    # Create a new pixmap to hold the combined image
+                    combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
+                    combined_pix.clear_with(255)  # White background
+                    current_y = 0
+                    for pix in page_images:
+                        # Copy each page image to the combined image
+                        combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
+                        current_y += pix.height
                 # Convert to PNG
-                img_bytes = pix.tobytes("png")
-                log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
-                         clip_rect.width * 2, clip_rect.height * 2, page_num + 1)
+                img_bytes = combined_pix.tobytes("png")
+                log.info("Extracted multi-page figure region: %dx%d pixels from %d pages starting at page %d",
+                         combined_pix.width, combined_pix.height, len(page_images), page_num + 1)
                 return b64encode(img_bytes).decode()
@@ -653,10 +687,25 @@ def get_model():
 def _extract_text(resp) -> str:
     """
     Pull the *first* textual part out of a GenerativeAI response, handling both
-    the old prerelease SDK and the >=1.0 SDK.
+    the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.
     Returns an empty string if no textual content is found.
     """
+    # Track token usage if available
+    try:
+        if hasattr(resp, 'usage_metadata'):
+            input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+            output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+            if input_tokens or output_tokens:
+                # Import wrapper token tracking
+                try:
+                    from .wrapper import add_token_usage
+                    add_token_usage('substrate_scope_extractor', input_tokens, output_tokens)
+                except ImportError:
+                    pass  # wrapper not available
+    except Exception:
+        pass  # token tracking is best-effort
     # 1) Legacy SDK (<= 0.4) - still has nice `.text`
     if getattr(resp, "text", None):
         return resp.text

debase/wrapper.py CHANGED Viewed

@@ -19,6 +19,7 @@ import logging
 import time
 from datetime import datetime
 from pathlib import Path
+import threading
 # Setup logging
 logging.basicConfig(
@@ -27,6 +28,53 @@ logging.basicConfig(
 )
 logger = logging.getLogger("EnzymePipeline")
+# Global token tracking
+_token_lock = threading.Lock()
+_token_usage = {
+    'total_input_tokens': 0,
+    'total_output_tokens': 0,
+    'calls_by_module': {
+        'enzyme_lineage_extractor': {'input': 0, 'output': 0, 'calls': 0},
+        'reaction_info_extractor': {'input': 0, 'output': 0, 'calls': 0},
+        'substrate_scope_extractor': {'input': 0, 'output': 0, 'calls': 0}
+    }
+}
+def add_token_usage(module_name: str, input_tokens: int, output_tokens: int):
+    """Add token usage from a module to the global tracking."""
+    with _token_lock:
+        _token_usage['total_input_tokens'] += input_tokens
+        _token_usage['total_output_tokens'] += output_tokens
+        if module_name in _token_usage['calls_by_module']:
+            _token_usage['calls_by_module'][module_name]['input'] += input_tokens
+            _token_usage['calls_by_module'][module_name]['output'] += output_tokens
+            _token_usage['calls_by_module'][module_name]['calls'] += 1
+def calculate_token_usage_and_cost():
+    """Calculate total token usage and estimated cost for Gemini 2.5 Flash."""
+    with _token_lock:
+        total_input = _token_usage['total_input_tokens']
+        total_output = _token_usage['total_output_tokens']
+        # Gemini 2.5 Flash pricing (as of 2025)
+        # Input: $0.30 per 1M tokens
+        # Output: $2.50 per 1M tokens
+        input_cost = (total_input / 1_000_000) * 0.30
+        output_cost = (total_output / 1_000_000) * 2.50
+        total_cost = input_cost + output_cost
+        return total_input, total_output, total_cost
+def reset_token_usage():
+    """Reset token usage counters."""
+    with _token_lock:
+        _token_usage['total_input_tokens'] = 0
+        _token_usage['total_output_tokens'] = 0
+        for module_data in _token_usage['calls_by_module'].values():
+            module_data['input'] = 0
+            module_data['output'] = 0
+            module_data['calls'] = 0
 def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
     """
@@ -405,6 +453,9 @@ def run_pipeline(
     substrate_csv = output_dir / "3b_substrate_scope.csv"
     try:
+        # Reset token usage tracking for this pipeline run
+        reset_token_usage()
         logger.info("="*60)
         logger.info("Starting DEBase Enzyme Analysis Pipeline")
         logger.info(f"Manuscript: {manuscript_path}")
@@ -449,12 +500,36 @@ def run_pipeline(
         else:
             logger.info("Note: Use --keep-intermediates to save intermediate files")
+        # Calculate token usage and estimated costs
+        total_input_tokens, total_output_tokens, estimated_cost = calculate_token_usage_and_cost()
         logger.info("\n" + "="*60)
         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
         logger.info(f"Comprehensive output: {output_path}")
         if final_output != output_path:
             logger.info(f"Plate-based output: {final_output}")
         logger.info(f"Runtime: {elapsed:.1f} seconds")
+        logger.info("")
+        logger.info("TOKEN USAGE & COST ESTIMATE:")
+        logger.info(f"  Input tokens:  {total_input_tokens:,}")
+        logger.info(f"  Output tokens: {total_output_tokens:,}")
+        logger.info(f"  Total tokens:  {total_input_tokens + total_output_tokens:,}")
+        logger.info(f"  Estimated cost: ${estimated_cost:.4f} USD")
+        logger.info("  (Based on Gemini 2.5 Flash pricing: $0.30/1M input, $2.50/1M output)")
+        logger.info("")
+        # Show breakdown by module
+        with _token_lock:
+            logger.info("BREAKDOWN BY MODULE:")
+            for module_name, usage in _token_usage['calls_by_module'].items():
+                if usage['calls'] > 0:
+                    logger.info(f"  {module_name}:")
+                    logger.info(f"    API calls: {usage['calls']}")
+                    logger.info(f"    Input tokens: {usage['input']:,}")
+                    logger.info(f"    Output tokens: {usage['output']:,}")
+                    module_cost = (usage['input'] / 1_000_000) * 0.30 + (usage['output'] / 1_000_000) * 2.50
+                    logger.info(f"    Module cost: ${module_cost:.4f} USD")
         logger.info("="*60)
         return final_output

{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.1.18
+Version: 0.4.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team

debase-0.4.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=X9pfcQjm1Y8ILtLtdscGnfFKSp5XWTeamXgSHPOw2K0,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=ngxb_tPekjCWvampAjyuFqK4wLk_meFSj_TwfvOxheQ,33978
+debase/enzyme_lineage_extractor.py,sha256=laIw9A5AuJ_kJe9h6Fp_WzMh_ctCN31bo2b2-RKrFd4,124019
+debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
+debase/reaction_info_extractor.py,sha256=xRyYoQKqSzer-k8FZwg55nDd0D-6QBc0F-HAyfvisG0,150368
+debase/substrate_scope_extractor.py,sha256=ny4n_J4SDFQnxhCHHHan1xouqM8FkueJm_z-hm6gr-o,103761
+debase/wrapper.py,sha256=TGU5eq0qWTrkRR35ztsp8WMb1E9Nt64BdbHuYHROmYA,24279
+debase-0.4.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.4.0.dist-info/METADATA,sha256=Qpvyi4nbq_wmhbl_089pRIlGAubVxjwVfX1eUSK3lLY,10789
+debase-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.4.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.4.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.4.0.dist-info/RECORD,,

debase/PIPELINE_FLOW.md DELETED Viewed

@@ -1,100 +0,0 @@
-# DEBase Pipeline Flow
-## Overview
-The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
-## Pipeline Architecture
-```
-┌─────────────────────┐     ┌─────────────────────┐
-│   Manuscript PDF    │     │       SI PDF        │
-└──────────┬──────────┘     └──────────┬──────────┘
-           │                           │
-           └───────────┬───────────────┘
-                       │
-                       ▼
-         ┌─────────────────────────────┐
-         │ 1. enzyme_lineage_extractor │
-         │   - Extract enzyme variants │
-         │   - Parse mutations         │
-         │   - Get basic metadata      │
-         └─────────────┬───────────────┘
-                       │
-                       ▼
-         ┌─────────────────────────────┐
-         │    2. cleanup_sequence      │
-         │   - Validate sequences      │
-         │   - Fix formatting issues   │
-         │   - Generate full sequences │
-         └─────────────┬───────────────┘
-                       │
-           ┌───────────┴───────────────┐
-           │                           │
-           ▼                           ▼
-┌─────────────────────────┐ ┌─────────────────────────┐
-│ 3a. reaction_info       │ │ 3b. substrate_scope     │
-│     _extractor          │ │     _extractor          │
-│ - Performance metrics   │ │ - Substrate variations  │
-│ - Model reaction        │ │ - Additional variants   │
-│ - Conditions            │ │ - Scope data            │
-└───────────┬─────────────┘ └───────────┬─────────────┘
-            │                           │
-            └───────────┬───────────────┘
-                        │
-                        ▼
-          ┌─────────────────────────────┐
-          │    4. lineage_format_o3     │
-          │   - Merge all data          │
-          │   - Fill missing sequences  │
-          │   - Format final output     │
-          └─────────────┬───────────────┘
-                        │
-                        ▼
-                ┌─────────────┐
-                │ Final CSV   │
-                └─────────────┘
-```
-## Module Details
-### 1. enzyme_lineage_extractor.py
-- **Input**: Manuscript PDF, SI PDF
-- **Output**: CSV with enzyme variants and mutations
-- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
-### 2. cleanup_sequence.py
-- **Input**: Enzyme lineage CSV
-- **Output**: CSV with validated sequences
-- **Function**: Validates protein sequences, generates full sequences from mutations
-### 3a. reaction_info_extractor.py
-- **Input**: PDFs + cleaned enzyme CSV
-- **Output**: CSV with reaction performance data
-- **Function**: Extracts yield, TTN, selectivity, reaction conditions
-### 3b. substrate_scope_extractor.py
-- **Input**: PDFs + cleaned enzyme CSV
-- **Output**: CSV with substrate scope entries
-- **Function**: Extracts substrate variations tested with different enzymes
-### 4. lineage_format_o3.py
-- **Input**: Reaction CSV + Substrate scope CSV
-- **Output**: Final formatted CSV
-- **Function**: Merges data, fills missing sequences, applies consistent formatting
-## Key Features
-1. **Modular Design**: Each step can be run independently
-2. **Parallel Extraction**: Steps 3a and 3b run independently
-3. **Error Recovery**: Pipeline can resume from any step
-4. **Clean Interfaces**: Each module has well-defined inputs/outputs
-## Usage
-```bash
-# Full pipeline
-python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
-# With intermediate files kept for debugging
-python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
-```

debase-0.1.18.dist-info/RECORD DELETED Viewed

@@ -1,17 +0,0 @@
-debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=Qd1kKsssesKE5FvJnDdAuZsx_BrxTSJJyt68SK99D54,50
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
-debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
-debase/reaction_info_extractor.py,sha256=W9CS0puFTdhJ_T2Fpy931EgnjOCsHHjbtU6RdnzDlhw,113140
-debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
-debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
-debase-0.1.18.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.18.dist-info/METADATA,sha256=XvSrveJ0Y40c53JYUfiveaQNJ3qoEkxaQ61n3_--1cQ,10790
-debase-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.18.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.18.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.18.dist-info/RECORD,,

{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{debase-0.1.18.dist-info → debase-0.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

debase 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl

debase 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl