debase 0.1.19__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +40 -8
- debase/enzyme_lineage_extractor.py +153 -9
- debase/reaction_info_extractor.py +1119 -504
- debase/substrate_scope_extractor.py +50 -41
- debase/wrapper.py +75 -0
- {debase-0.1.19.dist-info → debase-0.4.1.dist-info}/METADATA +1 -1
- debase-0.4.1.dist-info/RECORD +16 -0
- debase/PIPELINE_FLOW.md +0 -100
- debase-0.1.19.dist-info/RECORD +0 -17
- {debase-0.1.19.dist-info → debase-0.4.1.dist-info}/WHEEL +0 -0
- {debase-0.1.19.dist-info → debase-0.4.1.dist-info}/entry_points.txt +0 -0
- {debase-0.1.19.dist-info → debase-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.19.dist-info → debase-0.4.1.dist-info}/top_level.txt +0 -0
@@ -278,30 +278,30 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:

             # Check if this page contains the figure caption
             found = False
-
+            caption_rect = None

-            #
-
-
-
-
+            # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
+            figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
+
+            # Look for actual figure captions using regex patterns
+            caption_patterns = [
+                rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
+                rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
+                rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
+                rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
+                rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
             ]

-            for
-
-            if
-            #
-
-
-
-
-
-
-                    ['directed evolution', 'substrate scope', '(a)', '(b)', '(c)']):
-                found = True
-                caption_rect = rect
-                break
-            if found:
+            for pattern in caption_patterns:
+                matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                if matches:
+                    # Found actual figure caption, get its position
+                    caption_text = matches.group(0)
+                    caption_instances = page.search_for(caption_text, quads=False)
+                    if caption_instances:
+                        caption_rect = caption_instances[0]
+                        found = True
+                        log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
                    break

             if not found:
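For context, a minimal standalone sketch of how the new caption patterns behave; the `figure_num` and `page_text` values below are made-up examples rather than anything taken from the package:

```python
import re

figure_num = "3"  # hypothetical figure reference
page_text = "Results are discussed below.\nFigure 3. Substrate scope of the evolved enzyme."

caption_patterns = [
    rf"^Figure\s+{re.escape(figure_num)}\.",        # "Figure 3." at start of line
    rf"^Figure\s+{re.escape(figure_num)}:",         # "Figure 3:" at start of line
    rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]",  # "Figure 3 Substrate scope"
    rf"Figure\s+{re.escape(figure_num)}\s*\.",      # "Figure 3." anywhere
    rf"Figure\s+{re.escape(figure_num)}\s*:",       # "Figure 3:" anywhere
]

for pattern in caption_patterns:
    match = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
    if match:
        # The anchored pattern wins here and matches "Figure 3."
        print(f"{pattern!r} matched {match.group(0)!r}")
        break
```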
@@ -309,34 +309,28 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:

             log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)

-            # Extract
-            # The figure should be between the top of the viewable area and the caption
+            # Extract just the figure with its caption, avoiding excessive white space
             page_rect = page.rect

-            #
-            # Extract
-
-
-
-
-
-            # Calculate the figure region - everything from top to caption
-            fig_top = top_margin
-            fig_bottom = caption_rect.y0 - bottom_margin
-            fig_left = left_margin
-            fig_right = page_rect.width - right_margin
+            # Calculate the figure region on current page only
+            # Extract from top of page to just below the caption
+            fig_top = 0 # Start from top of page
+            fig_bottom = min(caption_rect.y0 + 200, page_rect.height) # 200px below caption, but not more than page height
+            fig_left = 0 # Full width
+            fig_right = page_rect.width

-            #
+            # Extract only the figure region (no additional pages to avoid white space)
             clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
-
-            # Extract the region as an image
             mat = fitz.Matrix(2, 2) # 2x zoom for better quality
             pix = page.get_pixmap(clip=clip_rect, matrix=mat)

+            log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
+                     pix.width, pix.height, page_num + 1)
+
             # Convert to PNG
             img_bytes = pix.tobytes("png")
-            log.info("Extracted figure region:
-
+            log.info("Extracted figure region: %dx%d pixels from page %d",
+                     pix.width, pix.height, page_num + 1)

             return b64encode(img_bytes).decode()

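The cropping itself is standard PyMuPDF; a minimal sketch of the same clipping approach, assuming a hypothetical `sample.pdf` and an invented caption position:

```python
from base64 import b64encode

import fitz  # PyMuPDF

doc = fitz.open("sample.pdf")   # hypothetical input file
page = doc[0]
caption_y = 500.0               # stand-in for the caption_rect.y0 found on the page

# Clip from the top of the page to 200 px below the caption, full width.
clip_rect = fitz.Rect(0, 0, page.rect.width, min(caption_y + 200, page.rect.height))
mat = fitz.Matrix(2, 2)         # 2x zoom for better quality
pix = page.get_pixmap(clip=clip_rect, matrix=mat)

img_b64 = b64encode(pix.tobytes("png")).decode()
print(f"{pix.width}x{pix.height} px -> {len(img_b64)} base64 characters")
```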
@@ -653,10 +647,25 @@ def get_model():
 def _extract_text(resp) -> str:
     """
     Pull the *first* textual part out of a GenerativeAI response, handling both
-    the old prerelease SDK and the >=1.0 SDK.
+    the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.

     Returns an empty string if no textual content is found.
     """
+    # Track token usage if available
+    try:
+        if hasattr(resp, 'usage_metadata'):
+            input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+            output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+            if input_tokens or output_tokens:
+                # Import wrapper token tracking
+                try:
+                    from .wrapper import add_token_usage
+                    add_token_usage('substrate_scope_extractor', input_tokens, output_tokens)
+                except ImportError:
+                    pass # wrapper not available
+    except Exception:
+        pass # token tracking is best-effort
+
     # 1) Legacy SDK (<= 0.4) - still has nice `.text`
     if getattr(resp, "text", None):
         return resp.text
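The hook above pulls token counts out of the response's `usage_metadata` with `getattr` fallbacks, so it degrades to zeros when the SDK does not report usage. A small sketch with a mocked response object; the attribute names mirror the ones the diff reads, and the numbers are invented:

```python
from types import SimpleNamespace

# Mocked stand-in for a Gemini response carrying usage metadata.
resp = SimpleNamespace(
    text="OK",
    usage_metadata=SimpleNamespace(prompt_token_count=1200, candidates_token_count=350),
)

usage = getattr(resp, "usage_metadata", None)
input_tokens = getattr(usage, "prompt_token_count", 0)
output_tokens = getattr(usage, "candidates_token_count", 0)
print(input_tokens, output_tokens)  # 1200 350
```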
debase/wrapper.py
CHANGED
@@ -19,6 +19,7 @@ import logging
 import time
 from datetime import datetime
 from pathlib import Path
+import threading

 # Setup logging
 logging.basicConfig(
@@ -27,6 +28,53 @@ logging.basicConfig(
 )
 logger = logging.getLogger("EnzymePipeline")

+# Global token tracking
+_token_lock = threading.Lock()
+_token_usage = {
+    'total_input_tokens': 0,
+    'total_output_tokens': 0,
+    'calls_by_module': {
+        'enzyme_lineage_extractor': {'input': 0, 'output': 0, 'calls': 0},
+        'reaction_info_extractor': {'input': 0, 'output': 0, 'calls': 0},
+        'substrate_scope_extractor': {'input': 0, 'output': 0, 'calls': 0}
+    }
+}
+
+def add_token_usage(module_name: str, input_tokens: int, output_tokens: int):
+    """Add token usage from a module to the global tracking."""
+    with _token_lock:
+        _token_usage['total_input_tokens'] += input_tokens
+        _token_usage['total_output_tokens'] += output_tokens
+        if module_name in _token_usage['calls_by_module']:
+            _token_usage['calls_by_module'][module_name]['input'] += input_tokens
+            _token_usage['calls_by_module'][module_name]['output'] += output_tokens
+            _token_usage['calls_by_module'][module_name]['calls'] += 1
+
+def calculate_token_usage_and_cost():
+    """Calculate total token usage and estimated cost for Gemini 2.5 Flash."""
+    with _token_lock:
+        total_input = _token_usage['total_input_tokens']
+        total_output = _token_usage['total_output_tokens']
+
+    # Gemini 2.5 Flash pricing (as of 2025)
+    # Input: $0.30 per 1M tokens
+    # Output: $2.50 per 1M tokens
+    input_cost = (total_input / 1_000_000) * 0.30
+    output_cost = (total_output / 1_000_000) * 2.50
+    total_cost = input_cost + output_cost
+
+    return total_input, total_output, total_cost
+
+def reset_token_usage():
+    """Reset token usage counters."""
+    with _token_lock:
+        _token_usage['total_input_tokens'] = 0
+        _token_usage['total_output_tokens'] = 0
+        for module_data in _token_usage['calls_by_module'].values():
+            module_data['input'] = 0
+            module_data['output'] = 0
+            module_data['calls'] = 0
+

 def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
     """
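A minimal usage sketch of the new helpers, assuming debase 0.4.1 is installed; the token counts below are invented for illustration:

```python
from debase.wrapper import add_token_usage, calculate_token_usage_and_cost, reset_token_usage

reset_token_usage()
# Pretend two modules reported their Gemini usage.
add_token_usage("enzyme_lineage_extractor", input_tokens=60_000, output_tokens=6_000)
add_token_usage("reaction_info_extractor", input_tokens=40_000, output_tokens=4_000)

total_in, total_out, cost = calculate_token_usage_and_cost()
print(f"{total_in:,} input / {total_out:,} output tokens -> ${cost:.4f} USD")
# 100,000 input / 10,000 output tokens -> $0.0550 USD
```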
@@ -405,6 +453,9 @@ def run_pipeline(
|
|
405
453
|
substrate_csv = output_dir / "3b_substrate_scope.csv"
|
406
454
|
|
407
455
|
try:
|
456
|
+
# Reset token usage tracking for this pipeline run
|
457
|
+
reset_token_usage()
|
458
|
+
|
408
459
|
logger.info("="*60)
|
409
460
|
logger.info("Starting DEBase Enzyme Analysis Pipeline")
|
410
461
|
logger.info(f"Manuscript: {manuscript_path}")
|
@@ -449,12 +500,36 @@
         else:
             logger.info("Note: Use --keep-intermediates to save intermediate files")

+        # Calculate token usage and estimated costs
+        total_input_tokens, total_output_tokens, estimated_cost = calculate_token_usage_and_cost()
+
         logger.info("\n" + "="*60)
         logger.info("PIPELINE COMPLETED SUCCESSFULLY")
         logger.info(f"Comprehensive output: {output_path}")
         if final_output != output_path:
             logger.info(f"Plate-based output: {final_output}")
         logger.info(f"Runtime: {elapsed:.1f} seconds")
+        logger.info("")
+        logger.info("TOKEN USAGE & COST ESTIMATE:")
+        logger.info(f" Input tokens: {total_input_tokens:,}")
+        logger.info(f" Output tokens: {total_output_tokens:,}")
+        logger.info(f" Total tokens: {total_input_tokens + total_output_tokens:,}")
+        logger.info(f" Estimated cost: ${estimated_cost:.4f} USD")
+        logger.info(" (Based on Gemini 2.5 Flash pricing: $0.30/1M input, $2.50/1M output)")
+        logger.info("")
+
+        # Show breakdown by module
+        with _token_lock:
+            logger.info("BREAKDOWN BY MODULE:")
+            for module_name, usage in _token_usage['calls_by_module'].items():
+                if usage['calls'] > 0:
+                    logger.info(f" {module_name}:")
+                    logger.info(f" API calls: {usage['calls']}")
+                    logger.info(f" Input tokens: {usage['input']:,}")
+                    logger.info(f" Output tokens: {usage['output']:,}")
+                    module_cost = (usage['input'] / 1_000_000) * 0.30 + (usage['output'] / 1_000_000) * 2.50
+                    logger.info(f" Module cost: ${module_cost:.4f} USD")
+
         logger.info("="*60)

         return final_output
debase-0.4.1.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=qJBDmAhFTv4pYwvc3Umy_Lwc_v8doVBVCiysT7Eoh3E,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=ngxb_tPekjCWvampAjyuFqK4wLk_meFSj_TwfvOxheQ,33978
+debase/enzyme_lineage_extractor.py,sha256=laIw9A5AuJ_kJe9h6Fp_WzMh_ctCN31bo2b2-RKrFd4,124019
+debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
+debase/reaction_info_extractor.py,sha256=xRyYoQKqSzer-k8FZwg55nDd0D-6QBc0F-HAyfvisG0,150368
+debase/substrate_scope_extractor.py,sha256=JLXHEEeMDFiFQRt8gVCnnhimrxDF23-z0jq3N4-3gn8,101469
+debase/wrapper.py,sha256=TGU5eq0qWTrkRR35ztsp8WMb1E9Nt64BdbHuYHROmYA,24279
+debase-0.4.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.4.1.dist-info/METADATA,sha256=kSH58QfBv6WGb8Ds3mcei-DUmWQormuSyPHNOmpbcQ8,10789
+debase-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.4.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.4.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.4.1.dist-info/RECORD,,
debase/PIPELINE_FLOW.md
DELETED
@@ -1,100 +0,0 @@
-# DEBase Pipeline Flow
-
-## Overview
-The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
-
-## Pipeline Architecture
-
-```
-┌─────────────────────┐     ┌─────────────────────┐
-│   Manuscript PDF    │     │       SI PDF        │
-└──────────┬──────────┘     └──────────┬──────────┘
-           │                           │
-           └───────────┬───────────────┘
-                       │
-                       ▼
-         ┌─────────────────────────────┐
-         │ 1. enzyme_lineage_extractor │
-         │  - Extract enzyme variants  │
-         │  - Parse mutations          │
-         │  - Get basic metadata       │
-         └─────────────┬───────────────┘
-                       │
-                       ▼
-         ┌─────────────────────────────┐
-         │ 2. cleanup_sequence         │
-         │  - Validate sequences       │
-         │  - Fix formatting issues    │
-         │  - Generate full sequences  │
-         └─────────────┬───────────────┘
-                       │
-           ┌───────────┴───────────────┐
-           │                           │
-           ▼                           ▼
-┌─────────────────────────┐ ┌─────────────────────────┐
-│ 3a. reaction_info       │ │ 3b. substrate_scope     │
-│     _extractor          │ │     _extractor          │
-│ - Performance metrics   │ │ - Substrate variations  │
-│ - Model reaction        │ │ - Additional variants   │
-│ - Conditions            │ │ - Scope data            │
-└───────────┬─────────────┘ └───────────┬─────────────┘
-            │                           │
-            └───────────┬───────────────┘
-                        │
-                        ▼
-          ┌─────────────────────────────┐
-          │ 4. lineage_format_o3        │
-          │  - Merge all data           │
-          │  - Fill missing sequences   │
-          │  - Format final output      │
-          └─────────────┬───────────────┘
-                        │
-                        ▼
-                 ┌─────────────┐
-                 │  Final CSV  │
-                 └─────────────┘
-```
-
-## Module Details
-
-### 1. enzyme_lineage_extractor.py
-- **Input**: Manuscript PDF, SI PDF
-- **Output**: CSV with enzyme variants and mutations
-- **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
-
-### 2. cleanup_sequence.py
-- **Input**: Enzyme lineage CSV
-- **Output**: CSV with validated sequences
-- **Function**: Validates protein sequences, generates full sequences from mutations
-
-### 3a. reaction_info_extractor.py
-- **Input**: PDFs + cleaned enzyme CSV
-- **Output**: CSV with reaction performance data
-- **Function**: Extracts yield, TTN, selectivity, reaction conditions
-
-### 3b. substrate_scope_extractor.py
-- **Input**: PDFs + cleaned enzyme CSV
-- **Output**: CSV with substrate scope entries
-- **Function**: Extracts substrate variations tested with different enzymes
-
-### 4. lineage_format_o3.py
-- **Input**: Reaction CSV + Substrate scope CSV
-- **Output**: Final formatted CSV
-- **Function**: Merges data, fills missing sequences, applies consistent formatting
-
-## Key Features
-
-1. **Modular Design**: Each step can be run independently
-2. **Parallel Extraction**: Steps 3a and 3b run independently
-3. **Error Recovery**: Pipeline can resume from any step
-4. **Clean Interfaces**: Each module has well-defined inputs/outputs
-
-## Usage
-
-```bash
-# Full pipeline
-python -m debase.wrapper_clean manuscript.pdf --si si.pdf --output results.csv
-
-# With intermediate files kept for debugging
-python -m debase.wrapper_clean manuscript.pdf --si si.pdf --keep-intermediates
-```
debase-0.1.19.dist-info/RECORD
DELETED
@@ -1,17 +0,0 @@
-debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=VbYiJzmzValsIDmCyQWPabFFsmy_TQ_Qp35j2mo-UKc,50
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
-debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
-debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
-debase/reaction_info_extractor.py,sha256=otj8D3MnrThhUR_xOCc3sSVIw8hrCKnB4OY6y6NnaWA,116674
-debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
-debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
-debase-0.1.19.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.1.19.dist-info/METADATA,sha256=i1dFEB8kPkfTt8q8hJpAAAkZA29T2kb1bzPFMjzPdJU,10790
-debase-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.1.19.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.1.19.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.1.19.dist-info/RECORD,,
{debase-0.1.19.dist-info → debase-0.4.1.dist-info}/WHEEL
File without changes
{debase-0.1.19.dist-info → debase-0.4.1.dist-info}/entry_points.txt
File without changes
{debase-0.1.19.dist-info → debase-0.4.1.dist-info}/licenses/LICENSE
File without changes
{debase-0.1.19.dist-info → debase-0.4.1.dist-info}/top_level.txt
File without changes