debase 0.1.19__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -278,30 +278,30 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
278
278
 
279
279
  # Check if this page contains the figure caption
280
280
  found = False
281
- caption_instances = None
281
+ caption_rect = None
282
282
 
283
- # Look for figure caption
284
- variations = [
285
- f"{base_figure_ref}.", # "Figure 1." - most reliable
286
- f"{base_figure_ref} ", # "Figure 1 "
287
- base_figure_ref,
283
+ # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
284
+ figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
285
+
286
+ # Look for actual figure captions using regex patterns
287
+ caption_patterns = [
288
+ rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
289
+ rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
290
+ rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
291
+ rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
292
+ rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
288
293
  ]
289
294
 
290
- for variation in variations:
291
- caption_instances = page.search_for(variation, quads=False)
292
- if caption_instances:
293
- # Check if this is likely a caption (not a reference in text)
294
- for rect in caption_instances:
295
- # Get text around this location
296
- x0, y0, x1, y1 = rect
297
- text_around = page.get_textbox(fitz.Rect(x0-50, y0-5, x1+300, y1+20))
298
- # Check if it looks like a figure caption
299
- if any(keyword in text_around.lower() for keyword in
300
- ['directed evolution', 'substrate scope', '(a)', '(b)', '(c)']):
301
- found = True
302
- caption_rect = rect
303
- break
304
- if found:
295
+ for pattern in caption_patterns:
296
+ matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
297
+ if matches:
298
+ # Found actual figure caption, get its position
299
+ caption_text = matches.group(0)
300
+ caption_instances = page.search_for(caption_text, quads=False)
301
+ if caption_instances:
302
+ caption_rect = caption_instances[0]
303
+ found = True
304
+ log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
305
305
  break
306
306
 
307
307
  if not found:
@@ -309,34 +309,28 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
309
309
 
310
310
  log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
311
311
 
312
- # Extract a region of the page above the caption
313
- # The figure should be between the top of the viewable area and the caption
312
+ # Extract just the figure with its caption, avoiding excessive white space
314
313
  page_rect = page.rect
315
314
 
316
- # Define the region to extract
317
- # Extract everything above the caption
318
- top_margin = 0 # Start from the very top of the page
319
- bottom_margin = 5 # Small margin above caption
320
- left_margin = 0 # Use full page width
321
- right_margin = 0
322
-
323
- # Calculate the figure region - everything from top to caption
324
- fig_top = top_margin
325
- fig_bottom = caption_rect.y0 - bottom_margin
326
- fig_left = left_margin
327
- fig_right = page_rect.width - right_margin
315
+ # Calculate the figure region on current page only
316
+ # Extract from top of page to just below the caption
317
+ fig_top = 0 # Start from top of page
318
+ fig_bottom = min(caption_rect.y0 + 200, page_rect.height) # 200px below caption, but not more than page height
319
+ fig_left = 0 # Full width
320
+ fig_right = page_rect.width
328
321
 
329
- # Create the clip rectangle
322
+ # Extract only the figure region (no additional pages to avoid white space)
330
323
  clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
331
-
332
- # Extract the region as an image
333
324
  mat = fitz.Matrix(2, 2) # 2x zoom for better quality
334
325
  pix = page.get_pixmap(clip=clip_rect, matrix=mat)
335
326
 
327
+ log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
328
+ pix.width, pix.height, page_num + 1)
329
+
336
330
  # Convert to PNG
337
331
  img_bytes = pix.tobytes("png")
338
- log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
339
- clip_rect.width * 2, clip_rect.height * 2, page_num + 1)
332
+ log.info("Extracted figure region: %dx%d pixels from page %d",
333
+ pix.width, pix.height, page_num + 1)
340
334
 
341
335
  return b64encode(img_bytes).decode()
342
336
 
@@ -653,10 +647,25 @@ def get_model():
653
647
  def _extract_text(resp) -> str:
654
648
  """
655
649
  Pull the *first* textual part out of a GenerativeAI response, handling both
656
- the old prerelease SDK and the >=1.0 SDK.
650
+ the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.
657
651
 
658
652
  Returns an empty string if no textual content is found.
659
653
  """
654
+ # Track token usage if available
655
+ try:
656
+ if hasattr(resp, 'usage_metadata'):
657
+ input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
658
+ output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
659
+ if input_tokens or output_tokens:
660
+ # Import wrapper token tracking
661
+ try:
662
+ from .wrapper import add_token_usage
663
+ add_token_usage('substrate_scope_extractor', input_tokens, output_tokens)
664
+ except ImportError:
665
+ pass # wrapper not available
666
+ except Exception:
667
+ pass # token tracking is best-effort
668
+
660
669
  # 1) Legacy SDK (<= 0.4) - still has nice `.text`
661
670
  if getattr(resp, "text", None):
662
671
  return resp.text
debase/wrapper.py CHANGED
@@ -19,6 +19,7 @@ import logging
19
19
  import time
20
20
  from datetime import datetime
21
21
  from pathlib import Path
22
+ import threading
22
23
 
23
24
  # Setup logging
24
25
  logging.basicConfig(
@@ -27,6 +28,53 @@ logging.basicConfig(
27
28
  )
28
29
  logger = logging.getLogger("EnzymePipeline")
29
30
 
31
# Global token tracking
# _token_lock guards every read and write of _token_usage.
_token_lock = threading.Lock()
_token_usage = {
    'total_input_tokens': 0,
    'total_output_tokens': 0,
    'calls_by_module': {
        'enzyme_lineage_extractor': {'input': 0, 'output': 0, 'calls': 0},
        'reaction_info_extractor': {'input': 0, 'output': 0, 'calls': 0},
        'substrate_scope_extractor': {'input': 0, 'output': 0, 'calls': 0}
    }
}

# Gemini 2.5 Flash pricing (as of 2025), in USD per 1M tokens.
_INPUT_COST_PER_MILLION_TOKENS = 0.30
_OUTPUT_COST_PER_MILLION_TOKENS = 2.50


def add_token_usage(module_name: str, input_tokens: int, output_tokens: int):
    """Add token usage from a module to the global tracking.

    Args:
        module_name: Name of the module that made the API call. A name that
            is not yet present in the per-module table gets its own entry, so
            the per-module breakdown always sums to the global totals.
        input_tokens: Prompt tokens consumed by the call. ``None`` counts as 0.
        output_tokens: Tokens produced by the call. ``None`` counts as 0.
    """
    # Normalise before touching shared state: a caller that reads the counts
    # with getattr(obj, name, 0) still passes None when the attribute exists
    # but is unset, and failing halfway through the updates below would leave
    # the input total incremented without the matching output/module counts.
    input_tokens = input_tokens or 0
    output_tokens = output_tokens or 0

    with _token_lock:
        _token_usage['total_input_tokens'] += input_tokens
        _token_usage['total_output_tokens'] += output_tokens
        module_usage = _token_usage['calls_by_module'].setdefault(
            module_name, {'input': 0, 'output': 0, 'calls': 0}
        )
        module_usage['input'] += input_tokens
        module_usage['output'] += output_tokens
        module_usage['calls'] += 1


def calculate_token_usage_and_cost():
    """Calculate total token usage and estimated cost for Gemini 2.5 Flash.

    Returns:
        A ``(total_input_tokens, total_output_tokens, total_cost)`` tuple,
        where ``total_cost`` is the estimated cost in USD.
    """
    # Only the snapshot of the counters needs the lock; the arithmetic does not.
    with _token_lock:
        total_input = _token_usage['total_input_tokens']
        total_output = _token_usage['total_output_tokens']

    input_cost = (total_input / 1_000_000) * _INPUT_COST_PER_MILLION_TOKENS
    output_cost = (total_output / 1_000_000) * _OUTPUT_COST_PER_MILLION_TOKENS
    total_cost = input_cost + output_cost

    return total_input, total_output, total_cost


def reset_token_usage():
    """Reset the global totals and every per-module counter to zero."""
    with _token_lock:
        _token_usage['total_input_tokens'] = 0
        _token_usage['total_output_tokens'] = 0
        for module_data in _token_usage['calls_by_module'].values():
            module_data['input'] = 0
            module_data['output'] = 0
            module_data['calls'] = 0
77
+
30
78
 
31
79
  def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
32
80
  """
@@ -405,6 +453,9 @@ def run_pipeline(
405
453
  substrate_csv = output_dir / "3b_substrate_scope.csv"
406
454
 
407
455
  try:
456
+ # Reset token usage tracking for this pipeline run
457
+ reset_token_usage()
458
+
408
459
  logger.info("="*60)
409
460
  logger.info("Starting DEBase Enzyme Analysis Pipeline")
410
461
  logger.info(f"Manuscript: {manuscript_path}")
@@ -449,12 +500,36 @@ def run_pipeline(
449
500
  else:
450
501
  logger.info("Note: Use --keep-intermediates to save intermediate files")
451
502
 
503
+ # Calculate token usage and estimated costs
504
+ total_input_tokens, total_output_tokens, estimated_cost = calculate_token_usage_and_cost()
505
+
452
506
  logger.info("\n" + "="*60)
453
507
  logger.info("PIPELINE COMPLETED SUCCESSFULLY")
454
508
  logger.info(f"Comprehensive output: {output_path}")
455
509
  if final_output != output_path:
456
510
  logger.info(f"Plate-based output: {final_output}")
457
511
  logger.info(f"Runtime: {elapsed:.1f} seconds")
512
+ logger.info("")
513
+ logger.info("TOKEN USAGE & COST ESTIMATE:")
514
+ logger.info(f" Input tokens: {total_input_tokens:,}")
515
+ logger.info(f" Output tokens: {total_output_tokens:,}")
516
+ logger.info(f" Total tokens: {total_input_tokens + total_output_tokens:,}")
517
+ logger.info(f" Estimated cost: ${estimated_cost:.4f} USD")
518
+ logger.info(" (Based on Gemini 2.5 Flash pricing: $0.30/1M input, $2.50/1M output)")
519
+ logger.info("")
520
+
521
+ # Show breakdown by module
522
+ with _token_lock:
523
+ logger.info("BREAKDOWN BY MODULE:")
524
+ for module_name, usage in _token_usage['calls_by_module'].items():
525
+ if usage['calls'] > 0:
526
+ logger.info(f" {module_name}:")
527
+ logger.info(f" API calls: {usage['calls']}")
528
+ logger.info(f" Input tokens: {usage['input']:,}")
529
+ logger.info(f" Output tokens: {usage['output']:,}")
530
+ module_cost = (usage['input'] / 1_000_000) * 0.30 + (usage['output'] / 1_000_000) * 2.50
531
+ logger.info(f" Module cost: ${module_cost:.4f} USD")
532
+
458
533
  logger.info("="*60)
459
534
 
460
535
  return final_output
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.19
3
+ Version: 0.4.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,16 @@
1
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
+ debase/_version.py,sha256=qJBDmAhFTv4pYwvc3Umy_Lwc_v8doVBVCiysT7Eoh3E,49
4
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
+ debase/cleanup_sequence.py,sha256=ngxb_tPekjCWvampAjyuFqK4wLk_meFSj_TwfvOxheQ,33978
6
+ debase/enzyme_lineage_extractor.py,sha256=laIw9A5AuJ_kJe9h6Fp_WzMh_ctCN31bo2b2-RKrFd4,124019
7
+ debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
8
+ debase/reaction_info_extractor.py,sha256=xRyYoQKqSzer-k8FZwg55nDd0D-6QBc0F-HAyfvisG0,150368
9
+ debase/substrate_scope_extractor.py,sha256=JLXHEEeMDFiFQRt8gVCnnhimrxDF23-z0jq3N4-3gn8,101469
10
+ debase/wrapper.py,sha256=TGU5eq0qWTrkRR35ztsp8WMb1E9Nt64BdbHuYHROmYA,24279
11
+ debase-0.4.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
12
+ debase-0.4.1.dist-info/METADATA,sha256=kSH58QfBv6WGb8Ds3mcei-DUmWQormuSyPHNOmpbcQ8,10789
13
+ debase-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ debase-0.4.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
15
+ debase-0.4.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
16
+ debase-0.4.1.dist-info/RECORD,,
debase/PIPELINE_FLOW.md DELETED
@@ -1,100 +0,0 @@
1
- # DEBase Pipeline Flow
2
-
3
- ## Overview
4
- The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
5
-
6
- ## Pipeline Architecture
7
-
8
- ```
9
- ┌─────────────────────┐ ┌─────────────────────┐
10
- │ Manuscript PDF │ │ SI PDF │
11
- └──────────┬──────────┘ └──────────┬──────────┘
12
- │ │
13
- └───────────┬───────────────┘
14
-
15
-
16
- ┌─────────────────────────────┐
17
- │ 1. enzyme_lineage_extractor │
18
- │ - Extract enzyme variants │
19
- │ - Parse mutations │
20
- │ - Get basic metadata │
21
- └─────────────┬───────────────┘
22
-
23
-
24
- ┌─────────────────────────────┐
25
- │ 2. cleanup_sequence │
26
- │ - Validate sequences │
27
- │ - Fix formatting issues │
28
- │ - Generate full sequences │
29
- └─────────────┬───────────────┘
30
-
31
- ┌───────────┴───────────────┐
32
- │ │
33
- ▼ ▼
34
- ┌─────────────────────────┐ ┌─────────────────────────┐
35
- │ 3a. reaction_info │ │ 3b. substrate_scope │
36
- │ _extractor │ │ _extractor │
37
- │ - Performance metrics │ │ - Substrate variations │
38
- │ - Model reaction │ │ - Additional variants │
39
- │ - Conditions │ │ - Scope data │
40
- └───────────┬─────────────┘ └───────────┬─────────────┘
41
- │ │
42
- └───────────┬───────────────┘
43
-
44
-
45
- ┌─────────────────────────────┐
46
- │ 4. lineage_format │
47
- │ - Merge all data │
48
- │ - Fill missing sequences │
49
- │ - Format final output │
50
- └─────────────┬───────────────┘
51
-
52
-
53
- ┌─────────────┐
54
- │ Final CSV │
55
- └─────────────┘
56
- ```
57
-
58
- ## Module Details
59
-
60
- ### 1. enzyme_lineage_extractor.py
61
- - **Input**: Manuscript PDF, SI PDF
62
- - **Output**: CSV with enzyme variants and mutations
63
- - **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
64
-
65
- ### 2. cleanup_sequence.py
66
- - **Input**: Enzyme lineage CSV
67
- - **Output**: CSV with validated sequences
68
- - **Function**: Validates protein sequences, generates full sequences from mutations
69
-
70
- ### 3a. reaction_info_extractor.py
71
- - **Input**: PDFs + cleaned enzyme CSV
72
- - **Output**: CSV with reaction performance data
73
- - **Function**: Extracts yield, TTN, selectivity, reaction conditions
74
-
75
- ### 3b. substrate_scope_extractor.py
76
- - **Input**: PDFs + cleaned enzyme CSV
77
- - **Output**: CSV with substrate scope entries
78
- - **Function**: Extracts substrate variations tested with different enzymes
79
-
80
- ### 4. lineage_format.py
81
- - **Input**: Reaction CSV + Substrate scope CSV
82
- - **Output**: Final formatted CSV
83
- - **Function**: Merges data, fills missing sequences, applies consistent formatting
84
-
85
- ## Key Features
86
-
87
- 1. **Modular Design**: Each step can be run independently
88
- 2. **Parallel Extraction**: Steps 3a and 3b run independently
89
- 3. **Error Recovery**: Pipeline can resume from any step
90
- 4. **Clean Interfaces**: Each module has well-defined inputs/outputs
91
-
92
- ## Usage
93
-
94
- ```bash
95
- # Full pipeline
96
- python -m debase.wrapper manuscript.pdf --si si.pdf --output results.csv
97
-
98
- # With intermediate files kept for debugging
99
- python -m debase.wrapper manuscript.pdf --si si.pdf --keep-intermediates
100
- ```
@@ -1,17 +0,0 @@
1
- debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=VbYiJzmzValsIDmCyQWPabFFsmy_TQ_Qp35j2mo-UKc,50
5
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
- debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
8
- debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
9
- debase/reaction_info_extractor.py,sha256=otj8D3MnrThhUR_xOCc3sSVIw8hrCKnB4OY6y6NnaWA,116674
10
- debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
11
- debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
12
- debase-0.1.19.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.19.dist-info/METADATA,sha256=i1dFEB8kPkfTt8q8hJpAAAkZA29T2kb1bzPFMjzPdJU,10790
14
- debase-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.19.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.19.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.19.dist-info/RECORD,,