debase 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff compares two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -278,30 +278,30 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
278
278
 
279
279
  # Check if this page contains the figure caption
280
280
  found = False
281
- caption_instances = None
281
+ caption_rect = None
282
282
 
283
- # Look for figure caption
284
- variations = [
285
- f"{base_figure_ref}.", # "Figure 1." - most reliable
286
- f"{base_figure_ref} ", # "Figure 1 "
287
- base_figure_ref,
283
+ # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
284
+ figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
285
+
286
+ # Look for actual figure captions using regex patterns
287
+ caption_patterns = [
288
+ rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
289
+ rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
290
+ rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
291
+ rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
292
+ rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
288
293
  ]
289
294
 
290
- for variation in variations:
291
- caption_instances = page.search_for(variation, quads=False)
292
- if caption_instances:
293
- # Check if this is likely a caption (not a reference in text)
294
- for rect in caption_instances:
295
- # Get text around this location
296
- x0, y0, x1, y1 = rect
297
- text_around = page.get_textbox(fitz.Rect(x0-50, y0-5, x1+300, y1+20))
298
- # Check if it looks like a figure caption
299
- if any(keyword in text_around.lower() for keyword in
300
- ['directed evolution', 'substrate scope', '(a)', '(b)', '(c)']):
301
- found = True
302
- caption_rect = rect
303
- break
304
- if found:
295
+ for pattern in caption_patterns:
296
+ matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
297
+ if matches:
298
+ # Found actual figure caption, get its position
299
+ caption_text = matches.group(0)
300
+ caption_instances = page.search_for(caption_text, quads=False)
301
+ if caption_instances:
302
+ caption_rect = caption_instances[0]
303
+ found = True
304
+ log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
305
305
  break
306
306
 
307
307
  if not found:
@@ -309,34 +309,68 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
309
309
 
310
310
  log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
311
311
 
312
- # Extract a region of the page above the caption
313
- # The figure should be between the top of the viewable area and the caption
312
+ # Extract multi-page region including the figure and content below
313
+ # The figure should be between the top of the viewable area and extend to subsequent pages
314
314
  page_rect = page.rect
315
315
 
316
316
  # Define the region to extract
317
- # Extract everything above the caption
317
+ # Extract everything above the caption plus additional content from subsequent pages
318
318
  top_margin = 0 # Start from the very top of the page
319
- bottom_margin = 5 # Small margin above caption
319
+ additional_pages = 2 # Number of additional pages to include
320
320
  left_margin = 0 # Use full page width
321
321
  right_margin = 0
322
322
 
323
- # Calculate the figure region - everything from top to caption
323
+ # Calculate the figure region for the first page
324
324
  fig_top = top_margin
325
- fig_bottom = caption_rect.y0 - bottom_margin
325
+ fig_bottom = max(caption_rect.y0 + 150, page_rect.height) # At least 150px below caption or full page
326
326
  fig_left = left_margin
327
327
  fig_right = page_rect.width - right_margin
328
328
 
329
- # Create the clip rectangle
330
- clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
329
+ # Create list to store all page images
330
+ page_images = []
331
331
 
332
- # Extract the region as an image
332
+ # Extract first page (from top to bottom)
333
+ clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
333
334
  mat = fitz.Matrix(2, 2) # 2x zoom for better quality
334
335
  pix = page.get_pixmap(clip=clip_rect, matrix=mat)
336
+ page_images.append(pix)
337
+
338
+ # Extract additional pages if they exist
339
+ for additional_page_offset in range(1, additional_pages + 1):
340
+ next_page_num = page_num + additional_page_offset
341
+ if next_page_num < doc.page_count:
342
+ next_page = doc.load_page(next_page_num)
343
+ next_page_rect = next_page.rect
344
+
345
+ # Extract full page for additional pages
346
+ next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
347
+ next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
348
+ page_images.append(next_pix)
349
+ log.info("Added page %d to multi-page extraction", next_page_num + 1)
350
+
351
+ # Combine all page images vertically
352
+ if len(page_images) == 1:
353
+ # Single page extraction
354
+ combined_pix = page_images[0]
355
+ else:
356
+ # Multi-page extraction - combine vertically
357
+ total_width = max(pix.width for pix in page_images)
358
+ total_height = sum(pix.height for pix in page_images)
359
+
360
+ # Create a new pixmap to hold the combined image
361
+ combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
362
+ combined_pix.clear_with(255) # White background
363
+
364
+ current_y = 0
365
+ for pix in page_images:
366
+ # Copy each page image to the combined image
367
+ combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
368
+ current_y += pix.height
335
369
 
336
370
  # Convert to PNG
337
- img_bytes = pix.tobytes("png")
338
- log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
339
- clip_rect.width * 2, clip_rect.height * 2, page_num + 1)
371
+ img_bytes = combined_pix.tobytes("png")
372
+ log.info("Extracted multi-page figure region: %dx%d pixels from %d pages starting at page %d",
373
+ combined_pix.width, combined_pix.height, len(page_images), page_num + 1)
340
374
 
341
375
  return b64encode(img_bytes).decode()
342
376
 
@@ -653,10 +687,25 @@ def get_model():
653
687
  def _extract_text(resp) -> str:
654
688
  """
655
689
  Pull the *first* textual part out of a GenerativeAI response, handling both
656
- the old prerelease SDK and the >=1.0 SDK.
690
+ the old prerelease SDK and the >=1.0 SDK. Also tracks token usage.
657
691
 
658
692
  Returns an empty string if no textual content is found.
659
693
  """
694
+ # Track token usage if available
695
+ try:
696
+ if hasattr(resp, 'usage_metadata'):
697
+ input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
698
+ output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
699
+ if input_tokens or output_tokens:
700
+ # Import wrapper token tracking
701
+ try:
702
+ from .wrapper import add_token_usage
703
+ add_token_usage('substrate_scope_extractor', input_tokens, output_tokens)
704
+ except ImportError:
705
+ pass # wrapper not available
706
+ except Exception:
707
+ pass # token tracking is best-effort
708
+
660
709
  # 1) Legacy SDK (<= 0.4) - still has nice `.text`
661
710
  if getattr(resp, "text", None):
662
711
  return resp.text
debase/wrapper.py CHANGED
@@ -19,6 +19,7 @@ import logging
19
19
  import time
20
20
  from datetime import datetime
21
21
  from pathlib import Path
22
+ import threading
22
23
 
23
24
  # Setup logging
24
25
  logging.basicConfig(
@@ -27,6 +28,53 @@ logging.basicConfig(
27
28
  )
28
29
  logger = logging.getLogger("EnzymePipeline")
29
30
 
31
+ # Global token tracking
32
+ _token_lock = threading.Lock()
33
+ _token_usage = {
34
+ 'total_input_tokens': 0,
35
+ 'total_output_tokens': 0,
36
+ 'calls_by_module': {
37
+ 'enzyme_lineage_extractor': {'input': 0, 'output': 0, 'calls': 0},
38
+ 'reaction_info_extractor': {'input': 0, 'output': 0, 'calls': 0},
39
+ 'substrate_scope_extractor': {'input': 0, 'output': 0, 'calls': 0}
40
+ }
41
+ }
42
+
43
+ def add_token_usage(module_name: str, input_tokens: int, output_tokens: int):
44
+ """Add token usage from a module to the global tracking."""
45
+ with _token_lock:
46
+ _token_usage['total_input_tokens'] += input_tokens
47
+ _token_usage['total_output_tokens'] += output_tokens
48
+ if module_name in _token_usage['calls_by_module']:
49
+ _token_usage['calls_by_module'][module_name]['input'] += input_tokens
50
+ _token_usage['calls_by_module'][module_name]['output'] += output_tokens
51
+ _token_usage['calls_by_module'][module_name]['calls'] += 1
52
+
53
+ def calculate_token_usage_and_cost():
54
+ """Calculate total token usage and estimated cost for Gemini 2.5 Flash."""
55
+ with _token_lock:
56
+ total_input = _token_usage['total_input_tokens']
57
+ total_output = _token_usage['total_output_tokens']
58
+
59
+ # Gemini 2.5 Flash pricing (as of 2025)
60
+ # Input: $0.30 per 1M tokens
61
+ # Output: $2.50 per 1M tokens
62
+ input_cost = (total_input / 1_000_000) * 0.30
63
+ output_cost = (total_output / 1_000_000) * 2.50
64
+ total_cost = input_cost + output_cost
65
+
66
+ return total_input, total_output, total_cost
67
+
68
+ def reset_token_usage():
69
+ """Reset token usage counters."""
70
+ with _token_lock:
71
+ _token_usage['total_input_tokens'] = 0
72
+ _token_usage['total_output_tokens'] = 0
73
+ for module_data in _token_usage['calls_by_module'].values():
74
+ module_data['input'] = 0
75
+ module_data['output'] = 0
76
+ module_data['calls'] = 0
77
+
30
78
 
31
79
  def run_lineage_extraction(manuscript: Path, si: Path, output: Path, debug_dir: Path = None) -> Path:
32
80
  """
@@ -405,6 +453,9 @@ def run_pipeline(
405
453
  substrate_csv = output_dir / "3b_substrate_scope.csv"
406
454
 
407
455
  try:
456
+ # Reset token usage tracking for this pipeline run
457
+ reset_token_usage()
458
+
408
459
  logger.info("="*60)
409
460
  logger.info("Starting DEBase Enzyme Analysis Pipeline")
410
461
  logger.info(f"Manuscript: {manuscript_path}")
@@ -449,12 +500,36 @@ def run_pipeline(
449
500
  else:
450
501
  logger.info("Note: Use --keep-intermediates to save intermediate files")
451
502
 
503
+ # Calculate token usage and estimated costs
504
+ total_input_tokens, total_output_tokens, estimated_cost = calculate_token_usage_and_cost()
505
+
452
506
  logger.info("\n" + "="*60)
453
507
  logger.info("PIPELINE COMPLETED SUCCESSFULLY")
454
508
  logger.info(f"Comprehensive output: {output_path}")
455
509
  if final_output != output_path:
456
510
  logger.info(f"Plate-based output: {final_output}")
457
511
  logger.info(f"Runtime: {elapsed:.1f} seconds")
512
+ logger.info("")
513
+ logger.info("TOKEN USAGE & COST ESTIMATE:")
514
+ logger.info(f" Input tokens: {total_input_tokens:,}")
515
+ logger.info(f" Output tokens: {total_output_tokens:,}")
516
+ logger.info(f" Total tokens: {total_input_tokens + total_output_tokens:,}")
517
+ logger.info(f" Estimated cost: ${estimated_cost:.4f} USD")
518
+ logger.info(" (Based on Gemini 2.5 Flash pricing: $0.30/1M input, $2.50/1M output)")
519
+ logger.info("")
520
+
521
+ # Show breakdown by module
522
+ with _token_lock:
523
+ logger.info("BREAKDOWN BY MODULE:")
524
+ for module_name, usage in _token_usage['calls_by_module'].items():
525
+ if usage['calls'] > 0:
526
+ logger.info(f" {module_name}:")
527
+ logger.info(f" API calls: {usage['calls']}")
528
+ logger.info(f" Input tokens: {usage['input']:,}")
529
+ logger.info(f" Output tokens: {usage['output']:,}")
530
+ module_cost = (usage['input'] / 1_000_000) * 0.30 + (usage['output'] / 1_000_000) * 2.50
531
+ logger.info(f" Module cost: ${module_cost:.4f} USD")
532
+
458
533
  logger.info("="*60)
459
534
 
460
535
  return final_output
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.1.18
3
+ Version: 0.4.0
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,16 @@
1
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
+ debase/_version.py,sha256=X9pfcQjm1Y8ILtLtdscGnfFKSp5XWTeamXgSHPOw2K0,49
4
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
+ debase/cleanup_sequence.py,sha256=ngxb_tPekjCWvampAjyuFqK4wLk_meFSj_TwfvOxheQ,33978
6
+ debase/enzyme_lineage_extractor.py,sha256=laIw9A5AuJ_kJe9h6Fp_WzMh_ctCN31bo2b2-RKrFd4,124019
7
+ debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
8
+ debase/reaction_info_extractor.py,sha256=xRyYoQKqSzer-k8FZwg55nDd0D-6QBc0F-HAyfvisG0,150368
9
+ debase/substrate_scope_extractor.py,sha256=ny4n_J4SDFQnxhCHHHan1xouqM8FkueJm_z-hm6gr-o,103761
10
+ debase/wrapper.py,sha256=TGU5eq0qWTrkRR35ztsp8WMb1E9Nt64BdbHuYHROmYA,24279
11
+ debase-0.4.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
12
+ debase-0.4.0.dist-info/METADATA,sha256=Qpvyi4nbq_wmhbl_089pRIlGAubVxjwVfX1eUSK3lLY,10789
13
+ debase-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ debase-0.4.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
15
+ debase-0.4.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
16
+ debase-0.4.0.dist-info/RECORD,,
debase/PIPELINE_FLOW.md DELETED
@@ -1,100 +0,0 @@
1
- # DEBase Pipeline Flow
2
-
3
- ## Overview
4
- The DEBase pipeline extracts enzyme engineering data from chemistry papers through a series of modular steps.
5
-
6
- ## Pipeline Architecture
7
-
8
- ```
9
- ┌─────────────────────┐ ┌─────────────────────┐
10
- │ Manuscript PDF │ │ SI PDF │
11
- └──────────┬──────────┘ └──────────┬──────────┘
12
- │ │
13
- └───────────┬───────────────┘
14
-
15
-
16
- ┌─────────────────────────────┐
17
- │ 1. enzyme_lineage_extractor │
18
- │ - Extract enzyme variants │
19
- │ - Parse mutations │
20
- │ - Get basic metadata │
21
- └─────────────┬───────────────┘
22
-
23
-
24
- ┌─────────────────────────────┐
25
- │ 2. cleanup_sequence │
26
- │ - Validate sequences │
27
- │ - Fix formatting issues │
28
- │ - Generate full sequences │
29
- └─────────────┬───────────────┘
30
-
31
- ┌───────────┴───────────────┐
32
- │ │
33
- ▼ ▼
34
- ┌─────────────────────────┐ ┌─────────────────────────┐
35
- │ 3a. reaction_info │ │ 3b. substrate_scope │
36
- │ _extractor │ │ _extractor │
37
- │ - Performance metrics │ │ - Substrate variations │
38
- │ - Model reaction │ │ - Additional variants │
39
- │ - Conditions │ │ - Scope data │
40
- └───────────┬─────────────┘ └───────────┬─────────────┘
41
- │ │
42
- └───────────┬───────────────┘
43
-
44
-
45
- ┌─────────────────────────────┐
46
- │ 4. lineage_format │
47
- │ - Merge all data │
48
- │ - Fill missing sequences │
49
- │ - Format final output │
50
- └─────────────┬───────────────┘
51
-
52
-
53
- ┌─────────────┐
54
- │ Final CSV │
55
- └─────────────┘
56
- ```
57
-
58
- ## Module Details
59
-
60
- ### 1. enzyme_lineage_extractor.py
61
- - **Input**: Manuscript PDF, SI PDF
62
- - **Output**: CSV with enzyme variants and mutations
63
- - **Function**: Extracts enzyme identifiers, mutation lists, and basic metadata
64
-
65
- ### 2. cleanup_sequence.py
66
- - **Input**: Enzyme lineage CSV
67
- - **Output**: CSV with validated sequences
68
- - **Function**: Validates protein sequences, generates full sequences from mutations
69
-
70
- ### 3a. reaction_info_extractor.py
71
- - **Input**: PDFs + cleaned enzyme CSV
72
- - **Output**: CSV with reaction performance data
73
- - **Function**: Extracts yield, TTN, selectivity, reaction conditions
74
-
75
- ### 3b. substrate_scope_extractor.py
76
- - **Input**: PDFs + cleaned enzyme CSV
77
- - **Output**: CSV with substrate scope entries
78
- - **Function**: Extracts substrate variations tested with different enzymes
79
-
80
- ### 4. lineage_format.py
81
- - **Input**: Reaction CSV + Substrate scope CSV
82
- - **Output**: Final formatted CSV
83
- - **Function**: Merges data, fills missing sequences, applies consistent formatting
84
-
85
- ## Key Features
86
-
87
- 1. **Modular Design**: Each step can be run independently
88
- 2. **Parallel Extraction**: Steps 3a and 3b run independently
89
- 3. **Error Recovery**: Pipeline can resume from any step
90
- 4. **Clean Interfaces**: Each module has well-defined inputs/outputs
91
-
92
- ## Usage
93
-
94
- ```bash
95
- # Full pipeline
96
- python -m debase.wrapper manuscript.pdf --si si.pdf --output results.csv
97
-
98
- # With intermediate files kept for debugging
99
- python -m debase.wrapper manuscript.pdf --si si.pdf --keep-intermediates
100
- ```
@@ -1,17 +0,0 @@
1
- debase/PIPELINE_FLOW.md,sha256=S4nQyZlX39-Bchw1gQWPK60sHiFpB1eWHqo5GR9oTY8,4741
2
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
3
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
4
- debase/_version.py,sha256=Qd1kKsssesKE5FvJnDdAuZsx_BrxTSJJyt68SK99D54,50
5
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
6
- debase/cleanup_sequence.py,sha256=QyhUqvTBVFTGM7ebAHmP3tif3Jq-8hvoLApYwAJtpH4,32702
7
- debase/enzyme_lineage_extractor.py,sha256=xbNKkIMRCM2dYHsX24vWX1EsQINaGSWBj-iTX10B8Mw,117057
8
- debase/lineage_format.py,sha256=IS9ig-Uv7KxtI9enZKM6YgQ7sitqwOo4cdXbOy38J3s,34232
9
- debase/reaction_info_extractor.py,sha256=W9CS0puFTdhJ_T2Fpy931EgnjOCsHHjbtU6RdnzDlhw,113140
10
- debase/substrate_scope_extractor.py,sha256=9XDF-DxOqB63AwaVceAMvg7BcjoTQXE_pG2c_seM_DA,100698
11
- debase/wrapper.py,sha256=V9bs8ZiyCpJHMM5VuN74kiKdkQRVU6vyvLKCrO1BUB8,20890
12
- debase-0.1.18.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
13
- debase-0.1.18.dist-info/METADATA,sha256=XvSrveJ0Y40c53JYUfiveaQNJ3qoEkxaQ61n3_--1cQ,10790
14
- debase-0.1.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
- debase-0.1.18.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
16
- debase-0.1.18.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
17
- debase-0.1.18.dist-info/RECORD,,