debase 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/campaign_utils.py +146 -0
- debase/caption_pattern.py +39 -0
- debase/enzyme_lineage_extractor.py +58 -20
- debase/reaction_info_extractor.py +407 -75
- debase/substrate_scope_extractor.py +124 -49
- debase/wrapper.py +3 -3
- {debase-0.6.0.dist-info → debase-0.6.1.dist-info}/METADATA +1 -1
- debase-0.6.1.dist-info/RECORD +18 -0
- debase-0.6.0.dist-info/RECORD +0 -16
- {debase-0.6.0.dist-info → debase-0.6.1.dist-info}/WHEEL +0 -0
- {debase-0.6.0.dist-info → debase-0.6.1.dist-info}/entry_points.txt +0 -0
- {debase-0.6.0.dist-info → debase-0.6.1.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.0.dist-info → debase-0.6.1.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,13 @@ import re
|
|
28
28
|
import json
|
29
29
|
import time
|
30
30
|
import logging
|
31
|
+
|
32
|
+
# Import universal caption pattern
|
33
|
+
try:
|
34
|
+
from .caption_pattern import get_universal_caption_pattern
|
35
|
+
except ImportError:
|
36
|
+
# Fallback if running as standalone script
|
37
|
+
from caption_pattern import get_universal_caption_pattern
|
31
38
|
import subprocess
|
32
39
|
from pathlib import Path
|
33
40
|
from dataclasses import dataclass, field
|
@@ -183,13 +190,8 @@ except ImportError as exc: # pragma: no cover
|
|
183
190
|
|
184
191
|
from base64 import b64encode
|
185
192
|
|
186
|
-
#
|
187
|
-
|
188
|
-
# This catches all variations including "Table S 2", "Figure.", etc.
|
189
|
-
_CAPTION_PREFIX_RE = re.compile(
|
190
|
-
r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|Supplementary\s+(?:Table|Figure)).*",
|
191
|
-
re.I | re.M
|
192
|
-
)
|
193
|
+
# Use universal caption pattern
|
194
|
+
_CAPTION_PREFIX_RE = get_universal_caption_pattern()
|
193
195
|
|
194
196
|
def _open_doc(pdf_path: str | Path | bytes):
|
195
197
|
if isinstance(pdf_path, (str, Path)):
|
@@ -485,61 +487,134 @@ def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str
|
|
485
487
|
return None
|
486
488
|
|
487
489
|
|
488
|
-
def
|
489
|
-
"""
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
# Try to extract base figure/table reference
|
494
|
-
base_ref_match = re.match(r'((?:Figure|Fig|Table)\s*\d+)', ref, re.IGNORECASE)
|
495
|
-
base_ref = base_ref_match.group(1) if base_ref_match else ref
|
490
|
+
def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
|
491
|
+
"""Build an index of all captions for quick lookup."""
|
492
|
+
cap_pattern = get_universal_caption_pattern()
|
493
|
+
caption_index = {}
|
496
494
|
|
497
|
-
for pdf_path in pdf_paths:
|
495
|
+
for pdf_idx, pdf_path in enumerate(pdf_paths):
|
498
496
|
doc = _open_doc(pdf_path)
|
497
|
+
source = "manuscript" if pdf_idx == 0 else "supplementary"
|
498
|
+
|
499
499
|
try:
|
500
500
|
for page_num in range(doc.page_count):
|
501
501
|
page = doc.load_page(page_num)
|
502
502
|
page_text = page.get_text()
|
503
503
|
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
base_ref, # Base reference (e.g., "Figure 3")
|
508
|
-
ref.replace("(", " ").replace(")", ""), # "Figure 3 a"
|
509
|
-
ref.replace("(", "").replace(")", ""), # "Figure 3a"
|
510
|
-
ref.replace("Figure", "Fig"), # "Fig 3(a)"
|
511
|
-
base_ref.replace("Figure", "Fig"), # "Fig 3"
|
512
|
-
]
|
513
|
-
|
514
|
-
# Find the reference in the page
|
515
|
-
found = False
|
516
|
-
pos = -1
|
517
|
-
used_ref = ref
|
518
|
-
|
519
|
-
for ref_var in ref_variations:
|
520
|
-
if ref_var.lower() in page_text.lower():
|
521
|
-
pos = page_text.lower().find(ref_var.lower())
|
522
|
-
used_ref = ref_var
|
523
|
-
found = True
|
524
|
-
break
|
525
|
-
|
526
|
-
if found and pos >= 0:
|
527
|
-
# Extract context around it
|
528
|
-
start = max(0, pos - context_chars)
|
529
|
-
end = min(len(page_text), pos + len(used_ref) + context_chars)
|
504
|
+
for match in cap_pattern.finditer(page_text):
|
505
|
+
caption_text = match.group(0).strip()
|
506
|
+
caption_lower = caption_text.lower()
|
530
507
|
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
508
|
+
# Store caption info
|
509
|
+
caption_info = {
|
510
|
+
'full_caption': caption_text,
|
511
|
+
'page_text': page_text,
|
512
|
+
'page_num': page_num + 1,
|
513
|
+
'pdf_path': pdf_path,
|
514
|
+
'source': source,
|
515
|
+
'match_start': match.start(),
|
516
|
+
'doc': doc # Keep doc reference for page extraction
|
517
|
+
}
|
518
|
+
|
519
|
+
# Create multiple keys for flexible matching
|
520
|
+
# Key 1: Full caption text (first 100 chars)
|
521
|
+
key1 = caption_text[:100].lower().strip()
|
522
|
+
caption_index[key1] = caption_info
|
523
|
+
|
524
|
+
# Key 2: Simplified reference (e.g., "table 5", "figure s3")
|
525
|
+
ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
|
526
|
+
if ref_match:
|
527
|
+
key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
|
528
|
+
caption_index[key2] = caption_info
|
529
|
+
|
530
|
+
# Also store with 's' prefix if in SI
|
531
|
+
if source == "supplementary" and 's' not in key2:
|
532
|
+
key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
|
533
|
+
caption_index[key3] = caption_info
|
536
534
|
finally:
|
537
535
|
doc.close()
|
538
536
|
|
537
|
+
return caption_index
|
538
|
+
|
539
|
+
def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
|
540
|
+
"""Extract text around a specific reference using caption index."""
|
541
|
+
import re
|
542
|
+
|
543
|
+
# Build caption index if not already built
|
544
|
+
if not hasattr(_extract_text_around_reference, '_caption_index'):
|
545
|
+
_extract_text_around_reference._caption_index = _build_caption_index(pdf_paths)
|
546
|
+
|
547
|
+
caption_index = _extract_text_around_reference._caption_index
|
548
|
+
ref_lower = ref.lower().strip()
|
549
|
+
|
550
|
+
# Try multiple matching strategies
|
551
|
+
matches = []
|
552
|
+
|
553
|
+
# Strategy 1: Direct key lookup
|
554
|
+
if ref_lower in caption_index:
|
555
|
+
matches.append(caption_index[ref_lower])
|
556
|
+
|
557
|
+
# Strategy 2: Normalized reference lookup
|
558
|
+
ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
|
559
|
+
if ref_match:
|
560
|
+
ref_type, ref_num = ref_match.groups()
|
561
|
+
if ref_type == 'fig':
|
562
|
+
ref_type = 'figure'
|
563
|
+
|
564
|
+
# Try different key formats
|
565
|
+
keys_to_try = [
|
566
|
+
f"{ref_type} {ref_num}",
|
567
|
+
f"{ref_type} s{ref_num}",
|
568
|
+
f"table {ref_num}",
|
569
|
+
f"fig {ref_num}",
|
570
|
+
f"figure {ref_num}"
|
571
|
+
]
|
572
|
+
|
573
|
+
for key in keys_to_try:
|
574
|
+
if key in caption_index and caption_index[key] not in matches:
|
575
|
+
matches.append(caption_index[key])
|
576
|
+
|
577
|
+
# Strategy 3: Fuzzy matching
|
578
|
+
if not matches and ref_match:
|
579
|
+
for key, info in caption_index.items():
|
580
|
+
if ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
|
581
|
+
if info not in matches:
|
582
|
+
matches.append(info)
|
583
|
+
|
584
|
+
# Extract text from matches
|
585
|
+
extracted_sections = []
|
586
|
+
for match in matches:
|
587
|
+
page_text = match['page_text']
|
588
|
+
caption_start = match['match_start']
|
589
|
+
|
590
|
+
# Extract context around the caption
|
591
|
+
start = max(0, caption_start - context_chars // 2)
|
592
|
+
end = min(len(page_text), caption_start + context_chars)
|
593
|
+
|
594
|
+
section = page_text[start:end]
|
595
|
+
source_label = f"{match['source'].upper()} page {match['page_num']}"
|
596
|
+
extracted_sections.append(f"\n[From {source_label}]\n{section}")
|
597
|
+
|
539
598
|
if not extracted_sections:
|
540
|
-
log.warning("
|
599
|
+
log.warning(f"No matches found for reference '{ref}'")
|
600
|
+
# Fallback to old approach
|
601
|
+
for pdf_path in pdf_paths:
|
602
|
+
doc = _open_doc(pdf_path)
|
603
|
+
try:
|
604
|
+
for page_num in range(doc.page_count):
|
605
|
+
page = doc.load_page(page_num)
|
606
|
+
page_text = page.get_text()
|
607
|
+
|
608
|
+
if ref_lower in page_text.lower():
|
609
|
+
pos = page_text.lower().find(ref_lower)
|
610
|
+
start = max(0, pos - context_chars // 2)
|
611
|
+
end = min(len(page_text), pos + context_chars)
|
612
|
+
extracted_sections.append(page_text[start:end])
|
613
|
+
break
|
614
|
+
finally:
|
615
|
+
doc.close()
|
541
616
|
|
542
|
-
return "\n".join(extracted_sections)
|
617
|
+
return "\n\n".join(extracted_sections)
|
543
618
|
|
544
619
|
def _extract_sections_by_title(pdf_paths: List[Path], section_titles: List[str], max_chars_per_section: int = 10000) -> str:
|
545
620
|
"""Extract sections by their titles from PDFs."""
|
debase/wrapper.py
CHANGED
@@ -7,10 +7,10 @@ Pipeline flow:
|
|
7
7
|
2. cleanup_sequence.py - Clean and validate protein sequences
|
8
8
|
3. reaction_info_extractor.py - Extract reaction performance metrics
|
9
9
|
4. substrate_scope_extractor.py - Extract substrate scope data (runs independently)
|
10
|
-
5.
|
10
|
+
5. lineage_format.py - Format and merge all data into final CSV
|
11
11
|
|
12
12
|
The reaction_info and substrate_scope extractors run in parallel,
|
13
|
-
then their outputs are combined in
|
13
|
+
then their outputs are combined in lineage_format.
|
14
14
|
"""
|
15
15
|
import os
|
16
16
|
import sys
|
@@ -579,7 +579,7 @@ Pipeline steps:
|
|
579
579
|
2. cleanup_sequence - Validate and clean protein sequences
|
580
580
|
3. reaction_info_extractor - Extract reaction performance metrics
|
581
581
|
4. substrate_scope_extractor - Extract substrate scope data
|
582
|
-
5.
|
582
|
+
5. lineage_format - Format and merge into final CSV
|
583
583
|
|
584
584
|
The pipeline automatically handles all steps sequentially.
|
585
585
|
"""
|
@@ -0,0 +1,18 @@
|
|
1
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
+
debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
|
4
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
+
debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
|
6
|
+
debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
|
7
|
+
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
8
|
+
debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
|
9
|
+
debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
|
10
|
+
debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
|
11
|
+
debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
|
12
|
+
debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
|
13
|
+
debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
14
|
+
debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
|
15
|
+
debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
16
|
+
debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
17
|
+
debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
18
|
+
debase-0.6.1.dist-info/RECORD,,
|
debase-0.6.0.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=C3pfAAiv8eypvKmkKXrxXIwx9sLe4TtShkzld5GJJ7k,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
6
|
-
debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
|
7
|
-
debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
|
8
|
-
debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
|
9
|
-
debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
|
10
|
-
debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
|
11
|
-
debase-0.6.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
12
|
-
debase-0.6.0.dist-info/METADATA,sha256=o_Iq-jjk_ekBLLheIQZuLEeC7tWbduq70b5jqfiCmro,4047
|
13
|
-
debase-0.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
14
|
-
debase-0.6.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
15
|
-
debase-0.6.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
16
|
-
debase-0.6.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|