debase 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,13 @@ import re
28
28
  import json
29
29
  import time
30
30
  import logging
31
+
32
+ # Import universal caption pattern
33
+ try:
34
+ from .caption_pattern import get_universal_caption_pattern
35
+ except ImportError:
36
+ # Fallback if running as standalone script
37
+ from caption_pattern import get_universal_caption_pattern
31
38
  import subprocess
32
39
  from pathlib import Path
33
40
  from dataclasses import dataclass, field
@@ -183,13 +190,8 @@ except ImportError as exc: # pragma: no cover
183
190
 
184
191
  from base64 import b64encode
185
192
 
186
- # Improved caption prefix regex - captures most journal variants
187
- # Simplified pattern: match any line starting with Table, Figure, Scheme, Chart, etc.
188
- # This catches all variations including "Table S 2", "Figure.", etc.
189
- _CAPTION_PREFIX_RE = re.compile(
190
- r"^(Table|Figure|Fig|Scheme|Chart|Extended\s+Data\s+Fig|ED\s+Fig|Supplementary\s+(?:Table|Figure)).*",
191
- re.I | re.M
192
- )
193
+ # Use universal caption pattern
194
+ _CAPTION_PREFIX_RE = get_universal_caption_pattern()
193
195
 
194
196
  def _open_doc(pdf_path: str | Path | bytes):
195
197
  if isinstance(pdf_path, (str, Path)):
@@ -485,61 +487,134 @@ def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str
485
487
  return None
486
488
 
487
489
 
488
- def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
489
- """Extract text around a specific reference (e.g., 'Figure 3')."""
490
- import re
491
- extracted_sections = []
492
-
493
- # Try to extract base figure/table reference
494
- base_ref_match = re.match(r'((?:Figure|Fig|Table)\s*\d+)', ref, re.IGNORECASE)
495
- base_ref = base_ref_match.group(1) if base_ref_match else ref
490
+ def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
491
+ """Build an index of all captions for quick lookup."""
492
+ cap_pattern = get_universal_caption_pattern()
493
+ caption_index = {}
496
494
 
497
- for pdf_path in pdf_paths:
495
+ for pdf_idx, pdf_path in enumerate(pdf_paths):
498
496
  doc = _open_doc(pdf_path)
497
+ source = "manuscript" if pdf_idx == 0 else "supplementary"
498
+
499
499
  try:
500
500
  for page_num in range(doc.page_count):
501
501
  page = doc.load_page(page_num)
502
502
  page_text = page.get_text()
503
503
 
504
- # Try different variations of the reference
505
- ref_variations = [
506
- ref, # Original (e.g., "Figure 3(a)")
507
- base_ref, # Base reference (e.g., "Figure 3")
508
- ref.replace("(", " ").replace(")", ""), # "Figure 3 a"
509
- ref.replace("(", "").replace(")", ""), # "Figure 3a"
510
- ref.replace("Figure", "Fig"), # "Fig 3(a)"
511
- base_ref.replace("Figure", "Fig"), # "Fig 3"
512
- ]
513
-
514
- # Find the reference in the page
515
- found = False
516
- pos = -1
517
- used_ref = ref
518
-
519
- for ref_var in ref_variations:
520
- if ref_var.lower() in page_text.lower():
521
- pos = page_text.lower().find(ref_var.lower())
522
- used_ref = ref_var
523
- found = True
524
- break
525
-
526
- if found and pos >= 0:
527
- # Extract context around it
528
- start = max(0, pos - context_chars)
529
- end = min(len(page_text), pos + len(used_ref) + context_chars)
504
+ for match in cap_pattern.finditer(page_text):
505
+ caption_text = match.group(0).strip()
506
+ caption_lower = caption_text.lower()
530
507
 
531
- section = page_text[start:end]
532
- extracted_sections.append(
533
- f"\n=== Context around '{ref}' (found as '{used_ref}') in {pdf_path.name}, page {page_num + 1} ===\n{section}"
534
- )
535
- log.debug("Found '%s' as '%s' on page %d of %s", ref, used_ref, page_num + 1, pdf_path.name)
508
+ # Store caption info
509
+ caption_info = {
510
+ 'full_caption': caption_text,
511
+ 'page_text': page_text,
512
+ 'page_num': page_num + 1,
513
+ 'pdf_path': pdf_path,
514
+ 'source': source,
515
+ 'match_start': match.start(),
516
+ 'doc': doc # Keep doc reference for page extraction
517
+ }
518
+
519
+ # Create multiple keys for flexible matching
520
+ # Key 1: Full caption text (first 100 chars)
521
+ key1 = caption_text[:100].lower().strip()
522
+ caption_index[key1] = caption_info
523
+
524
+ # Key 2: Simplified reference (e.g., "table 5", "figure s3")
525
+ ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
526
+ if ref_match:
527
+ key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
528
+ caption_index[key2] = caption_info
529
+
530
+ # Also store with 's' prefix if in SI
531
+ if source == "supplementary" and 's' not in key2:
532
+ key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
533
+ caption_index[key3] = caption_info
536
534
  finally:
537
535
  doc.close()
538
536
 
537
+ return caption_index
538
+
539
+ def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
540
+ """Extract text around a specific reference using caption index."""
541
+ import re
542
+
543
+ # Build caption index if not already built
544
+ if not hasattr(_extract_text_around_reference, '_caption_index'):
545
+ _extract_text_around_reference._caption_index = _build_caption_index(pdf_paths)
546
+
547
+ caption_index = _extract_text_around_reference._caption_index
548
+ ref_lower = ref.lower().strip()
549
+
550
+ # Try multiple matching strategies
551
+ matches = []
552
+
553
+ # Strategy 1: Direct key lookup
554
+ if ref_lower in caption_index:
555
+ matches.append(caption_index[ref_lower])
556
+
557
+ # Strategy 2: Normalized reference lookup
558
+ ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
559
+ if ref_match:
560
+ ref_type, ref_num = ref_match.groups()
561
+ if ref_type == 'fig':
562
+ ref_type = 'figure'
563
+
564
+ # Try different key formats
565
+ keys_to_try = [
566
+ f"{ref_type} {ref_num}",
567
+ f"{ref_type} s{ref_num}",
568
+ f"table {ref_num}",
569
+ f"fig {ref_num}",
570
+ f"figure {ref_num}"
571
+ ]
572
+
573
+ for key in keys_to_try:
574
+ if key in caption_index and caption_index[key] not in matches:
575
+ matches.append(caption_index[key])
576
+
577
+ # Strategy 3: Fuzzy matching
578
+ if not matches and ref_match:
579
+ for key, info in caption_index.items():
580
+ if ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
581
+ if info not in matches:
582
+ matches.append(info)
583
+
584
+ # Extract text from matches
585
+ extracted_sections = []
586
+ for match in matches:
587
+ page_text = match['page_text']
588
+ caption_start = match['match_start']
589
+
590
+ # Extract context around the caption
591
+ start = max(0, caption_start - context_chars // 2)
592
+ end = min(len(page_text), caption_start + context_chars)
593
+
594
+ section = page_text[start:end]
595
+ source_label = f"{match['source'].upper()} page {match['page_num']}"
596
+ extracted_sections.append(f"\n[From {source_label}]\n{section}")
597
+
539
598
  if not extracted_sections:
540
- log.warning("Could not find reference '%s' or base reference '%s' in any PDF", ref, base_ref)
599
+ log.warning(f"No matches found for reference '{ref}'")
600
+ # Fallback to old approach
601
+ for pdf_path in pdf_paths:
602
+ doc = _open_doc(pdf_path)
603
+ try:
604
+ for page_num in range(doc.page_count):
605
+ page = doc.load_page(page_num)
606
+ page_text = page.get_text()
607
+
608
+ if ref_lower in page_text.lower():
609
+ pos = page_text.lower().find(ref_lower)
610
+ start = max(0, pos - context_chars // 2)
611
+ end = min(len(page_text), pos + context_chars)
612
+ extracted_sections.append(page_text[start:end])
613
+ break
614
+ finally:
615
+ doc.close()
541
616
 
542
- return "\n".join(extracted_sections)
617
+ return "\n\n".join(extracted_sections)
543
618
 
544
619
  def _extract_sections_by_title(pdf_paths: List[Path], section_titles: List[str], max_chars_per_section: int = 10000) -> str:
545
620
  """Extract sections by their titles from PDFs."""
debase/wrapper.py CHANGED
@@ -7,10 +7,10 @@ Pipeline flow:
7
7
  2. cleanup_sequence.py - Clean and validate protein sequences
8
8
  3. reaction_info_extractor.py - Extract reaction performance metrics
9
9
  4. substrate_scope_extractor.py - Extract substrate scope data (runs independently)
10
- 5. lineage_format_o3.py - Format and merge all data into final CSV
10
+ 5. lineage_format.py - Format and merge all data into final CSV
11
11
 
12
12
  The reaction_info and substrate_scope extractors run in parallel,
13
- then their outputs are combined in lineage_format_o3.
13
+ then their outputs are combined in lineage_format.
14
14
  """
15
15
  import os
16
16
  import sys
@@ -579,7 +579,7 @@ Pipeline steps:
579
579
  2. cleanup_sequence - Validate and clean protein sequences
580
580
  3. reaction_info_extractor - Extract reaction performance metrics
581
581
  4. substrate_scope_extractor - Extract substrate scope data
582
- 5. lineage_format_o3 - Format and merge into final CSV
582
+ 5. lineage_format - Format and merge into final CSV
583
583
 
584
584
  The pipeline automatically handles all steps sequentially.
585
585
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.5.1
3
+ Version: 0.6.1
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,18 @@
1
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
+ debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
4
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
+ debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
6
+ debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
7
+ debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
8
+ debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
9
+ debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
10
+ debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
11
+ debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
12
+ debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
13
+ debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
14
+ debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
15
+ debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
+ debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
17
+ debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
18
+ debase-0.6.1.dist-info/RECORD,,
@@ -1,16 +0,0 @@
1
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
- debase/_version.py,sha256=iDuv12GVbaAFXFufv3yqwn-3Hwv9Kua4nJZQ-gUNJXw,49
4
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
- debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
6
- debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
7
- debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
8
- debase/reaction_info_extractor.py,sha256=9QXbtp0RSP6QMqQ_azBWDceGIqiw2JPCg3eJ0Ba_lxA,167849
9
- debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
10
- debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
11
- debase-0.5.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
12
- debase-0.5.1.dist-info/METADATA,sha256=GoaBFl0kdh8dtrApBTMoLWH6fe5GYLiSYC5JrohbPcI,4047
13
- debase-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- debase-0.5.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
15
- debase-0.5.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
16
- debase-0.5.1.dist-info/RECORD,,
File without changes