debase 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -321,51 +321,81 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: O
321
321
  else:
322
322
  base_figure_ref = figure_ref
323
323
 
324
- # Determine search order based on document hint
325
- search_paths = list(pdf_paths) # Create a copy
324
+ # Determine which PDFs to search based on document hint
326
325
  if document_hint and len(pdf_paths) > 1:
327
326
  if document_hint.lower() == "manuscript":
328
- # Prioritize manuscript (first PDF)
329
- search_paths = [pdf_paths[0]] + pdf_paths[1:]
330
- log.info("Prioritizing manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
327
+ # ONLY search in manuscript (first PDF)
328
+ search_paths = [pdf_paths[0]]
329
+ log.info("Searching ONLY in manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
331
330
  elif document_hint.lower() == "supplementary":
332
- # Prioritize SI (second PDF if available)
333
- search_paths = [pdf_paths[1], pdf_paths[0]] if len(pdf_paths) > 1 else pdf_paths
334
- log.info("Prioritizing supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
331
+ # ONLY search in SI (second PDF if available)
332
+ search_paths = [pdf_paths[1]] if len(pdf_paths) > 1 else pdf_paths
333
+ log.info("Searching ONLY in supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
334
+ else:
335
+ # No specific hint, search all PDFs
336
+ search_paths = list(pdf_paths)
337
+ else:
338
+ # No hint or single PDF, search all available
339
+ search_paths = list(pdf_paths)
335
340
 
336
- for pdf_path in search_paths:
341
+ # Extract figure number BEFORE the loop (e.g., "Figure 3" -> "3", "Figure S3" -> "S3", "Fig. 3" -> "3")
342
+ figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '').replace('Fig. ', '').replace('fig. ', '').replace('Fig ', '').replace('fig ', '')
343
+
344
+ for pdf_idx, pdf_path in enumerate(search_paths):
337
345
  doc = _open_doc(pdf_path)
338
346
  try:
347
+ log.debug("Searching %s (document %d/%d) with %d pages for figure '%s'",
348
+ pdf_path.name, pdf_idx + 1, len(search_paths), doc.page_count, figure_num)
349
+
339
350
  for page_num in range(doc.page_count):
340
351
  page = doc.load_page(page_num)
341
352
  page_text = page.get_text()
342
353
 
354
+ # Debug: Check if page contains any mention of the figure
355
+ if figure_num.lower() in page_text.lower():
356
+ log.debug("Page %d contains mention of figure number '%s'", page_num + 1, figure_num)
357
+
343
358
  # Check if this page contains the figure caption
344
359
  found = False
345
360
  caption_rect = None
346
361
 
347
- # Extract figure number (e.g., "Figure 3" -> "3", "Figure S3" -> "S3")
348
- figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '')
349
-
350
362
  # First try to find using caption hint if provided
351
363
  if caption_hint and len(caption_hint) > 10:
352
- # Try to find the exact caption text
353
- caption_snippet = caption_hint[:100] # Use first 100 chars
354
- if caption_snippet in page_text:
355
- caption_instances = page.search_for(caption_snippet, quads=False)
356
- if caption_instances:
357
- caption_rect = caption_instances[0]
358
- found = True
359
- log.info("Found figure using caption hint on page %d", page_num + 1)
364
+ # Try to find a unique portion of the caption
365
+ # Start with longer snippets and work down to shorter ones
366
+ caption_lengths = [100, 50, 30, 20]
367
+
368
+ for length in caption_lengths:
369
+ if len(caption_hint) >= length:
370
+ snippet = caption_hint[:length]
371
+ # Clean up the snippet to avoid partial words
372
+ if length < len(caption_hint):
373
+ # Try to end at a word boundary
374
+ last_space = snippet.rfind(' ')
375
+ if last_space > length * 0.6: # Don't trim too much
376
+ snippet = snippet[:last_space]
377
+
378
+ if snippet in page_text:
379
+ caption_instances = page.search_for(snippet, quads=False)
380
+ if caption_instances:
381
+ caption_rect = caption_instances[0]
382
+ found = True
383
+ log.info("Found figure using caption snippet (%d chars) on page %d", len(snippet), page_num + 1)
384
+ break
360
385
 
361
386
  # If not found with hint, look for actual figure captions using regex patterns
362
387
  if not found:
363
388
  caption_patterns = [
364
- rf"^Figure\s+{re.escape(figure_num)}\.", # "Figure 3." at start of line
365
- rf"^Figure\s+{re.escape(figure_num)}:", # "Figure 3:" at start of line
389
+ # More flexible patterns to match various formats
390
+ rf"^Figure\s+{re.escape(figure_num)}[\s\.\:]", # "Figure 3." or "Figure 3:" or "Figure 3 " at start
391
+ rf"^Fig\.?\s*{re.escape(figure_num)}[\s\.\:]", # "Fig. 3." or "Fig 3:" at start
392
+ rf"Figure\s+{re.escape(figure_num)}[\s\.\:]", # "Figure 3." anywhere
393
+ rf"Fig\.?\s*{re.escape(figure_num)}[\s\.\:]", # "Fig. 3." anywhere
366
394
  rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
367
- rf"Figure\s+{re.escape(figure_num)}\s*\.", # "Figure 3." anywhere
368
- rf"Figure\s+{re.escape(figure_num)}\s*:", # "Figure 3:" anywhere
395
+ rf"^Fig\.?\s*{re.escape(figure_num)}\s+[A-Z]", # "Fig. 3 Substrate"
396
+ # Special patterns for edge cases
397
+ rf"Fig\.\s*{re.escape(figure_num)}\s*\|", # "Fig. 3 |"
398
+ rf"^\s*{re.escape(figure_num)}\.", # Just "3." at line start (some formats)
369
399
  ]
370
400
 
371
401
  for pattern in caption_patterns:
@@ -381,6 +411,29 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: O
381
411
  break
382
412
 
383
413
  if not found:
414
+ # Try a fuzzy search for lines that look like figure captions
415
+ lines = page_text.split('\n')
416
+ for i, line in enumerate(lines):
417
+ line_stripped = line.strip()
418
+ line_lower = line_stripped.lower()
419
+
420
+ # Check if this looks like a figure caption (starts with fig/figure)
421
+ # and NOT an inline reference (which would have text before it)
422
+ if (line_lower.startswith(('fig.', 'fig ', 'figure')) and
423
+ figure_num.lower() in line_lower and
424
+ len(line_stripped) < 200 and
425
+ not line_lower.endswith(')')): # Exclude inline refs like "(Fig. 2)"
426
+
427
+ # Found a potential caption line
428
+ caption_instances = page.search_for(line_stripped[:50], quads=False)
429
+ if caption_instances:
430
+ caption_rect = caption_instances[0]
431
+ found = True
432
+ log.info("Found figure via fuzzy search: '%s' on page %d", line_stripped[:50], page_num + 1)
433
+ break
434
+
435
+ if not found:
436
+ # Skip this page if we didn't find the actual figure caption
384
437
  continue
385
438
 
386
439
  log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
@@ -562,15 +615,29 @@ def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
562
615
 
563
616
  return caption_index
564
617
 
565
- def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
618
+ def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000, document_hint: Optional[str] = None) -> str:
566
619
  """Extract text around a specific reference using caption index."""
567
620
  import re
568
621
 
569
- # Build caption index if not already built
570
- if not hasattr(_extract_text_around_reference, '_caption_index'):
571
- _extract_text_around_reference._caption_index = _build_caption_index(pdf_paths)
622
+ # Filter PDFs based on document hint BEFORE building caption index
623
+ search_paths = pdf_paths
624
+ if document_hint and len(pdf_paths) > 1:
625
+ if document_hint.lower() == "manuscript":
626
+ # ONLY search in manuscript (first PDF)
627
+ search_paths = [pdf_paths[0]]
628
+ log.info("Text extraction: Searching ONLY in manuscript document for '%s'", ref)
629
+ elif document_hint.lower() == "supplementary":
630
+ # ONLY search in SI (second PDF if available)
631
+ search_paths = [pdf_paths[1]] if len(pdf_paths) > 1 else pdf_paths
632
+ log.info("Text extraction: Searching ONLY in supplementary document for '%s'", ref)
633
+
634
+ # Build caption index from filtered paths
635
+ # Use a cache key that includes the document hint
636
+ cache_key = f"_caption_index_{document_hint or 'all'}"
637
+ if not hasattr(_extract_text_around_reference, cache_key):
638
+ setattr(_extract_text_around_reference, cache_key, _build_caption_index(search_paths))
572
639
 
573
- caption_index = _extract_text_around_reference._caption_index
640
+ caption_index = getattr(_extract_text_around_reference, cache_key)
574
641
  ref_lower = ref.lower().strip()
575
642
 
576
643
  # Try multiple matching strategies
@@ -607,6 +674,8 @@ def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_char
607
674
  if info not in matches:
608
675
  matches.append(info)
609
676
 
677
+ # No need to filter by document hint here since we already filtered the PDFs
678
+
610
679
  # Extract text from matches
611
680
  extracted_sections = []
612
681
  for match in matches:
@@ -1206,16 +1275,34 @@ def identify_iupac_sections(
1206
1275
  debug_dir: str | Path | None = None,
1207
1276
  ) -> List[dict]:
1208
1277
  """Identify sections containing IUPAC names from SI table of contents."""
1209
- # Extract only SI TOC pages (first 5 pages of SI)
1278
+ # Extract SI TOC pages and scan for compound synthesis sections
1210
1279
  si_toc_text = ""
1211
1280
  if pdf_paths and len(pdf_paths) > 1:
1212
1281
  si_pdf = pdf_paths[1] # Second PDF is SI
1213
1282
  doc = _open_doc(si_pdf)
1214
1283
  try:
1215
- for page_num in range(min(5, doc.page_count)):
1284
+ # First get TOC from first 10 pages
1285
+ for page_num in range(min(10, doc.page_count)):
1216
1286
  page = doc.load_page(page_num)
1217
1287
  page_text = page.get_text()
1218
1288
  si_toc_text += f"\n[SI Page {page_num + 1}]\n{page_text}"
1289
+
1290
+ # Also scan for pages containing "synthesis of" or "characterization" patterns
1291
+ compound_section_pages = []
1292
+ for page_num in range(doc.page_count):
1293
+ page = doc.load_page(page_num)
1294
+ page_text = page.get_text().lower()
1295
+ if any(pattern in page_text for pattern in [
1296
+ "synthesis of 1", "synthesis of 3", "compound 1", "compound 3",
1297
+ "characterization data", "nmr data", "1h nmr", "13c nmr"
1298
+ ]):
1299
+ compound_section_pages.append(page_num + 1)
1300
+
1301
+ if compound_section_pages:
1302
+ log.info("Found potential compound characterization on SI pages: %s", compound_section_pages[:10])
1303
+ # Add a hint about these pages
1304
+ si_toc_text += f"\n\n[Additional compound characterization detected on pages: {compound_section_pages[:10]}]"
1305
+
1219
1306
  finally:
1220
1307
  doc.close()
1221
1308
 
@@ -1685,17 +1772,17 @@ def _extract_text_for_compound_mapping(
1685
1772
  if page_text:
1686
1773
  extraction_text += f"\n\n=== Section: '{section_title}' starting from page {page_range} ===\n{page_text}"
1687
1774
  else:
1688
- # Try title-based extraction as fallback
1689
- section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=10000)
1775
+ # Try title-based extraction as fallback with more content
1776
+ section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=30000)
1690
1777
  if section_text:
1691
1778
  extraction_text += section_text
1692
1779
 
1693
1780
  if not extraction_text:
1694
- log.warning("No text extracted from IUPAC sections, falling back to limited text")
1695
- extraction_text = text_fallback[:30_000]
1781
+ log.warning("No text extracted from IUPAC sections, using full text")
1782
+ extraction_text = text_fallback
1696
1783
  else:
1697
- # Fallback to limited text
1698
- extraction_text = text_fallback[:30_000]
1784
+ # Fallback to full text - no limit
1785
+ extraction_text = text_fallback
1699
1786
 
1700
1787
  return extraction_text
1701
1788
 
@@ -1739,24 +1826,19 @@ def extract_compound_mappings(
1739
1826
  len(missing_compounds), sorted(missing_compounds))
1740
1827
  log.info("Expanding search to additional sections...")
1741
1828
 
1742
- # Define additional sections that might contain compound definitions
1743
- additional_sections = [
1744
- "Engineering strategy",
1745
- "Screening for benzyl acrylate cyclopropanation",
1746
- "Evolution campaign",
1747
- "General procedure",
1748
- "Experimental procedures",
1749
- "Materials and methods",
1750
- "Substrate synthesis"
1751
- ]
1829
+ # For expanded search, use full manuscript and SI text
1830
+ log.info("Using full manuscript and SI text for comprehensive compound search")
1752
1831
 
1753
- # Extract text from additional sections
1754
- additional_text = _extract_sections_by_title(
1755
- pdf_paths, additional_sections, max_chars_per_section=5000
1756
- )
1832
+ # Extract complete text from both PDFs
1833
+ additional_text = ""
1834
+ for i, pdf_path in enumerate(pdf_paths):
1835
+ doc_type = "manuscript" if i == 0 else "supplementary"
1836
+ doc_text = extract_text(pdf_path)
1837
+ additional_text += f"\n\n=== FULL {doc_type.upper()} TEXT ===\n{doc_text}"
1838
+ log.info("Added %d chars from %s", len(doc_text), doc_type)
1757
1839
 
1758
1840
  if additional_text:
1759
- log.info("Extracted %d chars from additional sections", len(additional_text))
1841
+ log.info("Total expanded text: %d chars", len(additional_text))
1760
1842
 
1761
1843
  # Second extraction attempt with expanded text
1762
1844
  expanded_mappings = _extract_compound_mappings_from_text(
@@ -1922,16 +2004,35 @@ def extract_substrate_scope_entries_for_campaign(
1922
2004
 
1923
2005
  for ref in all_refs:
1924
2006
  if ref and pdf_paths:
1925
- ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
2007
+ # Get document hint for this reference
2008
+ document_hint = location_map.get(ref, {}).get('document', '')
2009
+ ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000, document_hint=document_hint)
1926
2010
  if ref_text:
1927
2011
  extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
1928
2012
 
1929
2013
  # Extract figure images for this reference (crop page around figure)
1930
2014
  try:
1931
- # Get caption and document hints if available
2015
+ # Get caption hint if available
1932
2016
  caption_hint = location_map.get(ref, {}).get('caption', '')
1933
- document_hint = location_map.get(ref, {}).get('document', '')
1934
- fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
2017
+
2018
+ # If we have a good caption, try to extract based on caption pattern
2019
+ if caption_hint and len(caption_hint) > 20:
2020
+ # Extract the figure reference from the caption itself
2021
+ import re
2022
+ caption_fig_match = re.match(r'((?:Figure|Fig\.?)\s*\d+[a-zA-Z]?)', caption_hint, re.IGNORECASE)
2023
+ if caption_fig_match:
2024
+ # Use the figure reference from the caption for more accurate matching
2025
+ fig_ref_from_caption = caption_fig_match.group(1)
2026
+ log.info("Campaign %s - using figure reference from caption: '%s' (original: '%s')",
2027
+ campaign_id, fig_ref_from_caption, ref)
2028
+ fig_base64 = extract_figure_image(pdf_paths, fig_ref_from_caption, caption_hint=caption_hint, document_hint=document_hint)
2029
+ else:
2030
+ # Fallback to original reference
2031
+ fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
2032
+ else:
2033
+ # No caption hint, use original reference
2034
+ fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
2035
+
1935
2036
  if fig_base64:
1936
2037
  figure_images[ref] = fig_base64
1937
2038
  log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: debase
3
- Version: 0.6.2
3
+ Version: 0.7.0
4
4
  Summary: Enzyme lineage analysis and sequence extraction package
5
5
  Home-page: https://github.com/YuemingLong/DEBase
6
6
  Author: DEBase Team
@@ -0,0 +1,18 @@
1
+ debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
+ debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
+ debase/_version.py,sha256=eraKmkwzA-nS1rqm0gvetysX_jPj7RUdCTZBi3t9g6g,49
4
+ debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
+ debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
6
+ debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
7
+ debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
8
+ debase/enzyme_lineage_extractor.py,sha256=7BGkjp7Pl31etWrkjBw2f2KO3fd_1IJLjUnxvUswXuw,194404
9
+ debase/lineage_format.py,sha256=NhpkeBFE94t50uLAuE47ZP3Ti12XvzPHYt-k-_d7OLA,59730
10
+ debase/reaction_info_extractor.py,sha256=tmL-VBMtRKVy36QMTTLSCy9PU4RLpwaJyPRfc8jIalU,198299
11
+ debase/substrate_scope_extractor.py,sha256=AQFJ-6wvKFVZJ5vTwvYMez_v02W2ic95HQF5shidGFY,129282
12
+ debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
13
+ debase-0.7.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
14
+ debase-0.7.0.dist-info/METADATA,sha256=Wsz_cofQy3NaS_icwm6p0unDOlBknaggce2JQc-4k2c,4047
15
+ debase-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
+ debase-0.7.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
17
+ debase-0.7.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
18
+ debase-0.7.0.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
2
- debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
3
- debase/_version.py,sha256=t771GcmZTaJJGrIex6Ea6Q5pcMqVPIihCdRFRA1dMAM,49
4
- debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
5
- debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
6
- debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
7
- debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
8
- debase/enzyme_lineage_extractor.py,sha256=OXO2jUqAqF0pXrw17oIQERnek1uZ5gsFIuKRz4NMS1o,188556
9
- debase/lineage_format.py,sha256=YWAP9OhFN3MQWbqk5gguX0C2cCwGvKJAtMq9pG5TJp8,59515
10
- debase/reaction_info_extractor.py,sha256=kQBxPpzurjHXsHFWE_WM84ArSnc3E8f6xPMJpyTIGnU,188246
11
- debase/substrate_scope_extractor.py,sha256=hRlt8iWOURmgW4SJHB1Svoh3TTa4fa9YIE8qVUZPnY0,122621
12
- debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
13
- debase-0.6.2.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
14
- debase-0.6.2.dist-info/METADATA,sha256=gnPvTWvazrsdGrIKX8tA4Wwt8yKYph87POVKF25rkkg,4047
15
- debase-0.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
- debase-0.6.2.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
17
- debase-0.6.2.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
18
- debase-0.6.2.dist-info/RECORD,,
File without changes