debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/caption_pattern.py +7 -2
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +673 -221
- debase/lineage_format.py +55 -6
- debase/reaction_info_extractor.py +282 -97
- debase/substrate_scope_extractor.py +218 -65
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
- debase-0.7.0.dist-info/RECORD +18 -0
- debase-0.6.1.dist-info/RECORD +0 -18
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0
@@ -296,12 +296,14 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
|
|
296
296
|
|
297
297
|
return "\n".join(chunks)
|
298
298
|
|
299
|
-
def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str]:
|
299
|
+
def extract_figure_image(pdf_paths: List[Path], figure_ref: str, caption_hint: Optional[str] = None, document_hint: Optional[str] = None) -> Optional[str]:
|
300
300
|
"""Extract figure as a page region when embedded images aren't available.
|
301
301
|
|
302
302
|
Args:
|
303
303
|
pdf_paths: List of PDF paths to search
|
304
304
|
figure_ref: Figure reference to search for (e.g., "Figure 3" or "Figure 3(a)")
|
305
|
+
caption_hint: Optional caption text to help identify the exact figure
|
306
|
+
document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
|
305
307
|
|
306
308
|
Returns:
|
307
309
|
Base64-encoded PNG string or None if not found
|
@@ -318,43 +320,120 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
|
|
318
320
|
log.info("Extracting entire figure '%s' from reference '%s'", base_figure_ref, figure_ref)
|
319
321
|
else:
|
320
322
|
base_figure_ref = figure_ref
|
323
|
+
|
324
|
+
# Determine which PDFs to search based on document hint
|
325
|
+
if document_hint and len(pdf_paths) > 1:
|
326
|
+
if document_hint.lower() == "manuscript":
|
327
|
+
# ONLY search in manuscript (first PDF)
|
328
|
+
search_paths = [pdf_paths[0]]
|
329
|
+
log.info("Searching ONLY in manuscript document for '%s' (hint: %s)", figure_ref, document_hint)
|
330
|
+
elif document_hint.lower() == "supplementary":
|
331
|
+
# ONLY search in SI (second PDF if available)
|
332
|
+
search_paths = [pdf_paths[1]] if len(pdf_paths) > 1 else pdf_paths
|
333
|
+
log.info("Searching ONLY in supplementary document for '%s' (hint: %s)", figure_ref, document_hint)
|
334
|
+
else:
|
335
|
+
# No specific hint, search all PDFs
|
336
|
+
search_paths = list(pdf_paths)
|
337
|
+
else:
|
338
|
+
# No hint or single PDF, search all available
|
339
|
+
search_paths = list(pdf_paths)
|
321
340
|
|
322
|
-
|
341
|
+
# Extract figure number BEFORE the loop (e.g., "Figure 3" -> "3", "Figure S3" -> "S3", "Fig. 3" -> "3")
|
342
|
+
figure_num = base_figure_ref.replace('Figure ', '').replace('figure ', '').replace('Fig. ', '').replace('fig. ', '').replace('Fig ', '').replace('fig ', '')
|
343
|
+
|
344
|
+
for pdf_idx, pdf_path in enumerate(search_paths):
|
323
345
|
doc = _open_doc(pdf_path)
|
324
346
|
try:
|
347
|
+
log.debug("Searching %s (document %d/%d) with %d pages for figure '%s'",
|
348
|
+
pdf_path.name, pdf_idx + 1, len(search_paths), doc.page_count, figure_num)
|
349
|
+
|
325
350
|
for page_num in range(doc.page_count):
|
326
351
|
page = doc.load_page(page_num)
|
327
352
|
page_text = page.get_text()
|
328
353
|
|
354
|
+
# Debug: Check if page contains any mention of the figure
|
355
|
+
if figure_num.lower() in page_text.lower():
|
356
|
+
log.debug("Page %d contains mention of figure number '%s'", page_num + 1, figure_num)
|
357
|
+
|
329
358
|
# Check if this page contains the figure caption
|
330
359
|
found = False
|
331
360
|
caption_rect = None
|
332
361
|
|
333
|
-
#
|
334
|
-
|
362
|
+
# First try to find using caption hint if provided
|
363
|
+
if caption_hint and len(caption_hint) > 10:
|
364
|
+
# Try to find a unique portion of the caption
|
365
|
+
# Start with longer snippets and work down to shorter ones
|
366
|
+
caption_lengths = [100, 50, 30, 20]
|
367
|
+
|
368
|
+
for length in caption_lengths:
|
369
|
+
if len(caption_hint) >= length:
|
370
|
+
snippet = caption_hint[:length]
|
371
|
+
# Clean up the snippet to avoid partial words
|
372
|
+
if length < len(caption_hint):
|
373
|
+
# Try to end at a word boundary
|
374
|
+
last_space = snippet.rfind(' ')
|
375
|
+
if last_space > length * 0.6: # Don't trim too much
|
376
|
+
snippet = snippet[:last_space]
|
377
|
+
|
378
|
+
if snippet in page_text:
|
379
|
+
caption_instances = page.search_for(snippet, quads=False)
|
380
|
+
if caption_instances:
|
381
|
+
caption_rect = caption_instances[0]
|
382
|
+
found = True
|
383
|
+
log.info("Found figure using caption snippet (%d chars) on page %d", len(snippet), page_num + 1)
|
384
|
+
break
|
335
385
|
|
336
|
-
#
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
386
|
+
# If not found with hint, look for actual figure captions using regex patterns
|
387
|
+
if not found:
|
388
|
+
caption_patterns = [
|
389
|
+
# More flexible patterns to match various formats
|
390
|
+
rf"^Figure\s+{re.escape(figure_num)}[\s\.\:]", # "Figure 3." or "Figure 3:" or "Figure 3 " at start
|
391
|
+
rf"^Fig\.?\s*{re.escape(figure_num)}[\s\.\:]", # "Fig. 3." or "Fig 3:" at start
|
392
|
+
rf"Figure\s+{re.escape(figure_num)}[\s\.\:]", # "Figure 3." anywhere
|
393
|
+
rf"Fig\.?\s*{re.escape(figure_num)}[\s\.\:]", # "Fig. 3." anywhere
|
394
|
+
rf"^Figure\s+{re.escape(figure_num)}\s+[A-Z]", # "Figure 3 Substrate scope"
|
395
|
+
rf"^Fig\.?\s*{re.escape(figure_num)}\s+[A-Z]", # "Fig. 3 Substrate"
|
396
|
+
# Special patterns for edge cases
|
397
|
+
rf"Fig\.\s*{re.escape(figure_num)}\s*\|", # "Fig. 3 |"
|
398
|
+
rf"^\s*{re.escape(figure_num)}\.", # Just "3." at line start (some formats)
|
399
|
+
]
|
400
|
+
|
401
|
+
for pattern in caption_patterns:
|
402
|
+
matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
|
403
|
+
if matches:
|
404
|
+
# Found actual figure caption, get its position
|
405
|
+
caption_text = matches.group(0)
|
406
|
+
caption_instances = page.search_for(caption_text, quads=False)
|
407
|
+
if caption_instances:
|
408
|
+
caption_rect = caption_instances[0]
|
409
|
+
found = True
|
410
|
+
log.info("Found actual figure caption '%s' on page %d", caption_text, page_num + 1)
|
411
|
+
break
|
344
412
|
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
413
|
+
if not found:
|
414
|
+
# Try a fuzzy search for lines that look like figure captions
|
415
|
+
lines = page_text.split('\n')
|
416
|
+
for i, line in enumerate(lines):
|
417
|
+
line_stripped = line.strip()
|
418
|
+
line_lower = line_stripped.lower()
|
419
|
+
|
420
|
+
# Check if this looks like a figure caption (starts with fig/figure)
|
421
|
+
# and NOT an inline reference (which would have text before it)
|
422
|
+
if (line_lower.startswith(('fig.', 'fig ', 'figure')) and
|
423
|
+
figure_num.lower() in line_lower and
|
424
|
+
len(line_stripped) < 200 and
|
425
|
+
not line_lower.endswith(')')): # Exclude inline refs like "(Fig. 2)"
|
426
|
+
|
427
|
+
# Found a potential caption line
|
428
|
+
caption_instances = page.search_for(line_stripped[:50], quads=False)
|
429
|
+
if caption_instances:
|
430
|
+
caption_rect = caption_instances[0]
|
431
|
+
found = True
|
432
|
+
log.info("Found figure via fuzzy search: '%s' on page %d", line_stripped[:50], page_num + 1)
|
433
|
+
break
|
356
434
|
|
357
435
|
if not found:
|
436
|
+
# Skip this page if we didn't find the actual figure caption
|
358
437
|
continue
|
359
438
|
|
360
439
|
log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
|
@@ -536,15 +615,29 @@ def _build_caption_index(pdf_paths: List[Path]) -> Dict[str, Dict[str, Any]]:
|
|
536
615
|
|
537
616
|
return caption_index
|
538
617
|
|
539
|
-
def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
|
618
|
+
def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000, document_hint: Optional[str] = None) -> str:
|
540
619
|
"""Extract text around a specific reference using caption index."""
|
541
620
|
import re
|
542
621
|
|
543
|
-
#
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
622
|
+
# Filter PDFs based on document hint BEFORE building caption index
|
623
|
+
search_paths = pdf_paths
|
624
|
+
if document_hint and len(pdf_paths) > 1:
|
625
|
+
if document_hint.lower() == "manuscript":
|
626
|
+
# ONLY search in manuscript (first PDF)
|
627
|
+
search_paths = [pdf_paths[0]]
|
628
|
+
log.info("Text extraction: Searching ONLY in manuscript document for '%s'", ref)
|
629
|
+
elif document_hint.lower() == "supplementary":
|
630
|
+
# ONLY search in SI (second PDF if available)
|
631
|
+
search_paths = [pdf_paths[1]] if len(pdf_paths) > 1 else pdf_paths
|
632
|
+
log.info("Text extraction: Searching ONLY in supplementary document for '%s'", ref)
|
633
|
+
|
634
|
+
# Build caption index from filtered paths
|
635
|
+
# Use a cache key that includes the document hint
|
636
|
+
cache_key = f"_caption_index_{document_hint or 'all'}"
|
637
|
+
if not hasattr(_extract_text_around_reference, cache_key):
|
638
|
+
setattr(_extract_text_around_reference, cache_key, _build_caption_index(search_paths))
|
639
|
+
|
640
|
+
caption_index = getattr(_extract_text_around_reference, cache_key)
|
548
641
|
ref_lower = ref.lower().strip()
|
549
642
|
|
550
643
|
# Try multiple matching strategies
|
@@ -581,6 +674,8 @@ def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_char
|
|
581
674
|
if info not in matches:
|
582
675
|
matches.append(info)
|
583
676
|
|
677
|
+
# No need to filter by document hint here since we already filtered the PDFs
|
678
|
+
|
584
679
|
# Extract text from matches
|
585
680
|
extracted_sections = []
|
586
681
|
for match in matches:
|
@@ -1135,17 +1230,24 @@ Your task is to:
|
|
1135
1230
|
4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
|
1136
1231
|
5. Determine which enzyme variants from this campaign were tested in substrate scope studies
|
1137
1232
|
|
1233
|
+
IMPORTANT FIGURE REFERENCE RULES:
|
1234
|
+
- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
|
1235
|
+
- Include the figure caption if available to help with identification
|
1236
|
+
- The extraction system will handle retrieving the entire figure including all sub-panels
|
1237
|
+
|
1138
1238
|
Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
|
1139
1239
|
[
|
1140
1240
|
{{
|
1141
|
-
"location": "
|
1241
|
+
"location": "Main figure/table reference (e.g., 'Figure 2', 'Table S1', NOT 'Figure 2a')",
|
1142
1242
|
"type": "table|figure|text",
|
1143
1243
|
"confidence": 0.0-1.0,
|
1144
1244
|
"enzyme_variants": ["list of enzyme IDs found"],
|
1145
1245
|
"substrates_tested": ["list of substrates if identifiable"],
|
1146
1246
|
"campaign_match": true/false,
|
1147
1247
|
"is_substrate_scope": true/false,
|
1148
|
-
"model_reaction_excluded": "reason why this is not a model reaction"
|
1248
|
+
"model_reaction_excluded": "reason why this is not a model reaction",
|
1249
|
+
"caption": "Include the figure/table caption if available",
|
1250
|
+
"document": "manuscript|supplementary - specify whether this location is in the main manuscript or supplementary information"
|
1149
1251
|
}}
|
1150
1252
|
]
|
1151
1253
|
|
@@ -1173,16 +1275,34 @@ def identify_iupac_sections(
|
|
1173
1275
|
debug_dir: str | Path | None = None,
|
1174
1276
|
) -> List[dict]:
|
1175
1277
|
"""Identify sections containing IUPAC names from SI table of contents."""
|
1176
|
-
# Extract
|
1278
|
+
# Extract SI TOC pages and scan for compound synthesis sections
|
1177
1279
|
si_toc_text = ""
|
1178
1280
|
if pdf_paths and len(pdf_paths) > 1:
|
1179
1281
|
si_pdf = pdf_paths[1] # Second PDF is SI
|
1180
1282
|
doc = _open_doc(si_pdf)
|
1181
1283
|
try:
|
1182
|
-
|
1284
|
+
# First get TOC from first 10 pages
|
1285
|
+
for page_num in range(min(10, doc.page_count)):
|
1183
1286
|
page = doc.load_page(page_num)
|
1184
1287
|
page_text = page.get_text()
|
1185
1288
|
si_toc_text += f"\n[SI Page {page_num + 1}]\n{page_text}"
|
1289
|
+
|
1290
|
+
# Also scan for pages containing "synthesis of" or "characterization" patterns
|
1291
|
+
compound_section_pages = []
|
1292
|
+
for page_num in range(doc.page_count):
|
1293
|
+
page = doc.load_page(page_num)
|
1294
|
+
page_text = page.get_text().lower()
|
1295
|
+
if any(pattern in page_text for pattern in [
|
1296
|
+
"synthesis of 1", "synthesis of 3", "compound 1", "compound 3",
|
1297
|
+
"characterization data", "nmr data", "1h nmr", "13c nmr"
|
1298
|
+
]):
|
1299
|
+
compound_section_pages.append(page_num + 1)
|
1300
|
+
|
1301
|
+
if compound_section_pages:
|
1302
|
+
log.info("Found potential compound characterization on SI pages: %s", compound_section_pages[:10])
|
1303
|
+
# Add a hint about these pages
|
1304
|
+
si_toc_text += f"\n\n[Additional compound characterization detected on pages: {compound_section_pages[:10]}]"
|
1305
|
+
|
1186
1306
|
finally:
|
1187
1307
|
doc.close()
|
1188
1308
|
|
@@ -1652,17 +1772,17 @@ def _extract_text_for_compound_mapping(
|
|
1652
1772
|
if page_text:
|
1653
1773
|
extraction_text += f"\n\n=== Section: '{section_title}' starting from page {page_range} ===\n{page_text}"
|
1654
1774
|
else:
|
1655
|
-
# Try title-based extraction as fallback
|
1656
|
-
section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=
|
1775
|
+
# Try title-based extraction as fallback with more content
|
1776
|
+
section_text = _extract_sections_by_title(pdf_paths, [section_title], max_chars_per_section=30000)
|
1657
1777
|
if section_text:
|
1658
1778
|
extraction_text += section_text
|
1659
1779
|
|
1660
1780
|
if not extraction_text:
|
1661
|
-
log.warning("No text extracted from IUPAC sections,
|
1662
|
-
extraction_text = text_fallback
|
1781
|
+
log.warning("No text extracted from IUPAC sections, using full text")
|
1782
|
+
extraction_text = text_fallback
|
1663
1783
|
else:
|
1664
|
-
# Fallback to
|
1665
|
-
extraction_text = text_fallback
|
1784
|
+
# Fallback to full text - no limit
|
1785
|
+
extraction_text = text_fallback
|
1666
1786
|
|
1667
1787
|
return extraction_text
|
1668
1788
|
|
@@ -1706,24 +1826,19 @@ def extract_compound_mappings(
|
|
1706
1826
|
len(missing_compounds), sorted(missing_compounds))
|
1707
1827
|
log.info("Expanding search to additional sections...")
|
1708
1828
|
|
1709
|
-
#
|
1710
|
-
|
1711
|
-
"Engineering strategy",
|
1712
|
-
"Screening for benzyl acrylate cyclopropanation",
|
1713
|
-
"Evolution campaign",
|
1714
|
-
"General procedure",
|
1715
|
-
"Experimental procedures",
|
1716
|
-
"Materials and methods",
|
1717
|
-
"Substrate synthesis"
|
1718
|
-
]
|
1829
|
+
# For expanded search, use full manuscript and SI text
|
1830
|
+
log.info("Using full manuscript and SI text for comprehensive compound search")
|
1719
1831
|
|
1720
|
-
# Extract text from
|
1721
|
-
additional_text =
|
1722
|
-
|
1723
|
-
|
1832
|
+
# Extract complete text from both PDFs
|
1833
|
+
additional_text = ""
|
1834
|
+
for i, pdf_path in enumerate(pdf_paths):
|
1835
|
+
doc_type = "manuscript" if i == 0 else "supplementary"
|
1836
|
+
doc_text = extract_text(pdf_path)
|
1837
|
+
additional_text += f"\n\n=== FULL {doc_type.upper()} TEXT ===\n{doc_text}"
|
1838
|
+
log.info("Added %d chars from %s", len(doc_text), doc_type)
|
1724
1839
|
|
1725
1840
|
if additional_text:
|
1726
|
-
log.info("
|
1841
|
+
log.info("Total expanded text: %d chars", len(additional_text))
|
1727
1842
|
|
1728
1843
|
# Second extraction attempt with expanded text
|
1729
1844
|
expanded_mappings = _extract_compound_mappings_from_text(
|
@@ -1865,31 +1980,59 @@ def extract_substrate_scope_entries_for_campaign(
|
|
1865
1980
|
all_refs = []
|
1866
1981
|
|
1867
1982
|
if locations:
|
1868
|
-
#
|
1869
|
-
|
1870
|
-
|
1871
|
-
loc_str = loc.get('location', '')
|
1872
|
-
location_strs.append(loc_str)
|
1873
|
-
all_refs.append(loc_str)
|
1983
|
+
# Sort locations by confidence and use only the PRIMARY (most confident) location
|
1984
|
+
sorted_locations = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
|
1985
|
+
primary_location = sorted_locations[0] if sorted_locations else None
|
1874
1986
|
|
1875
|
-
|
1987
|
+
if primary_location:
|
1988
|
+
primary_ref = primary_location.get('location', '')
|
1989
|
+
all_refs = [primary_ref] # Only extract from primary location
|
1990
|
+
|
1991
|
+
extraction_hints = f"\nPRIMARY substrate scope location for campaign {campaign_id}: {primary_ref}"
|
1992
|
+
extraction_hints += f"\nLocation confidence: {primary_location.get('confidence', 0)}%"
|
1993
|
+
extraction_hints += f"\nLocation type: {primary_location.get('type', 'unknown')}"
|
1876
1994
|
|
1877
1995
|
# Focus on campaign-specific enzyme variants
|
1878
1996
|
extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
|
1879
1997
|
|
1880
|
-
# Extract text from
|
1998
|
+
# Extract text from ONLY the primary location
|
1881
1999
|
extraction_texts = []
|
1882
2000
|
figure_images = {}
|
1883
2001
|
|
2002
|
+
# Create a mapping of location strings to their full location data
|
2003
|
+
location_map = {loc.get('location', ''): loc for loc in locations}
|
2004
|
+
|
1884
2005
|
for ref in all_refs:
|
1885
2006
|
if ref and pdf_paths:
|
1886
|
-
|
2007
|
+
# Get document hint for this reference
|
2008
|
+
document_hint = location_map.get(ref, {}).get('document', '')
|
2009
|
+
ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000, document_hint=document_hint)
|
1887
2010
|
if ref_text:
|
1888
2011
|
extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
|
1889
2012
|
|
1890
2013
|
# Extract figure images for this reference (crop page around figure)
|
1891
2014
|
try:
|
1892
|
-
|
2015
|
+
# Get caption hint if available
|
2016
|
+
caption_hint = location_map.get(ref, {}).get('caption', '')
|
2017
|
+
|
2018
|
+
# If we have a good caption, try to extract based on caption pattern
|
2019
|
+
if caption_hint and len(caption_hint) > 20:
|
2020
|
+
# Extract the figure reference from the caption itself
|
2021
|
+
import re
|
2022
|
+
caption_fig_match = re.match(r'((?:Figure|Fig\.?)\s*\d+[a-zA-Z]?)', caption_hint, re.IGNORECASE)
|
2023
|
+
if caption_fig_match:
|
2024
|
+
# Use the figure reference from the caption for more accurate matching
|
2025
|
+
fig_ref_from_caption = caption_fig_match.group(1)
|
2026
|
+
log.info("Campaign %s - using figure reference from caption: '%s' (original: '%s')",
|
2027
|
+
campaign_id, fig_ref_from_caption, ref)
|
2028
|
+
fig_base64 = extract_figure_image(pdf_paths, fig_ref_from_caption, caption_hint=caption_hint, document_hint=document_hint)
|
2029
|
+
else:
|
2030
|
+
# Fallback to original reference
|
2031
|
+
fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
|
2032
|
+
else:
|
2033
|
+
# No caption hint, use original reference
|
2034
|
+
fig_base64 = extract_figure_image(pdf_paths, ref, caption_hint=caption_hint, document_hint=document_hint)
|
2035
|
+
|
1893
2036
|
if fig_base64:
|
1894
2037
|
figure_images[ref] = fig_base64
|
1895
2038
|
log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
|
@@ -1942,6 +2085,14 @@ IMPORTANT INSTRUCTIONS:
|
|
1942
2085
|
4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
|
1943
2086
|
5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
|
1944
2087
|
|
2088
|
+
CRITICAL DATA ACCURACY REQUIREMENTS:
|
2089
|
+
- BE EXTREMELY CAREFUL about which substrate ID maps to which yield, TTN, and selectivity values
|
2090
|
+
- Each substrate entry should have its OWN yield, ee, and TTN values - do not mix up values between substrates
|
2091
|
+
- If looking at a table or figure, carefully match each substrate with its corresponding row/bar/data point
|
2092
|
+
- Double-check that substrate 1a's data is not confused with substrate 1b's data, etc.
|
2093
|
+
- If values are unclear or ambiguous for a specific substrate, return null rather than guessing
|
2094
|
+
- Pay special attention when extracting from figures - ensure you're reading the correct bar/point for each substrate
|
2095
|
+
|
1945
2096
|
{extraction_hints}
|
1946
2097
|
|
1947
2098
|
Return your analysis as JSON in this format:
|
@@ -2287,13 +2438,15 @@ def get_substrate_scope(
|
|
2287
2438
|
if should_extract:
|
2288
2439
|
figure_ref = location_str
|
2289
2440
|
confidence = loc.get('confidence', 0)
|
2441
|
+
caption_hint = loc.get('caption', '')
|
2290
2442
|
log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
|
2291
2443
|
|
2292
2444
|
# Use appropriate extraction function based on type
|
2293
2445
|
if 'scheme' in location_str.lower() or location_type == 'scheme':
|
2294
2446
|
figure_image = extract_scheme_image(pdf_paths, figure_ref)
|
2295
2447
|
else:
|
2296
|
-
|
2448
|
+
document_hint = loc.get('document', '')
|
2449
|
+
figure_image = extract_figure_image(pdf_paths, figure_ref, caption_hint=caption_hint, document_hint=document_hint)
|
2297
2450
|
|
2298
2451
|
if figure_image:
|
2299
2452
|
log.info("Successfully extracted %s image for %s (%d bytes)",
|
@@ -0,0 +1,18 @@
|
|
1
|
+
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
+
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
+
debase/_version.py,sha256=eraKmkwzA-nS1rqm0gvetysX_jPj7RUdCTZBi3t9g6g,49
|
4
|
+
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
+
debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
|
6
|
+
debase/caption_pattern.py,sha256=F1cxQxyQDmzw3ogi3zXJp7iEvOdFdIN2kDzLrUg_amE,2043
|
7
|
+
debase/cleanup_sequence.py,sha256=XbA0pZFFIJRJf4XCEN-j4s7dnkdXN9mYdbcuz-ZSjg4,75520
|
8
|
+
debase/enzyme_lineage_extractor.py,sha256=7BGkjp7Pl31etWrkjBw2f2KO3fd_1IJLjUnxvUswXuw,194404
|
9
|
+
debase/lineage_format.py,sha256=NhpkeBFE94t50uLAuE47ZP3Ti12XvzPHYt-k-_d7OLA,59730
|
10
|
+
debase/reaction_info_extractor.py,sha256=tmL-VBMtRKVy36QMTTLSCy9PU4RLpwaJyPRfc8jIalU,198299
|
11
|
+
debase/substrate_scope_extractor.py,sha256=AQFJ-6wvKFVZJ5vTwvYMez_v02W2ic95HQF5shidGFY,129282
|
12
|
+
debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
|
13
|
+
debase-0.7.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
14
|
+
debase-0.7.0.dist-info/METADATA,sha256=Wsz_cofQy3NaS_icwm6p0unDOlBknaggce2JQc-4k2c,4047
|
15
|
+
debase-0.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
16
|
+
debase-0.7.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
17
|
+
debase-0.7.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
18
|
+
debase-0.7.0.dist-info/RECORD,,
|
debase-0.6.1.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
|
2
|
-
debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
|
3
|
-
debase/_version.py,sha256=Cbfy3WdPDTjtgnzdUc6e5F779YhAJJGX5LN-2SJMvCI,49
|
4
|
-
debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
|
5
|
-
debase/campaign_utils.py,sha256=6Mo6ps8gIOxBrfNNshvny-9GTKBt8UARc8t59o1obAk,4756
|
6
|
-
debase/caption_pattern.py,sha256=nMLj2tK4MhD4jQ9d1IUDJ6xnY0MOx-UioIT-k_b3OWA,1770
|
7
|
-
debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
|
8
|
-
debase/enzyme_lineage_extractor.py,sha256=RKsjvcs6O2wnw2dpts3AynDRVKqMAeBVOMql2mayCGY,170120
|
9
|
-
debase/lineage_format.py,sha256=BE8uW1XUCmxlcYKiD7QveF4r99xObfGf1vP1rZzJTV8,56525
|
10
|
-
debase/reaction_info_extractor.py,sha256=qUrVi9chQcQG1zWwQlTbYF8dczvQqctdjwhvkAkBnZw,187032
|
11
|
-
debase/substrate_scope_extractor.py,sha256=dikdEELi4RGlP2lGHcR93WdUbtIchOdHVB5G45BMCNk,118709
|
12
|
-
debase/wrapper.py,sha256=Vcad6c_f3jZHpefZMP9XJPI3fo7w-pCgcSqEEQyDgS0,24559
|
13
|
-
debase-0.6.1.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
|
14
|
-
debase-0.6.1.dist-info/METADATA,sha256=fXvGhqDP5Bl33gTEvUvvjqNy-cXYs9jYFl1NyM5ALsc,4047
|
15
|
-
debase-0.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
16
|
-
debase-0.6.1.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
|
17
|
-
debase-0.6.1.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
|
18
|
-
debase-0.6.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|