debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -346,6 +346,103 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
346
346
  log.warning("Could not find figure caption for '%s'", figure_ref)
347
347
  return None
348
348
 
349
def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str]:
    """Extract a scheme as a rendered page region, similar to figures.

    Every page of every PDF is searched for ``scheme_ref``. A hit is accepted
    as the scheme title (rather than an in-text mention) only when the text
    surrounding it contains one of a few caption keywords. The region from
    just below the title down to the nearest substantial text block (or the
    bottom of the page when there is none) is rendered at 2x zoom.

    Args:
        pdf_paths: List of PDF paths to search
        scheme_ref: Scheme reference to search for (e.g., "Scheme 2" or "Scheme S2")

    Returns:
        Base64-encoded PNG string or None if not found
    """
    if not pdf_paths:
        return None

    # Most specific caption spellings first; the bare reference is the last resort.
    variations = [
        f"{scheme_ref}.",   # "Scheme 2."
        f"{scheme_ref}:",   # "Scheme 2:"
        f"{scheme_ref} ",   # "Scheme 2 "
        scheme_ref,
    ]
    title_keywords = ('substrate scope', 'reaction', 'synthesis', 'procedure', 'explored')
    zoom = 2  # 2x zoom for better quality

    for pdf_path in pdf_paths:
        doc = _open_doc(pdf_path)
        try:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)

                # Locate a hit that looks like a scheme title (not a reference in text)
                scheme_rect = None
                for variation in variations:
                    for rect in page.search_for(variation, quads=False):
                        x0, y0, x1, y1 = rect
                        # Text in a window around the hit decides whether it is a caption
                        text_around = page.get_textbox(
                            fitz.Rect(x0 - 50, y0 - 5, x1 + 400, y1 + 20)
                        ).lower()
                        if any(keyword in text_around for keyword in title_keywords):
                            scheme_rect = rect
                            break
                    if scheme_rect is not None:
                        break

                if scheme_rect is None:
                    continue

                log.info("Found scheme on page %d at y=%.0f", page_num + 1, scheme_rect.y0)

                page_rect = page.rect
                top_margin = max(0, scheme_rect.y1 + 5)  # Start just below the scheme title

                # Heuristic end of the scheme: the *nearest* substantial text block
                # below the title. Blocks are not guaranteed to arrive sorted by
                # vertical position, so take the minimum instead of the first hit.
                block_tops = [
                    block[1]
                    for block in page.get_text("blocks")
                    if block[1] > scheme_rect.y1 + 50 and len(block[4]) > 100
                ]
                bottom_y = min(block_tops) - 10 if block_tops else page_rect.height

                # A title at the very bottom of the page leaves nothing to render.
                if bottom_y <= top_margin:
                    log.warning(
                        "Scheme title for '%s' on page %d leaves no region to render; skipping",
                        scheme_ref, page_num + 1,
                    )
                    continue

                # Render the clipped region and encode it as PNG
                clip_rect = fitz.Rect(0, top_margin, page_rect.width, bottom_y)
                pix = page.get_pixmap(clip=clip_rect, matrix=fitz.Matrix(zoom, zoom))
                img_bytes = pix.tobytes("png")
                log.info("Extracted scheme region: %.0fx%.0f pixels from page %d",
                         clip_rect.width * zoom, clip_rect.height * zoom, page_num + 1)

                return b64encode(img_bytes).decode()

        finally:
            doc.close()

    log.warning("Could not find scheme '%s'", scheme_ref)
    return None
445
+
349
446
 
350
447
  def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
351
448
  """Extract text around a specific reference (e.g., 'Figure 3')."""
@@ -765,21 +862,28 @@ Return JSON:
765
862
  """.strip()
766
863
 
767
864
  _COMPOUND_MAPPING_PROMPT = """
768
- Extract compound identifiers and their chemical names EXACTLY as they appear in the text.
865
+ Extract compound identifiers and their chemical names from the provided text and any scheme images.
866
+
867
+ TASK:
868
+ 1. First, extract all compound IDs and names that are explicitly written in the text
869
+ 2. Then, analyze any provided scheme images to identify compound labeling patterns
870
+ 3. Look for relationships between compounds (e.g., when multiple variants share a base structure)
871
+ 4. Note any systematic naming conventions used in the schemes
769
872
 
770
- STRICT RULES:
771
- 1. ONLY extract what is explicitly written in the text
772
- 2. Look for patterns where compound IDs are paired with chemical names
773
- 3. DO NOT infer, generate, or guess any chemical names
774
- 4. If a compound ID appears without a chemical name, return null for iupac_name
775
- 5. If a product was "not detected" or "not formed", return null for iupac_name
873
+ ANALYSIS GUIDELINES:
874
+ - Some papers define a base compound and use letter suffixes for variants
875
+ - Schemes often show relationships that aren't explicitly stated in text
876
+ - Pay attention to how compounds are grouped or connected in schemes
877
+ - Identify any patterns in how compounds are numbered/lettered
776
878
 
777
879
  For each compound:
778
- - identifier: The exact compound ID as written (e.g., "1", "2a", "SM-1")
779
- - iupac_name: The chemical name if explicitly provided, otherwise null
780
- - common_names: Any alternative names mentioned
880
+ - identifier: The exact compound ID as written
881
+ - iupac_name: The chemical name if found in text
882
+ - common_names: Any alternative names
781
883
  - compound_type: substrate/product/reagent/catalyst/other
782
- - source_location: The exact text excerpt where this information was found
884
+ - source_location: Where found (text excerpt or "Scheme X")
885
+ - related_compounds: List of related compound IDs if a pattern is detected
886
+ - pattern_notes: Description of any labeling pattern observed
783
887
 
784
888
  Return as JSON:
785
889
  {
@@ -789,12 +893,12 @@ Return as JSON:
789
893
  "iupac_name": "string or null",
790
894
  "common_names": ["array of strings"],
791
895
  "compound_type": "string",
792
- "source_location": "string"
896
+ "source_location": "string",
897
+ "related_compounds": ["array of related IDs"],
898
+ "pattern_notes": "string or null"
793
899
  }
794
900
  ]
795
901
  }
796
-
797
- Note: It is better to return null than to hallucinate or infer chemical structures.
798
902
  """.strip()
799
903
 
800
904
  _SUBSTRATE_SCOPE_PROMPT = """
@@ -803,13 +907,17 @@ Extract ALL substrate scope data from the primary sources in one complete extrac
803
907
 
804
908
  For EACH reaction, extract:
805
909
  1. Enzyme variant ID
806
- 2. Substrate identifiers (e.g., "6a", "5")
910
+ 2. Substrate identifiers (e.g., "6a", "5") - ONLY if explicitly shown
807
911
  3. Product identifiers (e.g., "7a", "7b", "7d", "7e") - ALWAYS include even if no yield
808
912
  4. Performance metrics (yield%, ee%, dr, TTN)
809
913
  5. Reaction conditions (temperature, pH, buffer, substrate concentrations - NOT dithionite/reducing agents)
810
914
  6. Data location (which figure/table this comes from)
811
915
 
812
- CRITICAL - NO HALLUCINATION OR MODIFICATION:
916
+ CRITICAL - NO HALLUCINATION OR INFERENCE OF IDENTIFIERS:
917
+ - SUBSTRATE IDS: Only extract substrate identifiers that are EXPLICITLY WRITTEN in the source
918
+ - DO NOT INFER substrate IDs from patterns (e.g., if you see product "4a", DO NOT assume substrate is "3a")
919
+ - If substrate ID is not explicitly shown, use null for substrate_ids
920
+ - Product IDs should be extracted as shown (since they are usually labeled in schemes)
813
921
  - Extract values EXACTLY as written in the primary source - NO CHANGES WHATSOEVER
814
922
  - DO NOT round, estimate, convert, or modify any numbers
815
923
  - If the text shows "53%", report 53.0, not 53 or 53.00
@@ -821,19 +929,20 @@ CRITICAL - NO HALLUCINATION OR MODIFICATION:
821
929
  - If no value is shown, return null, not 0 or empty string
822
930
  - Extract ALL reactions from ALL identified locations
823
931
  - Use compound identifiers EXACTLY as shown (not IUPAC names)
824
- - For every entry, there needs to be identifier for both substrates and products, even if yield is null or activity is 0.
825
932
  - Extract reaction conditions EXACTLY as written - NO PARAPHRASING
826
933
  - IMPORTANT: Substrate concentration refers to the concentration of the actual chemical substrates being transformed in the reaction, NOT reducing agents (e.g., dithionite, NADH) or other additives
827
934
 
828
- IMPORTANT: Each substrate should have a corresponding product identifier. Even when there is no yield, return
829
- the exact identifier as seen in the reaction.
935
+ IMPORTANT:
936
+ - Substrate IDs must be EXPLICITLY visible in the source - DO NOT INFER FROM PATTERNS
937
+ - Product IDs should be extracted as labeled in the scheme/figure
938
+ - If only product ID is shown with yields/ee data, substrate_ids should be null
830
939
 
831
940
  Return as JSON:
832
941
  {{
833
942
  "substrate_scope_data": [
834
943
  {{
835
944
  "enzyme_id": "enzyme variant name",
836
- "substrate_ids": ["list of substrate identifiers"],
945
+ "substrate_ids": null or ["list of EXPLICITLY shown substrate identifiers"],
837
946
  "product_ids": ["list of product identifiers"],
838
947
  "yield_percent": null or number,
839
948
  "ee_percent": null or number,
@@ -959,6 +1068,10 @@ def _extract_compound_mappings_from_text(
959
1068
  source_location=item.get("source_location")
960
1069
  )
961
1070
 
1071
+ # Store pattern information for post-processing
1072
+ mapping._related_compounds = item.get("related_compounds", [])
1073
+ mapping._pattern_notes = item.get("pattern_notes", "")
1074
+
962
1075
  # Create lookup entries for all identifiers and common names
963
1076
  for identifier in mapping.identifiers + mapping.common_names:
964
1077
  if identifier:
@@ -970,6 +1083,180 @@ def _extract_compound_mappings_from_text(
970
1083
  log.error("Failed to extract compound mappings: %s", exc)
971
1084
  return {}
972
1085
 
1086
def _json_span_end(text: str, start: int) -> int:
    """Return the index just past the bracket closing ``text[start]``, or -1.

    ``text[start]`` must be ``{`` or ``[``. Brackets inside double-quoted
    strings are ignored, and a backslash is treated as an escape only while
    inside a string. -1 means the span never closes or closes with the wrong
    bracket type.
    """
    closers = {'[': ']', '{': '}'}
    stack = []
    in_string = False
    escaped = False

    for i in range(start, len(text)):
        char = text[i]

        if in_string:
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            in_string = True
        elif char in '[{':
            stack.append(char)
        elif char in ']}':
            if not stack or closers[stack.pop()] != char:
                return -1
            if not stack:  # Found complete JSON
                return i + 1

    return -1


def _extract_json(text: str) -> str:
    """Extract JSON content from raw LLM response text.

    Markdown code fences are stripped, then the text is scanned for a
    bracket-balanced ``{...}`` or ``[...]`` span. The first span that parses
    as JSON is returned, so braces in a prose preamble do not hide the real
    payload. If no span parses, the first balanced span is returned; if
    there is no balanced span at all, the fence-stripped text is returned
    unchanged so the caller's own ``json.loads`` reports the error.

    Args:
        text: Raw model output, possibly wrapped in prose or code fences.

    Returns:
        The extracted JSON substring, or the stripped input as a fallback.
    """
    # Remove common markdown code block markers
    text = text.strip()
    if text.startswith('```json'):
        text = text[7:]
    elif text.startswith('```'):
        text = text[3:]

    if text.endswith('```'):
        text = text[:-3]

    text = text.strip()

    first_balanced = None
    pos = 0
    # Bound the number of restarts so a long truncated response (many
    # openers, none of them closed) cannot degrade into a quadratic scan.
    for _ in range(50):
        starts = [i for i in (text.find('{', pos), text.find('[', pos)) if i != -1]
        if not starts:
            break
        start = min(starts)

        end = _json_span_end(text, start)
        if end == -1:
            # This opener never closes properly; try the next one.
            pos = start + 1
            continue

        candidate = text[start:end]
        try:
            json.loads(candidate)
        except ValueError:
            if first_balanced is None:
                first_balanced = candidate
            # Skip the whole span so an inner fragment is never returned
            # in place of its (invalid) enclosing structure.
            pos = end
        else:
            return candidate

    if first_balanced is not None:
        return first_balanced

    # If no JSON found, return the original text
    return text
1149
+
1150
+ def _resolve_missing_compounds_with_gemini(
1151
+ model,
1152
+ known_compounds: Dict[str, str],
1153
+ missing_compounds: List[str],
1154
+ figure_images: Dict[str, str] = None,
1155
+ primary_location_text: str = None,
1156
+ debug_dir: str | Path | None = None,
1157
+ ) -> Dict[str, str]:
1158
+ """Use Gemini to resolve missing compound names based on patterns."""
1159
+
1160
+ prompt = """You are an expert chemist analyzing compound naming patterns in a chemistry paper.
1161
+
1162
+ KNOWN COMPOUNDS WITH IUPAC NAMES:
1163
+ """
1164
+
1165
+ # Add known compounds
1166
+ for cid, name in sorted(known_compounds.items()):
1167
+ prompt += f"- Compound {cid}: {name}\n"
1168
+
1169
+ prompt += f"""
1170
+
1171
+ MISSING COMPOUNDS (need IUPAC names):
1172
+ {', '.join(sorted(missing_compounds))}
1173
+
1174
+ TASK:
1175
+ 1. Analyze the numbering/lettering pattern used in this paper
1176
+ 2. Look for relationships between compounds (e.g., 3 → 3a, 3b as enantiomers)
1177
+ 3. Determine the IUPAC names for the missing compounds
1178
+
1179
+ IMPORTANT PATTERNS TO CONSIDER:
1180
+ - If compound "X" has a known structure and "Xa", "Xb" are missing, they might be stereoisomers
1181
+ - Common patterns: 'a' = (S)-enantiomer, 'b' = (R)-enantiomer (but verify from context)
1182
+ - Some papers use 'a/b' for different stereogenic centers or regioisomers
1183
+ - Look at the scheme images AND the text to understand relationships
1184
+
1185
+ For each missing compound, provide the most likely IUPAC name based on:
1186
+ - The pattern analysis from text and schemes
1187
+ - Standard chemical nomenclature rules
1188
+ - The relationship to known compounds
1189
+
1190
+ Return ONLY compounds where you have high confidence in the IUPAC name.
1191
+ If unsure, return null for that compound.
1192
+
1193
+ Return as JSON:
1194
+ {{
1195
+ "resolved_compounds": {{
1196
+ "compound_id": "IUPAC name or null"
1197
+ }}
1198
+ }}
1199
+ """
1200
+
1201
+ # Add primary location text if available
1202
+ if primary_location_text:
1203
+ prompt += f"""
1204
+
1205
+ PRIMARY SUBSTRATE SCOPE TEXT (from scheme/table):
1206
+ {primary_location_text[:10000]} # Limit to prevent token overflow
1207
+ """
1208
+
1209
+ # Add figure images if available
1210
+ content_parts = [prompt]
1211
+ if figure_images:
1212
+ content_parts.append("\n\nANALYZE THE FOLLOWING SCHEME IMAGES TO UNDERSTAND THE COMPOUND RELATIONSHIPS:")
1213
+ import PIL.Image
1214
+ import io
1215
+ import base64
1216
+
1217
+ for fig_ref, fig_base64 in figure_images.items():
1218
+ if "scheme" in fig_ref.lower():
1219
+ try:
1220
+ img_bytes = base64.b64decode(fig_base64)
1221
+ image = PIL.Image.open(io.BytesIO(img_bytes))
1222
+ content_parts.append(f"\n[{fig_ref}]")
1223
+ content_parts.append(image)
1224
+ except Exception as e:
1225
+ log.warning("Failed to add scheme image %s: %s", fig_ref, e)
1226
+
1227
+ try:
1228
+ # Use multimodal if we have images
1229
+ if len(content_parts) > 1:
1230
+ log.info("Using multimodal API with scheme images for compound resolution")
1231
+ response = model.generate_content(content_parts)
1232
+ raw_text = _extract_text(response).strip()
1233
+ else:
1234
+ # Text-only
1235
+ raw_text = generate_json_with_retry(
1236
+ model,
1237
+ prompt,
1238
+ debug_dir=debug_dir,
1239
+ tag="resolve_compounds",
1240
+ raw_response=True
1241
+ )
1242
+
1243
+ # Parse response
1244
+ data = json.loads(_extract_json(raw_text))
1245
+ resolved = data.get("resolved_compounds", {})
1246
+
1247
+ # Filter to only return non-null values
1248
+ result = {}
1249
+ for cid, name in resolved.items():
1250
+ if name and cid in missing_compounds:
1251
+ result[cid] = name
1252
+ log.info("Resolved compound %s: %s", cid, name[:60] + "..." if len(name) > 60 else name)
1253
+
1254
+ return result
1255
+
1256
+ except Exception as exc:
1257
+ log.error("Failed to resolve compounds: %s", exc)
1258
+ return {}
1259
+
973
1260
  def _extract_compound_mappings_with_figures(
974
1261
  text: str,
975
1262
  model,
@@ -1207,6 +1494,7 @@ def extract_compound_mappings(
1207
1494
  pdf_paths: List[Path] = None,
1208
1495
  iupac_sections: List[dict] = None,
1209
1496
  compound_ids: List[str] = None,
1497
+ primary_locations: List[dict] = None,
1210
1498
  debug_dir: str | Path | None = None,
1211
1499
  ) -> Dict[str, CompoundMapping]:
1212
1500
  """Extract compound ID to IUPAC name mappings from identified sections.
@@ -1284,6 +1572,63 @@ def extract_compound_mappings(
1284
1572
  if not mapping or not mapping.iupac_name:
1285
1573
  still_missing.append(cid)
1286
1574
 
1575
+ # Step 5.5: Use Gemini to resolve compound relationships and missing names
1576
+ if still_missing and len(mappings) > 0:
1577
+ log.info("Attempting to resolve %d missing compounds using pattern analysis...", len(still_missing))
1578
+
1579
+ # Prepare data about known compounds and missing ones
1580
+ known_compounds = {}
1581
+ for key, mapping in mappings.items():
1582
+ if mapping.iupac_name:
1583
+ # Get the primary identifier
1584
+ primary_id = mapping.identifiers[0] if mapping.identifiers else key
1585
+ known_compounds[primary_id] = mapping.iupac_name
1586
+
1587
+ # Extract primary location text if available
1588
+ primary_location_text = None
1589
+ if primary_locations and pdf_paths:
1590
+ # Get text from the first primary location (usually the main scheme)
1591
+ for loc in primary_locations[:1]: # Just the first one
1592
+ loc_str = loc.get('location', '')
1593
+ if loc_str:
1594
+ primary_text = _extract_text_around_reference(pdf_paths, loc_str, context_chars=10000)
1595
+ if primary_text:
1596
+ primary_location_text = primary_text
1597
+ log.info("Extracted %d chars from primary location %s for pattern analysis",
1598
+ len(primary_text), loc_str)
1599
+ break
1600
+
1601
+ # Ask Gemini to analyze patterns and resolve missing compounds
1602
+ resolved_mappings = _resolve_missing_compounds_with_gemini(
1603
+ model, known_compounds, still_missing,
1604
+ figure_images=getattr(extract_compound_mappings, '_figure_images_cache', {}),
1605
+ primary_location_text=primary_location_text,
1606
+ debug_dir=debug_dir
1607
+ )
1608
+
1609
+ # Merge resolved mappings
1610
+ resolved_count = 0
1611
+ for cid, iupac_name in resolved_mappings.items():
1612
+ key = cid.lower().strip()
1613
+ if key in mappings:
1614
+ if not mappings[key].iupac_name and iupac_name:
1615
+ mappings[key].iupac_name = iupac_name
1616
+ resolved_count += 1
1617
+ else:
1618
+ # Create new mapping
1619
+ new_mapping = CompoundMapping(
1620
+ identifiers=[cid],
1621
+ iupac_name=iupac_name,
1622
+ common_names=[],
1623
+ compound_type="product",
1624
+ source_location="Resolved from pattern analysis"
1625
+ )
1626
+ mappings[key] = new_mapping
1627
+ resolved_count += 1
1628
+
1629
+ if resolved_count > 0:
1630
+ log.info("Resolved %d compounds using pattern analysis", resolved_count)
1631
+
1287
1632
  # Step 6: Final fallback - use figures and full manuscript if compounds are still missing
1288
1633
  # COMMENTED OUT: Figure-based IUPAC extraction is unreliable
1289
1634
  # Generating IUPAC names from visual structures leads to errors
@@ -1525,24 +1870,30 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
1525
1870
  try:
1526
1871
  # Parse substrate IDs
1527
1872
  substrates = []
1528
- substrate_ids = item.get("substrate_ids") or []
1529
- # Also handle old format
1530
- if not substrate_ids and item.get("substrates"):
1531
- substrates_data = item.get("substrates") or []
1532
- for s in substrates_data:
1533
- if isinstance(s, dict):
1534
- substrate_ids.append(s.get("identifier") or s.get("name", ""))
1535
- else:
1536
- substrate_ids.append(str(s))
1873
+ substrate_ids = item.get("substrate_ids")
1537
1874
 
1538
- for sid in substrate_ids:
1539
- # Look up IUPAC name
1540
- iupac_name = None
1541
- mapping = compound_mappings.get(str(sid).lower())
1542
- if mapping:
1543
- iupac_name = mapping.iupac_name
1875
+ # Handle null substrate_ids
1876
+ if substrate_ids is None:
1877
+ # Leave substrates empty if substrate_ids is explicitly null
1878
+ pass
1879
+ else:
1880
+ # Also handle old format
1881
+ if not substrate_ids and item.get("substrates"):
1882
+ substrates_data = item.get("substrates") or []
1883
+ for s in substrates_data:
1884
+ if isinstance(s, dict):
1885
+ substrate_ids.append(s.get("identifier") or s.get("name", ""))
1886
+ else:
1887
+ substrate_ids.append(str(s))
1544
1888
 
1545
- substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
1889
+ for sid in substrate_ids:
1890
+ # Look up IUPAC name
1891
+ iupac_name = None
1892
+ mapping = compound_mappings.get(str(sid).lower())
1893
+ if mapping:
1894
+ iupac_name = mapping.iupac_name
1895
+
1896
+ substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
1546
1897
 
1547
1898
  # Parse product IDs
1548
1899
  products = []
@@ -1669,31 +2020,46 @@ def get_substrate_scope(
1669
2020
  time.sleep(2) # Rate limiting
1670
2021
  log.info("Extracting all substrate scope data from all identified sources...")
1671
2022
 
1672
- # Extract images for all figure locations
2023
+ # Extract images for all figure and scheme locations
1673
2024
  figure_images = {}
1674
2025
  for loc in locations:
1675
2026
  location_str = loc.get('location', '')
1676
- # Extract if it's marked as figure type OR if location contains "Figure" or "Fig"
1677
- if pdf_paths and ('figure' in location_str.lower() or 'fig' in location_str.lower() or loc.get('type') == 'figure'):
2027
+ location_type = loc.get('type', 'unknown')
2028
+
2029
+ # Extract if it's a figure, scheme, or contains those keywords
2030
+ should_extract = False
2031
+ if pdf_paths:
2032
+ if location_type in ['figure', 'scheme']:
2033
+ should_extract = True
2034
+ elif any(keyword in location_str.lower() for keyword in ['figure', 'fig', 'scheme']):
2035
+ should_extract = True
2036
+
2037
+ if should_extract:
1678
2038
  figure_ref = location_str
1679
2039
  confidence = loc.get('confidence', 0)
1680
- log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, loc.get('type', 'unknown'))
1681
- figure_image = extract_figure_image(pdf_paths, figure_ref)
2040
+ log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
2041
+
2042
+ # Use appropriate extraction function based on type
2043
+ if 'scheme' in location_str.lower() or location_type == 'scheme':
2044
+ figure_image = extract_scheme_image(pdf_paths, figure_ref)
2045
+ else:
2046
+ figure_image = extract_figure_image(pdf_paths, figure_ref)
2047
+
1682
2048
  if figure_image:
1683
- log.info("Successfully extracted figure image for %s (%d bytes)",
1684
- figure_ref, len(figure_image))
2049
+ log.info("Successfully extracted %s image for %s (%d bytes)",
2050
+ location_type, figure_ref, len(figure_image))
1685
2051
  figure_images[figure_ref] = figure_image
1686
2052
 
1687
2053
  # Save figure image if debug_dir is enabled
1688
2054
  if debug_dir:
1689
2055
  import base64
1690
2056
  debug_path = Path(debug_dir)
1691
- image_path = debug_path / f"figure_image_{figure_ref.replace(' ', '_')}.png"
2057
+ image_path = debug_path / f"{location_type}_image_{figure_ref.replace(' ', '_')}.png"
1692
2058
  with open(image_path, 'wb') as f:
1693
2059
  f.write(base64.b64decode(figure_image))
1694
- log.info("Saved figure image to %s", image_path)
2060
+ log.info("Saved %s image to %s", location_type, image_path)
1695
2061
  else:
1696
- log.warning("Failed to extract figure image for %s", figure_ref)
2062
+ log.warning("Failed to extract %s image for %s", location_type, figure_ref)
1697
2063
 
1698
2064
  # Extract all substrate scope data in one call
1699
2065
  raw_entries = extract_all_substrate_scope_data(
@@ -1734,6 +2100,7 @@ def get_substrate_scope(
1734
2100
  pdf_paths=pdf_paths,
1735
2101
  iupac_sections=iupac_sections,
1736
2102
  compound_ids=list(all_compound_ids),
2103
+ primary_locations=locations,
1737
2104
  debug_dir=debug_dir)
1738
2105
 
1739
2106
  # Step 5: Parse all entries with compound mappings
@@ -1793,11 +2160,65 @@ def validate_scope_entries(entries: List[ScopeEntry]) -> List[str]:
1793
2160
 
1794
2161
  return warnings
1795
2162
 
2163
def _match_enzymes_with_gemini(
    scope_enzymes: List[str],
    lineage_enzymes: List[str],
    model,
    debug_dir: Optional[Path] = None
) -> Dict[str, str]:
    """Use Gemini to match enzyme names between substrate scope and lineage data.

    Args:
        scope_enzymes: Enzyme names as they appear in the substrate scope data.
        lineage_enzymes: Enzyme names as they appear in the lineage data.
        model: Model handle forwarded to ``generate_json_with_retry``.
        debug_dir: Optional debug directory forwarded to ``generate_json_with_retry``.

    Returns:
        Substrate-scope enzyme name -> lineage enzyme name. Only matches whose
        target is one of ``lineage_enzymes`` are kept. Empty when either input
        list is empty or the model call fails (the failure is logged, not raised).
    """
    # Nothing can match against an empty side: skip the model call.
    if not scope_enzymes or not lineage_enzymes:
        return {}

    scope_listing = "\n".join(f"- {e}" for e in sorted(set(scope_enzymes)))
    lineage_listing = "\n".join(f"- {e}" for e in sorted(set(lineage_enzymes)))

    # The prompt is assembled by plain concatenation (not str.format or an
    # f-string), so the JSON example uses single braces; doubled braces would
    # reach the model literally.
    # NOTE(review): the "[L/D]-ApPgb-αEsA-G[number]" hint looks specific to one
    # paper's naming scheme - confirm it should apply to every input.
    prompt = """You are an expert at matching enzyme variant names that may have Unicode or formatting differences.

ENZYME NAMES FROM SUBSTRATE SCOPE DATA:
""" + scope_listing + """

ENZYME NAMES FROM LINEAGE DATA:
""" + lineage_listing + """

TASK:
Match each substrate scope enzyme name to its corresponding lineage enzyme name.
These are the SAME enzymes but may have different formatting:
- Unicode vs ASCII characters (e.g., "ʟ" vs "L", "ᴅ" vs "D")
- Different capitalization
- Minor formatting differences

IMPORTANT:
- Only match enzymes that are clearly the same variant
- Look for matching generation numbers (G0, G1, G2, etc.)
- Consider the pattern: [L/D]-ApPgb-αEsA-G[number]
- If no clear match exists, return null for that enzyme

Return as JSON:
{
  "enzyme_matches": {
    "substrate_scope_enzyme_name": "matching_lineage_enzyme_name_or_null"
  }
}
"""

    try:
        response = generate_json_with_retry(
            model,
            prompt,
            debug_dir=debug_dir,
            tag="enzyme_matching"
        )

        raw_matches = response.get("enzyme_matches", {}) if isinstance(response, dict) else {}
        if not isinstance(raw_matches, dict):
            raw_matches = {}

        # Keep only targets that really exist in the lineage list; this drops
        # null, the string "null", and names the model invented.
        valid_targets = {str(e) for e in lineage_enzymes}
        matches = {}
        for scope_name, lineage_name in raw_matches.items():
            if not isinstance(lineage_name, str):
                continue
            if lineage_name not in valid_targets:
                lineage_name = lineage_name.strip()
            if lineage_name not in valid_targets:
                log.debug("Ignoring unmatched enzyme mapping %r -> %r", scope_name, lineage_name)
                continue
            matches[scope_name] = lineage_name

        log.info("Gemini matched %d enzyme names", len(matches))
        return matches

    except Exception as exc:
        log.error("Failed to match enzymes with Gemini: %s", exc)
        return {}
2215
+
1796
2216
  def merge_with_lineage(
1797
2217
  entries: List[ScopeEntry],
1798
- lineage_csv: Optional[Path]
2218
+ lineage_csv: Optional[Path],
2219
+ model=None
1799
2220
  ) -> List[ScopeEntry]:
1800
- """Merge substrate scope entries with enzyme lineage data."""
2221
+ """Merge substrate scope entries with enzyme lineage data using Gemini for matching."""
1801
2222
  if not lineage_csv or not lineage_csv.exists():
1802
2223
  return entries
1803
2224
 
@@ -1806,32 +2227,60 @@ def merge_with_lineage(
1806
2227
  lineage_df = pd.read_csv(lineage_csv)
1807
2228
  log.info("Loading lineage data from %s (%d enzymes)", lineage_csv, len(lineage_df))
1808
2229
 
1809
- # Create lookup map (case-insensitive)
2230
+ # Get unique enzyme names from both sources
2231
+ scope_enzymes = list(set(entry.enzyme_id for entry in entries if entry.enzyme_id))
2232
+ lineage_enzymes = list(lineage_df['enzyme_id'].dropna().unique())
2233
+
2234
+ log.info("Found %d unique enzymes in substrate scope data", len(scope_enzymes))
2235
+ log.info("Found %d unique enzymes in lineage data", len(lineage_enzymes))
2236
+
2237
+ # Use Gemini to match enzyme names if model is provided
2238
+ if model and scope_enzymes and lineage_enzymes:
2239
+ log.info("Using Gemini to match enzyme names between datasets...")
2240
+ enzyme_matches = _match_enzymes_with_gemini(
2241
+ scope_enzymes,
2242
+ lineage_enzymes,
2243
+ model,
2244
+ debug_dir=Path("examples/amino_esters_test/substrate_scope_debug_v4") if Path("examples/amino_esters_test/substrate_scope_debug_v4").exists() else None
2245
+ )
2246
+ else:
2247
+ # Fallback to simple case-insensitive matching
2248
+ log.info("Using simple case-insensitive matching (no model provided)")
2249
+ enzyme_matches = {}
2250
+ for scope_enzyme in scope_enzymes:
2251
+ for lineage_enzyme in lineage_enzymes:
2252
+ if scope_enzyme.lower() == lineage_enzyme.lower():
2253
+ enzyme_matches[scope_enzyme] = lineage_enzyme
2254
+ break
2255
+
2256
+ # Create lookup map with matched names
1810
2257
  lineage_map = {}
1811
2258
  for _, row in lineage_df.iterrows():
1812
2259
  enzyme_id = str(row.get('enzyme_id', ''))
1813
- lineage_map[enzyme_id.lower()] = {
1814
- 'parent_id': row.get('parent_id'),
1815
- 'mutations': row.get('mutations'),
2260
+ lineage_map[enzyme_id] = {
2261
+ 'parent_id': row.get('parent_enzyme_id', ''), # Note: might be 'parent_enzyme_id' not 'parent_id'
2262
+ 'mutations': row.get('mutations', ''),
1816
2263
  'generation': row.get('generation'),
1817
- 'aa_seq': row.get('aa_seq'),
1818
- 'dna_seq': row.get('dna_seq'),
1819
- 'confidence': row.get('confidence')
2264
+ 'aa_seq': row.get('protein_sequence', '') or row.get('aa_seq', ''), # Try both column names
2265
+ 'dna_seq': row.get('dna_seq', ''),
2266
+ 'confidence': row.get('seq_confidence', '') or row.get('confidence', '')
1820
2267
  }
1821
2268
 
1822
- # Merge
2269
+ # Merge using matched names
1823
2270
  merged_count = 0
1824
2271
  for entry in entries:
1825
- key = entry.enzyme_id.lower()
1826
- if key in lineage_map:
1827
- data = lineage_map[key]
1828
- entry.parent_id = data['parent_id']
1829
- entry.mutations = data['mutations']
1830
- entry.generation = data['generation']
1831
- entry.aa_seq = data['aa_seq']
1832
- entry.dna_seq = data['dna_seq']
1833
- entry.confidence = data['confidence']
1834
- merged_count += 1
2272
+ if entry.enzyme_id in enzyme_matches:
2273
+ matched_name = enzyme_matches[entry.enzyme_id]
2274
+ if matched_name and matched_name in lineage_map:
2275
+ data = lineage_map[matched_name]
2276
+ entry.parent_id = data['parent_id']
2277
+ entry.mutations = data['mutations']
2278
+ entry.generation = data['generation']
2279
+ entry.aa_seq = data['aa_seq']
2280
+ entry.dna_seq = data['dna_seq']
2281
+ entry.confidence = data['confidence']
2282
+ merged_count += 1
2283
+ log.debug("Merged %s -> %s", entry.enzyme_id, matched_name)
1835
2284
 
1836
2285
  log.info("Merged lineage data for %d/%d entries", merged_count, len(entries))
1837
2286
 
@@ -1957,7 +2406,7 @@ def run_pipeline(
1957
2406
 
1958
2407
  # 4. Merge with lineage if available ---------------------------------------
1959
2408
  if lineage_csv:
1960
- entries = merge_with_lineage(entries, Path(lineage_csv))
2409
+ entries = merge_with_lineage(entries, Path(lineage_csv), model)
1961
2410
 
1962
2411
  # 5. Validate entries ------------------------------------------------------
1963
2412
  warnings = validate_scope_entries(entries)