debase 0.1.11__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +623 -234
- debase/lineage_format.py +113 -11
- debase/reaction_info_extractor.py +21 -7
- debase/substrate_scope_extractor.py +516 -67
- debase/wrapper.py +301 -67
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/METADATA +1 -1
- debase-0.1.17.dist-info/RECORD +17 -0
- debase-0.1.11.dist-info/RECORD +0 -17
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/WHEEL +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/entry_points.txt +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.11.dist-info → debase-0.1.17.dist-info}/top_level.txt +0 -0
@@ -346,6 +346,103 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
|
|
346
346
|
log.warning("Could not find figure caption for '%s'", figure_ref)
|
347
347
|
return None
|
348
348
|
|
349
|
+
def extract_scheme_image(pdf_paths: List[Path], scheme_ref: str) -> Optional[str]:
    """Render the page region below a scheme title as a base64-encoded PNG.

    Every page of every PDF is searched for ``scheme_ref``. A hit is treated
    as the scheme title (rather than an in-text mention) only when the text in
    a box around it contains one of a few scope-related keywords. The region
    from just below the title down to the nearest substantial text block (or
    the bottom of the page) is rendered at 2x zoom.

    Args:
        pdf_paths: List of PDF paths to search, in order.
        scheme_ref: Scheme reference to search for (e.g., "Scheme 2" or "Scheme S2").

    Returns:
        Base64-encoded PNG string for the first matching page, or None if
        ``pdf_paths`` is empty or no page contains a matching title.
    """
    if not pdf_paths:
        return None

    # Words that distinguish a scheme title from a passing reference in text.
    title_keywords = ('substrate scope', 'reaction', 'synthesis', 'procedure', 'explored')

    # Search strings, most specific first; the bare reference is the fallback.
    variations = [
        f"{scheme_ref}.",  # "Scheme 2."
        f"{scheme_ref}:",  # "Scheme 2:"
        f"{scheme_ref} ",  # "Scheme 2 "
        scheme_ref,
    ]

    zoom = 2  # render scale; also used to report the pixel size in the log

    for pdf_path in pdf_paths:
        doc = _open_doc(pdf_path)
        try:
            for page_num in range(doc.page_count):
                page = doc.load_page(page_num)

                # Find the first hit on this page that looks like a title.
                scheme_rect = None
                for variation in variations:
                    for rect in page.search_for(variation, quads=False):
                        x0, y0, x1, y1 = rect
                        text_around = page.get_textbox(
                            fitz.Rect(x0 - 50, y0 - 5, x1 + 400, y1 + 20)
                        )
                        lowered = text_around.lower()
                        if any(keyword in lowered for keyword in title_keywords):
                            scheme_rect = rect
                            break
                    if scheme_rect is not None:
                        break

                if scheme_rect is None:
                    continue

                log.info("Found scheme on page %d at y=%.0f", page_num + 1, scheme_rect.y0)

                page_rect = page.rect

                # Start just below the scheme title.
                top_margin = max(0, scheme_rect.y1 + 5)

                # End at the NEAREST substantial text block below the title
                # (likely the next section). Block order from get_text("blocks")
                # is not guaranteed to be top-to-bottom, so take the minimum y
                # rather than the first match in list order.
                cutoff = scheme_rect.y1 + 50
                candidate_tops = [
                    block[1]  # y0 of the block
                    for block in page.get_text("blocks")
                    if block[1] > cutoff and len(block[4]) > 100  # block[4] is the text
                ]
                if candidate_tops:
                    bottom_y = min(candidate_tops) - 10
                else:
                    bottom_y = page_rect.height  # default to the rest of the page

                # Full page width, from below the title to the computed bottom.
                clip_rect = fitz.Rect(0, top_margin, page_rect.width, bottom_y)

                pix = page.get_pixmap(clip=clip_rect, matrix=fitz.Matrix(zoom, zoom))
                img_bytes = pix.tobytes("png")
                log.info("Extracted scheme region: %.0fx%.0f pixels from page %d",
                         clip_rect.width * zoom, clip_rect.height * zoom, page_num + 1)

                return b64encode(img_bytes).decode()

        finally:
            # Runs on the early return above as well, so the document is always closed.
            doc.close()

    log.warning("Could not find scheme '%s'", scheme_ref)
    return None
|
445
|
+
|
349
446
|
|
350
447
|
def _extract_text_around_reference(pdf_paths: List[Path], ref: str, context_chars: int = 2000) -> str:
|
351
448
|
"""Extract text around a specific reference (e.g., 'Figure 3')."""
|
@@ -765,21 +862,28 @@ Return JSON:
|
|
765
862
|
""".strip()
|
766
863
|
|
767
864
|
_COMPOUND_MAPPING_PROMPT = """
|
768
|
-
Extract compound identifiers and their chemical names
|
865
|
+
Extract compound identifiers and their chemical names from the provided text and any scheme images.
|
866
|
+
|
867
|
+
TASK:
|
868
|
+
1. First, extract all compound IDs and names that are explicitly written in the text
|
869
|
+
2. Then, analyze any provided scheme images to identify compound labeling patterns
|
870
|
+
3. Look for relationships between compounds (e.g., when multiple variants share a base structure)
|
871
|
+
4. Note any systematic naming conventions used in the schemes
|
769
872
|
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
5. If a product was "not detected" or "not formed", return null for iupac_name
|
873
|
+
ANALYSIS GUIDELINES:
|
874
|
+
- Some papers define a base compound and use letter suffixes for variants
|
875
|
+
- Schemes often show relationships that aren't explicitly stated in text
|
876
|
+
- Pay attention to how compounds are grouped or connected in schemes
|
877
|
+
- Identify any patterns in how compounds are numbered/lettered
|
776
878
|
|
777
879
|
For each compound:
|
778
|
-
- identifier: The exact compound ID as written
|
779
|
-
- iupac_name: The chemical name if
|
780
|
-
- common_names: Any alternative names
|
880
|
+
- identifier: The exact compound ID as written
|
881
|
+
- iupac_name: The chemical name if found in text
|
882
|
+
- common_names: Any alternative names
|
781
883
|
- compound_type: substrate/product/reagent/catalyst/other
|
782
|
-
- source_location:
|
884
|
+
- source_location: Where found (text excerpt or "Scheme X")
|
885
|
+
- related_compounds: List of related compound IDs if a pattern is detected
|
886
|
+
- pattern_notes: Description of any labeling pattern observed
|
783
887
|
|
784
888
|
Return as JSON:
|
785
889
|
{
|
@@ -789,12 +893,12 @@ Return as JSON:
|
|
789
893
|
"iupac_name": "string or null",
|
790
894
|
"common_names": ["array of strings"],
|
791
895
|
"compound_type": "string",
|
792
|
-
"source_location": "string"
|
896
|
+
"source_location": "string",
|
897
|
+
"related_compounds": ["array of related IDs"],
|
898
|
+
"pattern_notes": "string or null"
|
793
899
|
}
|
794
900
|
]
|
795
901
|
}
|
796
|
-
|
797
|
-
Note: It is better to return null than to hallucinate or infer chemical structures.
|
798
902
|
""".strip()
|
799
903
|
|
800
904
|
_SUBSTRATE_SCOPE_PROMPT = """
|
@@ -803,13 +907,17 @@ Extract ALL substrate scope data from the primary sources in one complete extrac
|
|
803
907
|
|
804
908
|
For EACH reaction, extract:
|
805
909
|
1. Enzyme variant ID
|
806
|
-
2. Substrate identifiers (e.g., "6a", "5")
|
910
|
+
2. Substrate identifiers (e.g., "6a", "5") - ONLY if explicitly shown
|
807
911
|
3. Product identifiers (e.g., "7a", "7b", "7d", "7e") - ALWAYS include even if no yield
|
808
912
|
4. Performance metrics (yield%, ee%, dr, TTN)
|
809
913
|
5. Reaction conditions (temperature, pH, buffer, substrate concentrations - NOT dithionite/reducing agents)
|
810
914
|
6. Data location (which figure/table this comes from)
|
811
915
|
|
812
|
-
CRITICAL - NO HALLUCINATION OR
|
916
|
+
CRITICAL - NO HALLUCINATION OR INFERENCE OF IDENTIFIERS:
|
917
|
+
- SUBSTRATE IDS: Only extract substrate identifiers that are EXPLICITLY WRITTEN in the source
|
918
|
+
- DO NOT INFER substrate IDs from patterns (e.g., if you see product "4a", DO NOT assume substrate is "3a")
|
919
|
+
- If substrate ID is not explicitly shown, use null for substrate_ids
|
920
|
+
- Product IDs should be extracted as shown (since they are usually labeled in schemes)
|
813
921
|
- Extract values EXACTLY as written in the primary source - NO CHANGES WHATSOEVER
|
814
922
|
- DO NOT round, estimate, convert, or modify any numbers
|
815
923
|
- If the text shows "53%", report 53.0, not 53 or 53.00
|
@@ -821,19 +929,20 @@ CRITICAL - NO HALLUCINATION OR MODIFICATION:
|
|
821
929
|
- If no value is shown, return null, not 0 or empty string
|
822
930
|
- Extract ALL reactions from ALL identified locations
|
823
931
|
- Use compound identifiers EXACTLY as shown (not IUPAC names)
|
824
|
-
- For every entry, there needs to be identifier for both substrates and products, even if yield is null or activity is 0.
|
825
932
|
- Extract reaction conditions EXACTLY as written - NO PARAPHRASING
|
826
933
|
- IMPORTANT: Substrate concentration refers to the concentration of the actual chemical substrates being transformed in the reaction, NOT reducing agents (e.g., dithionite, NADH) or other additives
|
827
934
|
|
828
|
-
IMPORTANT:
|
829
|
-
|
935
|
+
IMPORTANT:
|
936
|
+
- Substrate IDs must be EXPLICITLY visible in the source - DO NOT INFER FROM PATTERNS
|
937
|
+
- Product IDs should be extracted as labeled in the scheme/figure
|
938
|
+
- If only product ID is shown with yields/ee data, substrate_ids should be null
|
830
939
|
|
831
940
|
Return as JSON:
|
832
941
|
{{
|
833
942
|
"substrate_scope_data": [
|
834
943
|
{{
|
835
944
|
"enzyme_id": "enzyme variant name",
|
836
|
-
"substrate_ids": ["list of substrate identifiers"],
|
945
|
+
"substrate_ids": null or ["list of EXPLICITLY shown substrate identifiers"],
|
837
946
|
"product_ids": ["list of product identifiers"],
|
838
947
|
"yield_percent": null or number,
|
839
948
|
"ee_percent": null or number,
|
@@ -959,6 +1068,10 @@ def _extract_compound_mappings_from_text(
|
|
959
1068
|
source_location=item.get("source_location")
|
960
1069
|
)
|
961
1070
|
|
1071
|
+
# Store pattern information for post-processing
|
1072
|
+
mapping._related_compounds = item.get("related_compounds", [])
|
1073
|
+
mapping._pattern_notes = item.get("pattern_notes", "")
|
1074
|
+
|
962
1075
|
# Create lookup entries for all identifiers and common names
|
963
1076
|
for identifier in mapping.identifiers + mapping.common_names:
|
964
1077
|
if identifier:
|
@@ -970,6 +1083,180 @@ def _extract_compound_mappings_from_text(
|
|
970
1083
|
log.error("Failed to extract compound mappings: %s", exc)
|
971
1084
|
return {}
|
972
1085
|
|
1086
|
+
def _extract_json(text: str) -> str:
|
1087
|
+
"""Extract JSON content from raw LLM response text."""
|
1088
|
+
# Remove common markdown code block markers
|
1089
|
+
text = text.strip()
|
1090
|
+
if text.startswith('```json'):
|
1091
|
+
text = text[7:]
|
1092
|
+
elif text.startswith('```'):
|
1093
|
+
text = text[3:]
|
1094
|
+
|
1095
|
+
if text.endswith('```'):
|
1096
|
+
text = text[:-3]
|
1097
|
+
|
1098
|
+
# Find JSON structure
|
1099
|
+
text = text.strip()
|
1100
|
+
|
1101
|
+
# Look for JSON object or array
|
1102
|
+
json_start = -1
|
1103
|
+
json_end = -1
|
1104
|
+
|
1105
|
+
for i, char in enumerate(text):
|
1106
|
+
if char in '[{' and json_start == -1:
|
1107
|
+
json_start = i
|
1108
|
+
break
|
1109
|
+
|
1110
|
+
if json_start >= 0:
|
1111
|
+
# Find the matching closing bracket
|
1112
|
+
bracket_stack = []
|
1113
|
+
in_string = False
|
1114
|
+
escape_next = False
|
1115
|
+
|
1116
|
+
for i in range(json_start, len(text)):
|
1117
|
+
char = text[i]
|
1118
|
+
|
1119
|
+
if escape_next:
|
1120
|
+
escape_next = False
|
1121
|
+
continue
|
1122
|
+
|
1123
|
+
if char == '\\':
|
1124
|
+
escape_next = True
|
1125
|
+
continue
|
1126
|
+
|
1127
|
+
if char == '"' and not escape_next:
|
1128
|
+
in_string = not in_string
|
1129
|
+
continue
|
1130
|
+
|
1131
|
+
if in_string:
|
1132
|
+
continue
|
1133
|
+
|
1134
|
+
if char in '[{':
|
1135
|
+
bracket_stack.append(char)
|
1136
|
+
elif char in ']}':
|
1137
|
+
if bracket_stack:
|
1138
|
+
opening = bracket_stack.pop()
|
1139
|
+
if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
|
1140
|
+
if not bracket_stack: # Found complete JSON
|
1141
|
+
json_end = i + 1
|
1142
|
+
break
|
1143
|
+
|
1144
|
+
if json_end > json_start:
|
1145
|
+
return text[json_start:json_end]
|
1146
|
+
|
1147
|
+
# If no JSON found, return the original text
|
1148
|
+
return text
|
1149
|
+
|
1150
|
+
def _resolve_missing_compounds_with_gemini(
    model,
    known_compounds: Dict[str, str],
    missing_compounds: List[str],
    figure_images: Optional[Dict[str, str]] = None,
    primary_location_text: Optional[str] = None,
    debug_dir: str | Path | None = None,
) -> Dict[str, str]:
    """Use Gemini to resolve missing compound names based on patterns.

    Builds a prompt listing the known ID -> IUPAC name pairs and the IDs that
    still lack a name, optionally followed by text from the primary location.
    Scheme images are attached when available, in which case the model is
    called directly with multimodal content; otherwise the text-only retry
    helper is used.

    Args:
        model: Model object; must provide ``generate_content`` for the
            multimodal path.
        known_compounds: Compound ID -> IUPAC name for already-resolved compounds.
        missing_compounds: Compound IDs that still need an IUPAC name.
        figure_images: Optional reference -> base64 image mapping; only entries
            whose reference contains "scheme" (case-insensitive) are attached.
        primary_location_text: Optional context text; at most the first 10000
            characters are included in the prompt.
        debug_dir: Passed through to ``generate_json_with_retry`` on the
            text-only path.

    Returns:
        Compound ID -> IUPAC name for requested IDs that received a non-empty
        string name. Empty dict on any failure (best-effort; errors are logged).
    """
    prompt = """You are an expert chemist analyzing compound naming patterns in a chemistry paper.

KNOWN COMPOUNDS WITH IUPAC NAMES:
"""

    # Add known compounds
    for cid, name in sorted(known_compounds.items()):
        prompt += f"- Compound {cid}: {name}\n"

    prompt += f"""

MISSING COMPOUNDS (need IUPAC names):
{', '.join(sorted(missing_compounds))}

TASK:
1. Analyze the numbering/lettering pattern used in this paper
2. Look for relationships between compounds (e.g., 3 → 3a, 3b as enantiomers)
3. Determine the IUPAC names for the missing compounds

IMPORTANT PATTERNS TO CONSIDER:
- If compound "X" has a known structure and "Xa", "Xb" are missing, they might be stereoisomers
- Common patterns: 'a' = (S)-enantiomer, 'b' = (R)-enantiomer (but verify from context)
- Some papers use 'a/b' for different stereogenic centers or regioisomers
- Look at the scheme images AND the text to understand relationships

For each missing compound, provide the most likely IUPAC name based on:
- The pattern analysis from text and schemes
- Standard chemical nomenclature rules
- The relationship to known compounds

Return ONLY compounds where you have high confidence in the IUPAC name.
If unsure, return null for that compound.

Return as JSON:
{{
  "resolved_compounds": {{
    "compound_id": "IUPAC name or null"
  }}
}}
"""

    # Add primary location text if available
    if primary_location_text:
        # Limit to prevent token overflow. The slice is done here so that no
        # code comment ends up inside the prompt string itself.
        excerpt = primary_location_text[:10000]
        prompt += f"""

PRIMARY SUBSTRATE SCOPE TEXT (from scheme/table):
{excerpt}
"""

    # Decode scheme images first; the multimodal path is taken only if at
    # least one image was actually attached.
    scheme_images = []
    if figure_images:
        import PIL.Image
        import io
        import base64

        for fig_ref, fig_base64 in figure_images.items():
            if "scheme" not in fig_ref.lower():
                continue
            try:
                img_bytes = base64.b64decode(fig_base64)
                image = PIL.Image.open(io.BytesIO(img_bytes))
            except Exception as e:
                log.warning("Failed to add scheme image %s: %s", fig_ref, e)
                continue
            scheme_images.append((fig_ref, image))

    content_parts = [prompt]
    if scheme_images:
        content_parts.append("\n\nANALYZE THE FOLLOWING SCHEME IMAGES TO UNDERSTAND THE COMPOUND RELATIONSHIPS:")
        for fig_ref, image in scheme_images:
            content_parts.append(f"\n[{fig_ref}]")
            content_parts.append(image)

    try:
        if scheme_images:
            log.info("Using multimodal API with scheme images for compound resolution")
            response = model.generate_content(content_parts)
            raw_text = _extract_text(response).strip()
        else:
            # Text-only: goes through the retry helper, which also receives debug_dir.
            raw_text = generate_json_with_retry(
                model,
                prompt,
                debug_dir=debug_dir,
                tag="resolve_compounds",
                raw_response=True
            )

        # Parse response
        data = json.loads(_extract_json(raw_text))
        resolved = data.get("resolved_compounds") if isinstance(data, dict) else None
        if not isinstance(resolved, dict):
            log.warning("Compound resolution response had no usable 'resolved_compounds' object")
            return {}

        # Keep only requested IDs with a non-empty string name; a malformed
        # entry is skipped instead of discarding every resolved compound.
        wanted = set(missing_compounds)
        result = {}
        for cid, name in resolved.items():
            if cid not in wanted or not name or not isinstance(name, str):
                continue
            result[cid] = name
            log.info("Resolved compound %s: %s", cid, name[:60] + "..." if len(name) > 60 else name)

        return result

    except Exception as exc:
        # Deliberately best-effort: resolution failure must not abort the pipeline.
        log.error("Failed to resolve compounds: %s", exc)
        return {}
|
1259
|
+
|
973
1260
|
def _extract_compound_mappings_with_figures(
|
974
1261
|
text: str,
|
975
1262
|
model,
|
@@ -1207,6 +1494,7 @@ def extract_compound_mappings(
|
|
1207
1494
|
pdf_paths: List[Path] = None,
|
1208
1495
|
iupac_sections: List[dict] = None,
|
1209
1496
|
compound_ids: List[str] = None,
|
1497
|
+
primary_locations: List[dict] = None,
|
1210
1498
|
debug_dir: str | Path | None = None,
|
1211
1499
|
) -> Dict[str, CompoundMapping]:
|
1212
1500
|
"""Extract compound ID to IUPAC name mappings from identified sections.
|
@@ -1284,6 +1572,63 @@ def extract_compound_mappings(
|
|
1284
1572
|
if not mapping or not mapping.iupac_name:
|
1285
1573
|
still_missing.append(cid)
|
1286
1574
|
|
1575
|
+
# Step 5.5: Use Gemini to resolve compound relationships and missing names
|
1576
|
+
if still_missing and len(mappings) > 0:
|
1577
|
+
log.info("Attempting to resolve %d missing compounds using pattern analysis...", len(still_missing))
|
1578
|
+
|
1579
|
+
# Prepare data about known compounds and missing ones
|
1580
|
+
known_compounds = {}
|
1581
|
+
for key, mapping in mappings.items():
|
1582
|
+
if mapping.iupac_name:
|
1583
|
+
# Get the primary identifier
|
1584
|
+
primary_id = mapping.identifiers[0] if mapping.identifiers else key
|
1585
|
+
known_compounds[primary_id] = mapping.iupac_name
|
1586
|
+
|
1587
|
+
# Extract primary location text if available
|
1588
|
+
primary_location_text = None
|
1589
|
+
if primary_locations and pdf_paths:
|
1590
|
+
# Get text from the first primary location (usually the main scheme)
|
1591
|
+
for loc in primary_locations[:1]: # Just the first one
|
1592
|
+
loc_str = loc.get('location', '')
|
1593
|
+
if loc_str:
|
1594
|
+
primary_text = _extract_text_around_reference(pdf_paths, loc_str, context_chars=10000)
|
1595
|
+
if primary_text:
|
1596
|
+
primary_location_text = primary_text
|
1597
|
+
log.info("Extracted %d chars from primary location %s for pattern analysis",
|
1598
|
+
len(primary_text), loc_str)
|
1599
|
+
break
|
1600
|
+
|
1601
|
+
# Ask Gemini to analyze patterns and resolve missing compounds
|
1602
|
+
resolved_mappings = _resolve_missing_compounds_with_gemini(
|
1603
|
+
model, known_compounds, still_missing,
|
1604
|
+
figure_images=getattr(extract_compound_mappings, '_figure_images_cache', {}),
|
1605
|
+
primary_location_text=primary_location_text,
|
1606
|
+
debug_dir=debug_dir
|
1607
|
+
)
|
1608
|
+
|
1609
|
+
# Merge resolved mappings
|
1610
|
+
resolved_count = 0
|
1611
|
+
for cid, iupac_name in resolved_mappings.items():
|
1612
|
+
key = cid.lower().strip()
|
1613
|
+
if key in mappings:
|
1614
|
+
if not mappings[key].iupac_name and iupac_name:
|
1615
|
+
mappings[key].iupac_name = iupac_name
|
1616
|
+
resolved_count += 1
|
1617
|
+
else:
|
1618
|
+
# Create new mapping
|
1619
|
+
new_mapping = CompoundMapping(
|
1620
|
+
identifiers=[cid],
|
1621
|
+
iupac_name=iupac_name,
|
1622
|
+
common_names=[],
|
1623
|
+
compound_type="product",
|
1624
|
+
source_location="Resolved from pattern analysis"
|
1625
|
+
)
|
1626
|
+
mappings[key] = new_mapping
|
1627
|
+
resolved_count += 1
|
1628
|
+
|
1629
|
+
if resolved_count > 0:
|
1630
|
+
log.info("Resolved %d compounds using pattern analysis", resolved_count)
|
1631
|
+
|
1287
1632
|
# Step 6: Final fallback - use figures and full manuscript if compounds are still missing
|
1288
1633
|
# COMMENTED OUT: Figure-based IUPAC extraction is unreliable
|
1289
1634
|
# Generating IUPAC names from visual structures leads to errors
|
@@ -1525,24 +1870,30 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
|
|
1525
1870
|
try:
|
1526
1871
|
# Parse substrate IDs
|
1527
1872
|
substrates = []
|
1528
|
-
substrate_ids = item.get("substrate_ids")
|
1529
|
-
# Also handle old format
|
1530
|
-
if not substrate_ids and item.get("substrates"):
|
1531
|
-
substrates_data = item.get("substrates") or []
|
1532
|
-
for s in substrates_data:
|
1533
|
-
if isinstance(s, dict):
|
1534
|
-
substrate_ids.append(s.get("identifier") or s.get("name", ""))
|
1535
|
-
else:
|
1536
|
-
substrate_ids.append(str(s))
|
1873
|
+
substrate_ids = item.get("substrate_ids")
|
1537
1874
|
|
1538
|
-
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
|
1875
|
+
# Handle null substrate_ids
|
1876
|
+
if substrate_ids is None:
|
1877
|
+
# Leave substrates empty if substrate_ids is explicitly null
|
1878
|
+
pass
|
1879
|
+
else:
|
1880
|
+
# Also handle old format
|
1881
|
+
if not substrate_ids and item.get("substrates"):
|
1882
|
+
substrates_data = item.get("substrates") or []
|
1883
|
+
for s in substrates_data:
|
1884
|
+
if isinstance(s, dict):
|
1885
|
+
substrate_ids.append(s.get("identifier") or s.get("name", ""))
|
1886
|
+
else:
|
1887
|
+
substrate_ids.append(str(s))
|
1544
1888
|
|
1545
|
-
|
1889
|
+
for sid in substrate_ids:
|
1890
|
+
# Look up IUPAC name
|
1891
|
+
iupac_name = None
|
1892
|
+
mapping = compound_mappings.get(str(sid).lower())
|
1893
|
+
if mapping:
|
1894
|
+
iupac_name = mapping.iupac_name
|
1895
|
+
|
1896
|
+
substrates.append(SubstrateProduct(name=str(sid), iupac_name=iupac_name))
|
1546
1897
|
|
1547
1898
|
# Parse product IDs
|
1548
1899
|
products = []
|
@@ -1669,31 +2020,46 @@ def get_substrate_scope(
|
|
1669
2020
|
time.sleep(2) # Rate limiting
|
1670
2021
|
log.info("Extracting all substrate scope data from all identified sources...")
|
1671
2022
|
|
1672
|
-
# Extract images for all figure locations
|
2023
|
+
# Extract images for all figure and scheme locations
|
1673
2024
|
figure_images = {}
|
1674
2025
|
for loc in locations:
|
1675
2026
|
location_str = loc.get('location', '')
|
1676
|
-
|
1677
|
-
|
2027
|
+
location_type = loc.get('type', 'unknown')
|
2028
|
+
|
2029
|
+
# Extract if it's a figure, scheme, or contains those keywords
|
2030
|
+
should_extract = False
|
2031
|
+
if pdf_paths:
|
2032
|
+
if location_type in ['figure', 'scheme']:
|
2033
|
+
should_extract = True
|
2034
|
+
elif any(keyword in location_str.lower() for keyword in ['figure', 'fig', 'scheme']):
|
2035
|
+
should_extract = True
|
2036
|
+
|
2037
|
+
if should_extract:
|
1678
2038
|
figure_ref = location_str
|
1679
2039
|
confidence = loc.get('confidence', 0)
|
1680
|
-
log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence,
|
1681
|
-
|
2040
|
+
log.info("Extracting image for %s (confidence: %d%%, type: %s)", figure_ref, confidence, location_type)
|
2041
|
+
|
2042
|
+
# Use appropriate extraction function based on type
|
2043
|
+
if 'scheme' in location_str.lower() or location_type == 'scheme':
|
2044
|
+
figure_image = extract_scheme_image(pdf_paths, figure_ref)
|
2045
|
+
else:
|
2046
|
+
figure_image = extract_figure_image(pdf_paths, figure_ref)
|
2047
|
+
|
1682
2048
|
if figure_image:
|
1683
|
-
log.info("Successfully extracted
|
1684
|
-
figure_ref, len(figure_image))
|
2049
|
+
log.info("Successfully extracted %s image for %s (%d bytes)",
|
2050
|
+
location_type, figure_ref, len(figure_image))
|
1685
2051
|
figure_images[figure_ref] = figure_image
|
1686
2052
|
|
1687
2053
|
# Save figure image if debug_dir is enabled
|
1688
2054
|
if debug_dir:
|
1689
2055
|
import base64
|
1690
2056
|
debug_path = Path(debug_dir)
|
1691
|
-
image_path = debug_path / f"
|
2057
|
+
image_path = debug_path / f"{location_type}_image_{figure_ref.replace(' ', '_')}.png"
|
1692
2058
|
with open(image_path, 'wb') as f:
|
1693
2059
|
f.write(base64.b64decode(figure_image))
|
1694
|
-
log.info("Saved
|
2060
|
+
log.info("Saved %s image to %s", location_type, image_path)
|
1695
2061
|
else:
|
1696
|
-
log.warning("Failed to extract
|
2062
|
+
log.warning("Failed to extract %s image for %s", location_type, figure_ref)
|
1697
2063
|
|
1698
2064
|
# Extract all substrate scope data in one call
|
1699
2065
|
raw_entries = extract_all_substrate_scope_data(
|
@@ -1734,6 +2100,7 @@ def get_substrate_scope(
|
|
1734
2100
|
pdf_paths=pdf_paths,
|
1735
2101
|
iupac_sections=iupac_sections,
|
1736
2102
|
compound_ids=list(all_compound_ids),
|
2103
|
+
primary_locations=locations,
|
1737
2104
|
debug_dir=debug_dir)
|
1738
2105
|
|
1739
2106
|
# Step 5: Parse all entries with compound mappings
|
@@ -1793,11 +2160,65 @@ def validate_scope_entries(entries: List[ScopeEntry]) -> List[str]:
|
|
1793
2160
|
|
1794
2161
|
return warnings
|
1795
2162
|
|
2163
|
+
def _match_enzymes_with_gemini(
    scope_enzymes: List[str],
    lineage_enzymes: List[str],
    model,
    debug_dir: Optional[Path] = None
) -> Dict[str, str]:
    """Use Gemini to match enzyme names between substrate scope and lineage data.

    Args:
        scope_enzymes: Enzyme names found in the substrate scope entries.
        lineage_enzymes: Enzyme names found in the lineage data.
        model: Model object passed to ``generate_json_with_retry``.
        debug_dir: Optional debug directory passed through to the helper.

    Returns:
        Mapping of substrate-scope enzyme name to the matching lineage enzyme
        name, as returned under the ``enzyme_matches`` key (values may be null
        where the model found no match). Empty dict on failure or when the
        response does not have the expected shape.
    """
    scope_listing = "\n".join(f"- {e}" for e in sorted(set(scope_enzymes)))
    lineage_listing = "\n".join(f"- {e}" for e in sorted(set(lineage_enzymes)))

    # Plain (non-f) string concatenation: braces in the JSON example must be
    # single, otherwise the model is shown literal "{{ ... }}".
    # NOTE(review): the "[L/D]-ApPgb-αEsA-G[number]" hint looks specific to one
    # paper's naming scheme - confirm whether it should stay in a general prompt.
    prompt = """You are an expert at matching enzyme variant names that may have Unicode or formatting differences.

ENZYME NAMES FROM SUBSTRATE SCOPE DATA:
""" + scope_listing + """

ENZYME NAMES FROM LINEAGE DATA:
""" + lineage_listing + """

TASK:
Match each substrate scope enzyme name to its corresponding lineage enzyme name.
These are the SAME enzymes but may have different formatting:
- Unicode vs ASCII characters (e.g., "ʟ" vs "L", "ᴅ" vs "D")
- Different capitalization
- Minor formatting differences

IMPORTANT:
- Only match enzymes that are clearly the same variant
- Look for matching generation numbers (G0, G1, G2, etc.)
- Consider the pattern: [L/D]-ApPgb-αEsA-G[number]
- If no clear match exists, return null for that enzyme

Return as JSON:
{
  "enzyme_matches": {
    "substrate_scope_enzyme_name": "matching_lineage_enzyme_name_or_null"
  }
}
"""

    try:
        response = generate_json_with_retry(
            model,
            prompt,
            debug_dir=debug_dir,
            tag="enzyme_matching"
        )

        matches = response.get("enzyme_matches") if isinstance(response, dict) else None
        if not isinstance(matches, dict):
            # Missing, null, or wrongly-typed payload: treat as "no matches".
            log.warning("Enzyme matching response had no usable 'enzyme_matches' object")
            return {}

        log.info("Gemini matched %d enzyme names", sum(1 for v in matches.values() if v))
        return matches

    except Exception as exc:
        # Best-effort: a failed match leaves entries unmerged rather than crashing.
        log.error("Failed to match enzymes with Gemini: %s", exc)
        return {}
|
2215
|
+
|
1796
2216
|
def merge_with_lineage(
|
1797
2217
|
entries: List[ScopeEntry],
|
1798
|
-
lineage_csv: Optional[Path]
|
2218
|
+
lineage_csv: Optional[Path],
|
2219
|
+
model=None
|
1799
2220
|
) -> List[ScopeEntry]:
|
1800
|
-
"""Merge substrate scope entries with enzyme lineage data."""
|
2221
|
+
"""Merge substrate scope entries with enzyme lineage data using Gemini for matching."""
|
1801
2222
|
if not lineage_csv or not lineage_csv.exists():
|
1802
2223
|
return entries
|
1803
2224
|
|
@@ -1806,32 +2227,60 @@ def merge_with_lineage(
|
|
1806
2227
|
lineage_df = pd.read_csv(lineage_csv)
|
1807
2228
|
log.info("Loading lineage data from %s (%d enzymes)", lineage_csv, len(lineage_df))
|
1808
2229
|
|
1809
|
-
#
|
2230
|
+
# Get unique enzyme names from both sources
|
2231
|
+
scope_enzymes = list(set(entry.enzyme_id for entry in entries if entry.enzyme_id))
|
2232
|
+
lineage_enzymes = list(lineage_df['enzyme_id'].dropna().unique())
|
2233
|
+
|
2234
|
+
log.info("Found %d unique enzymes in substrate scope data", len(scope_enzymes))
|
2235
|
+
log.info("Found %d unique enzymes in lineage data", len(lineage_enzymes))
|
2236
|
+
|
2237
|
+
# Use Gemini to match enzyme names if model is provided
|
2238
|
+
if model and scope_enzymes and lineage_enzymes:
|
2239
|
+
log.info("Using Gemini to match enzyme names between datasets...")
|
2240
|
+
enzyme_matches = _match_enzymes_with_gemini(
|
2241
|
+
scope_enzymes,
|
2242
|
+
lineage_enzymes,
|
2243
|
+
model,
|
2244
|
+
debug_dir=Path("examples/amino_esters_test/substrate_scope_debug_v4") if Path("examples/amino_esters_test/substrate_scope_debug_v4").exists() else None
|
2245
|
+
)
|
2246
|
+
else:
|
2247
|
+
# Fallback to simple case-insensitive matching
|
2248
|
+
log.info("Using simple case-insensitive matching (no model provided)")
|
2249
|
+
enzyme_matches = {}
|
2250
|
+
for scope_enzyme in scope_enzymes:
|
2251
|
+
for lineage_enzyme in lineage_enzymes:
|
2252
|
+
if scope_enzyme.lower() == lineage_enzyme.lower():
|
2253
|
+
enzyme_matches[scope_enzyme] = lineage_enzyme
|
2254
|
+
break
|
2255
|
+
|
2256
|
+
# Create lookup map with matched names
|
1810
2257
|
lineage_map = {}
|
1811
2258
|
for _, row in lineage_df.iterrows():
|
1812
2259
|
enzyme_id = str(row.get('enzyme_id', ''))
|
1813
|
-
lineage_map[enzyme_id
|
1814
|
-
'parent_id': row.get('
|
1815
|
-
'mutations': row.get('mutations'),
|
2260
|
+
lineage_map[enzyme_id] = {
|
2261
|
+
'parent_id': row.get('parent_enzyme_id', ''), # Note: might be 'parent_enzyme_id' not 'parent_id'
|
2262
|
+
'mutations': row.get('mutations', ''),
|
1816
2263
|
'generation': row.get('generation'),
|
1817
|
-
'aa_seq': row.get('aa_seq'),
|
1818
|
-
'dna_seq': row.get('dna_seq'),
|
1819
|
-
'confidence': row.get('confidence')
|
2264
|
+
'aa_seq': row.get('protein_sequence', '') or row.get('aa_seq', ''), # Try both column names
|
2265
|
+
'dna_seq': row.get('dna_seq', ''),
|
2266
|
+
'confidence': row.get('seq_confidence', '') or row.get('confidence', '')
|
1820
2267
|
}
|
1821
2268
|
|
1822
|
-
# Merge
|
2269
|
+
# Merge using matched names
|
1823
2270
|
merged_count = 0
|
1824
2271
|
for entry in entries:
|
1825
|
-
|
1826
|
-
|
1827
|
-
|
1828
|
-
|
1829
|
-
|
1830
|
-
|
1831
|
-
|
1832
|
-
|
1833
|
-
|
1834
|
-
|
2272
|
+
if entry.enzyme_id in enzyme_matches:
|
2273
|
+
matched_name = enzyme_matches[entry.enzyme_id]
|
2274
|
+
if matched_name and matched_name in lineage_map:
|
2275
|
+
data = lineage_map[matched_name]
|
2276
|
+
entry.parent_id = data['parent_id']
|
2277
|
+
entry.mutations = data['mutations']
|
2278
|
+
entry.generation = data['generation']
|
2279
|
+
entry.aa_seq = data['aa_seq']
|
2280
|
+
entry.dna_seq = data['dna_seq']
|
2281
|
+
entry.confidence = data['confidence']
|
2282
|
+
merged_count += 1
|
2283
|
+
log.debug("Merged %s -> %s", entry.enzyme_id, matched_name)
|
1835
2284
|
|
1836
2285
|
log.info("Merged lineage data for %d/%d entries", merged_count, len(entries))
|
1837
2286
|
|
@@ -1957,7 +2406,7 @@ def run_pipeline(
|
|
1957
2406
|
|
1958
2407
|
# 4. Merge with lineage if available ---------------------------------------
|
1959
2408
|
if lineage_csv:
|
1960
|
-
entries = merge_with_lineage(entries, Path(lineage_csv))
|
2409
|
+
entries = merge_with_lineage(entries, Path(lineage_csv), model)
|
1961
2410
|
|
1962
2411
|
# 5. Validate entries ------------------------------------------------------
|
1963
2412
|
warnings = validate_scope_entries(entries)
|