debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/campaign_utils.py +146 -0
- debase/caption_pattern.py +44 -0
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +481 -106
- debase/lineage_format.py +44 -1
- debase/reaction_info_extractor.py +479 -135
- debase/substrate_scope_extractor.py +207 -80
- debase/wrapper.py +3 -3
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
- debase-0.6.2.dist-info/RECORD +18 -0
- debase-0.6.0.dist-info/RECORD +0 -16
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0
@@ -37,7 +37,16 @@ from dataclasses import dataclass, field
|
|
37
37
|
from functools import lru_cache
|
38
38
|
from pathlib import Path
|
39
39
|
from textwrap import dedent
|
40
|
-
from typing import Any, Dict, List, Optional, Tuple
|
40
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
41
|
+
|
42
|
+
# Import universal caption pattern
|
43
|
+
try:
|
44
|
+
from .caption_pattern import get_universal_caption_pattern
|
45
|
+
from .campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
|
46
|
+
except ImportError:
|
47
|
+
# Fallback if running as standalone script
|
48
|
+
from caption_pattern import get_universal_caption_pattern
|
49
|
+
from campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
|
41
50
|
|
42
51
|
import fitz # PyMuPDF - for image extraction
|
43
52
|
import google.generativeai as genai # type: ignore
|
@@ -433,7 +442,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
|
|
433
442
|
You are an expert reader of protein engineering manuscripts.
|
434
443
|
Given the following article captions and section titles, identify most promising locations
|
435
444
|
(tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
|
436
|
-
activity, etc.) for enzyme variants.
|
445
|
+
activity, etc.) for enzyme variants.
|
446
|
+
|
447
|
+
CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
|
448
|
+
- Look for locations showing data for ALL enzyme variants in the evolution lineage
|
449
|
+
- Prioritize sources that show the complete evolutionary progression (parent → child variants)
|
450
|
+
- Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
|
451
|
+
- Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
|
437
452
|
|
438
453
|
IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
|
439
454
|
performance data locations. Pay careful attention to:
|
@@ -441,8 +456,13 @@ performance data locations. Pay careful attention to:
|
|
441
456
|
- Enzyme name prefixes that indicate different campaigns
|
442
457
|
- Different substrate/product types mentioned in captions
|
443
458
|
|
459
|
+
IMPORTANT FIGURE REFERENCE RULES:
|
460
|
+
- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
|
461
|
+
- The extraction system will handle retrieving the entire figure including all sub-panels
|
462
|
+
- For tables, return the complete reference as it appears
|
463
|
+
|
444
464
|
Respond with a JSON array where each element contains:
|
445
|
-
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
|
465
|
+
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
|
446
466
|
- "type": one of "table", "figure"
|
447
467
|
- "confidence": your confidence score (0-100)
|
448
468
|
- "caption": the exact caption text for this location
|
@@ -450,14 +470,29 @@ Respond with a JSON array where each element contains:
|
|
450
470
|
- "lineage_hint": any indication of which enzyme group this data is for (or null)
|
451
471
|
- "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
|
452
472
|
|
453
|
-
|
473
|
+
PRIORITIZATION RULES:
|
474
|
+
- HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
|
475
|
+
- MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
|
476
|
+
- LOWEST PRIORITY: Sources showing data for individual variants only
|
477
|
+
|
478
|
+
Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
|
479
|
+
|
480
|
+
IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
|
481
|
+
|
454
482
|
Do not include too much sources, just return 2 or 3 sources.
|
455
483
|
Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
|
456
484
|
When returning confidence scores, be more accurate and avoid scores that are too close together.
|
485
|
+
|
486
|
+
CRITICAL:
|
487
|
+
- Return "location" EXACTLY as the first reference identifier appears in the actual caption text
|
488
|
+
- Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
|
489
|
+
- Do NOT modify, standardize, or interpret the location - return it verbatim from the document
|
490
|
+
- Include "document" field to specify which PDF contains this location: "manuscript" or "supplementary"
|
491
|
+
|
457
492
|
Respond ONLY with **minified JSON**. NO markdown fences.
|
458
493
|
|
459
|
-
|
460
|
-
[{"location": "
|
494
|
+
Format:
|
495
|
+
[{"location": "", "type": "", "document": "", "confidence": 0, "caption": "", "reason": "", "lineage_hint": "", "campaign_clues": ""}]
|
461
496
|
""")
|
462
497
|
|
463
498
|
PROMPT_EXTRACT_METRICS = dedent("""
|
@@ -484,6 +519,13 @@ IMPORTANT:
|
|
484
519
|
- If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
|
485
520
|
- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
|
486
521
|
|
522
|
+
CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
|
523
|
+
- Yield (%) measures how much product was formed (0-100%)
|
524
|
+
- Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
|
525
|
+
- TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
|
526
|
+
- These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
|
527
|
+
- Be extremely careful when extracting from tables/figures with multiple columns or data series
|
528
|
+
|
487
529
|
Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
|
488
530
|
""")
|
489
531
|
|
@@ -511,6 +553,17 @@ STEP 4: Extract values for each matched variant
|
|
511
553
|
- CRITICAL: Read actual scale values from the axis labels and tick marks
|
512
554
|
- Verify: taller bars should have higher values, higher dots should have higher values
|
513
555
|
|
556
|
+
CRITICAL DATA ACCURACY REQUIREMENTS:
|
557
|
+
- DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
|
558
|
+
- Yield is typically shown as percentage (0-100%)
|
559
|
+
- Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
|
560
|
+
- TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
|
561
|
+
- Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
|
562
|
+
- Carefully match each bar/dot to its corresponding enzyme label on the X-axis
|
563
|
+
- If looking at grouped bars, ensure you're reading the correct bar for each metric
|
564
|
+
- Double-check that variant A's yield is not confused with variant B's yield
|
565
|
+
- If values are unclear or ambiguous, return null rather than guessing
|
566
|
+
|
514
567
|
Target enzymes to find and extract:
|
515
568
|
{enzyme_names}
|
516
569
|
|
@@ -734,7 +787,8 @@ class ReactionExtractor:
|
|
734
787
|
_TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
735
788
|
|
736
789
|
def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
|
737
|
-
campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None
|
790
|
+
campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None,
|
791
|
+
campaign_info: Optional[Dict[str, Any]] = None):
|
738
792
|
self.manuscript = manuscript
|
739
793
|
self.si = si
|
740
794
|
self.cfg = cfg
|
@@ -742,6 +796,7 @@ class ReactionExtractor:
|
|
742
796
|
self.debug_dir = debug_dir
|
743
797
|
self.campaign_filter = campaign_filter # Filter for specific campaign
|
744
798
|
self.all_campaigns = all_campaigns or [] # List of all campaigns for context
|
799
|
+
self.campaign_info = campaign_info # Detailed campaign information from campaigns.json
|
745
800
|
|
746
801
|
# Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
|
747
802
|
self._figure_cache = LRUCache(maxsize=100) # Figures are large, so smaller cache
|
@@ -778,10 +833,8 @@ class ReactionExtractor:
|
|
778
833
|
# ------------------------------------------------------------------
|
779
834
|
|
780
835
|
def _collect_captions_and_titles(self) -> str:
|
781
|
-
#
|
782
|
-
|
783
|
-
# Also handles cases where there's whitespace or page numbers before the caption
|
784
|
-
cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
|
836
|
+
# Use universal caption pattern that handles all common formats
|
837
|
+
cap_pattern = get_universal_caption_pattern()
|
785
838
|
captions: List[str] = []
|
786
839
|
|
787
840
|
# Process each page individually to avoid TOC entries
|
@@ -861,7 +914,34 @@ class ReactionExtractor:
|
|
861
914
|
|
862
915
|
# Add campaign context - always provide context to help model understanding
|
863
916
|
campaign_context = ""
|
864
|
-
|
917
|
+
|
918
|
+
# If we have detailed campaign info, use it to provide specific guidance
|
919
|
+
if self.campaign_info:
|
920
|
+
location_hints = get_location_hints_for_campaign(self.campaign_info)
|
921
|
+
campaign_context = f"""
|
922
|
+
IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
|
923
|
+
|
924
|
+
CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
|
925
|
+
- Campaign ID: {self.campaign_info.get('campaign_id', '')}
|
926
|
+
- Name: {self.campaign_info.get('campaign_name', '')}
|
927
|
+
- Description: {self.campaign_info.get('description', '')}
|
928
|
+
- Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
|
929
|
+
- Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
|
930
|
+
- Notes: {self.campaign_info.get('notes', '')}
|
931
|
+
|
932
|
+
KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
|
933
|
+
These locations are known to contain relevant data - prioritize them highly.
|
934
|
+
|
935
|
+
CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
|
936
|
+
- Look for data showing the entire evolutionary progression of enzyme variants
|
937
|
+
- Prioritize locations that show performance data for ALL variants in the lineage
|
938
|
+
- The campaign description and notes above provide context about the evolution strategy used
|
939
|
+
|
940
|
+
{f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
|
941
|
+
|
942
|
+
CRITICAL: Only return locations that contain data for this specific campaign.
|
943
|
+
"""
|
944
|
+
elif self.campaign_filter:
|
865
945
|
campaigns_warning = ""
|
866
946
|
if self.all_campaigns:
|
867
947
|
campaigns_warning = f"""
|
@@ -1103,11 +1183,114 @@ class ReactionExtractor:
|
|
1103
1183
|
|
1104
1184
|
return toc_score >= 2
|
1105
1185
|
|
1186
|
+
def _build_caption_index(self) -> Dict[str, Dict[str, Any]]:
|
1187
|
+
"""Build an index of all captions for quick lookup."""
|
1188
|
+
if hasattr(self, '_caption_index'):
|
1189
|
+
return self._caption_index
|
1190
|
+
|
1191
|
+
cap_pattern = get_universal_caption_pattern()
|
1192
|
+
caption_index = {}
|
1193
|
+
|
1194
|
+
for idx, page in enumerate(self.all_pages):
|
1195
|
+
source = "manuscript" if idx < len(self.ms_pages) else "supplementary"
|
1196
|
+
page_num = idx + 1 if idx < len(self.ms_pages) else idx - len(self.ms_pages) + 1
|
1197
|
+
|
1198
|
+
for match in cap_pattern.finditer(page):
|
1199
|
+
caption_text = match.group(0).strip()
|
1200
|
+
# Extract a normalized key (e.g., "table 5", "figure 3")
|
1201
|
+
caption_lower = caption_text.lower()
|
1202
|
+
|
1203
|
+
# Store multiple access patterns for the same caption
|
1204
|
+
caption_info = {
|
1205
|
+
'full_caption': caption_text,
|
1206
|
+
'page_content': page,
|
1207
|
+
'page_idx': idx,
|
1208
|
+
'source': source,
|
1209
|
+
'page_num': page_num,
|
1210
|
+
'match_start': match.start()
|
1211
|
+
}
|
1212
|
+
|
1213
|
+
# Create multiple keys for flexible matching
|
1214
|
+
# Key 1: Full caption text (first 100 chars)
|
1215
|
+
key1 = caption_text[:100].lower().strip()
|
1216
|
+
caption_index[key1] = caption_info
|
1217
|
+
|
1218
|
+
# Key 2: Simplified reference (e.g., "table 5", "figure s3")
|
1219
|
+
ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
|
1220
|
+
if ref_match:
|
1221
|
+
key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
|
1222
|
+
caption_index[key2] = caption_info
|
1223
|
+
|
1224
|
+
# Also store with 's' prefix if in SI
|
1225
|
+
if source == "supplementary" and 's' not in key2:
|
1226
|
+
key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
|
1227
|
+
caption_index[key3] = caption_info
|
1228
|
+
|
1229
|
+
self._caption_index = caption_index
|
1230
|
+
return caption_index
|
1231
|
+
|
1106
1232
|
def _page_with_reference(self, ref_id: str) -> Optional[str]:
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1233
|
+
"""Find page(s) containing a reference using flexible matching."""
|
1234
|
+
caption_index = self._build_caption_index()
|
1235
|
+
ref_lower = ref_id.lower().strip()
|
1236
|
+
|
1237
|
+
# Try multiple matching strategies
|
1238
|
+
matches = []
|
1239
|
+
|
1240
|
+
# Strategy 1: Direct key lookup
|
1241
|
+
if ref_lower in caption_index:
|
1242
|
+
matches.append(caption_index[ref_lower])
|
1243
|
+
|
1244
|
+
# Strategy 2: Normalized reference lookup (e.g., "table 5", "figure s3")
|
1245
|
+
ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
|
1246
|
+
if ref_match:
|
1247
|
+
ref_type, ref_num = ref_match.groups()
|
1248
|
+
if ref_type == 'fig':
|
1249
|
+
ref_type = 'figure'
|
1250
|
+
|
1251
|
+
# Try different key formats
|
1252
|
+
keys_to_try = [
|
1253
|
+
f"{ref_type} {ref_num}",
|
1254
|
+
f"{ref_type} s{ref_num}",
|
1255
|
+
f"table {ref_num}", # Sometimes figures are mislabeled
|
1256
|
+
f"fig {ref_num}",
|
1257
|
+
f"figure {ref_num}"
|
1258
|
+
]
|
1259
|
+
|
1260
|
+
for key in keys_to_try:
|
1261
|
+
if key in caption_index and caption_index[key] not in matches:
|
1262
|
+
matches.append(caption_index[key])
|
1263
|
+
|
1264
|
+
# Strategy 3: Fuzzy matching on caption text
|
1265
|
+
if not matches:
|
1266
|
+
# Look for any caption containing the reference number
|
1267
|
+
for key, info in caption_index.items():
|
1268
|
+
if ref_match and ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
|
1269
|
+
if info not in matches:
|
1270
|
+
matches.append(info)
|
1271
|
+
|
1272
|
+
# Return results
|
1273
|
+
if not matches:
|
1274
|
+
LOGGER.warning(f"No matches found for reference '{ref_id}'")
|
1275
|
+
# Last resort: simple text search
|
1276
|
+
for page in self.all_pages:
|
1277
|
+
if ref_lower in page.lower():
|
1278
|
+
return page
|
1279
|
+
return None
|
1280
|
+
|
1281
|
+
# If single match, return it
|
1282
|
+
if len(matches) == 1:
|
1283
|
+
return matches[0]['page_content']
|
1284
|
+
|
1285
|
+
# Multiple matches: combine them with source annotations
|
1286
|
+
LOGGER.info(f"Found {len(matches)} potential matches for '{ref_id}'")
|
1287
|
+
combined_pages = []
|
1288
|
+
for match in matches:
|
1289
|
+
header = f"\n\n=== {match['source'].upper()} PAGE {match['page_num']} ===\n"
|
1290
|
+
header += f"Caption: {match['full_caption'][:200]}...\n"
|
1291
|
+
combined_pages.append(header + match['page_content'])
|
1292
|
+
|
1293
|
+
return "\n".join(combined_pages)
|
1111
1294
|
|
1112
1295
|
# ---- Table text helper - now returns full page ----
|
1113
1296
|
def _extract_table_context(self, ref: str) -> str:
|
@@ -1140,19 +1323,29 @@ class ReactionExtractor:
|
|
1140
1323
|
return pix
|
1141
1324
|
|
1142
1325
|
# ---- NEW: Page image helper for both figures and tables ----
|
1143
|
-
def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
|
1326
|
+
def _extract_page_png(self, ref: str, extract_figure_only: bool = True, caption_hint: str = "", document_hint: str = "") -> Optional[str]:
|
1144
1327
|
"""Export the page containing the reference as PNG.
|
1145
1328
|
If extract_figure_only=True, extracts just the figure above the caption.
|
1146
1329
|
If False, extracts the entire page (useful for tables).
|
1147
|
-
Returns a base64-encoded PNG or None.
|
1148
|
-
LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
|
1330
|
+
Returns a base64-encoded PNG or None.
|
1149
1331
|
|
1150
|
-
|
1151
|
-
|
1332
|
+
Args:
|
1333
|
+
ref: The reference string (e.g., "Fig. 3")
|
1334
|
+
extract_figure_only: Whether to extract just the figure or the entire page
|
1335
|
+
caption_hint: Optional caption text from location data to help find the exact figure
|
1336
|
+
document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
|
1337
|
+
"""
|
1338
|
+
LOGGER.info("_extract_page_png called with ref='%s', extract_figure_only=%s, caption_hint='%s', document_hint='%s'",
|
1339
|
+
ref, extract_figure_only, caption_hint[:50] + "..." if caption_hint else "EMPTY", document_hint)
|
1340
|
+
|
1341
|
+
# Check cache first - include document hint in key to avoid cross-document contamination
|
1342
|
+
cache_key = f"{ref}_{extract_figure_only}_{document_hint}" if document_hint else f"{ref}_{extract_figure_only}"
|
1152
1343
|
cached_result = self._figure_cache.get(cache_key)
|
1153
1344
|
if cached_result is not None:
|
1154
|
-
LOGGER.
|
1345
|
+
LOGGER.info("Using cached figure for %s (cache key: %s)", ref, cache_key)
|
1155
1346
|
return cached_result
|
1347
|
+
else:
|
1348
|
+
LOGGER.info("Cache miss for %s (cache key: %s)", ref, cache_key)
|
1156
1349
|
|
1157
1350
|
# For table extraction, use multi-page approach
|
1158
1351
|
if not extract_figure_only:
|
@@ -1162,13 +1355,28 @@ class ReactionExtractor:
|
|
1162
1355
|
return self._extract_multiple_pages_png(pages_with_ref, ref)
|
1163
1356
|
return None
|
1164
1357
|
|
1165
|
-
# For figure extraction,
|
1166
|
-
|
1167
|
-
|
1358
|
+
# For figure extraction, prioritize based on document hint
|
1359
|
+
if document_hint == "manuscript" and self.ms_doc:
|
1360
|
+
# Search manuscript first, then SI as fallback
|
1361
|
+
docs = list(filter(None, [self.ms_doc, self.si_doc]))
|
1362
|
+
LOGGER.info("Prioritizing manuscript document for '%s' (hint: %s)", ref, document_hint)
|
1363
|
+
LOGGER.info("Search order: 1) Manuscript, 2) SI (fallback)")
|
1364
|
+
elif document_hint == "supplementary" and self.si_doc:
|
1365
|
+
# Search SI first, then manuscript as fallback
|
1366
|
+
docs = list(filter(None, [self.si_doc, self.ms_doc]))
|
1367
|
+
LOGGER.info("Prioritizing supplementary document for '%s' (hint: %s)", ref, document_hint)
|
1368
|
+
LOGGER.info("Search order: 1) SI, 2) Manuscript (fallback)")
|
1369
|
+
else:
|
1370
|
+
# Default behavior - search both in order
|
1371
|
+
docs = list(filter(None, [self.ms_doc, self.si_doc]))
|
1372
|
+
LOGGER.info("Searching for '%s' in %d documents (no document hint)", ref, len(docs))
|
1373
|
+
LOGGER.info("Search order: 1) Manuscript, 2) SI (default order)")
|
1168
1374
|
|
1169
1375
|
for doc_idx, doc in enumerate(docs):
|
1170
|
-
|
1171
|
-
|
1376
|
+
# Determine document name based on actual document, not position
|
1377
|
+
doc_name = "MS" if doc == self.ms_doc else "SI"
|
1378
|
+
LOGGER.info("Searching document %d/%d: %s (has %d pages)",
|
1379
|
+
doc_idx + 1, len(docs), doc_name, doc.page_count)
|
1172
1380
|
|
1173
1381
|
for page_number in range(doc.page_count):
|
1174
1382
|
page = doc.load_page(page_number)
|
@@ -1181,9 +1389,90 @@ class ReactionExtractor:
|
|
1181
1389
|
LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
|
1182
1390
|
continue
|
1183
1391
|
|
1392
|
+
# If we have a caption hint, try to find it using fuzzy matching
|
1393
|
+
if caption_hint:
|
1394
|
+
LOGGER.info("=== CAPTION HINT SEARCH ===")
|
1395
|
+
LOGGER.info("Caption hint provided: %s", caption_hint[:100])
|
1396
|
+
LOGGER.info("Searching in %s document, page %d", doc_name, page_number + 1)
|
1397
|
+
LOGGER.info("Page text length: %d chars", len(page_text))
|
1398
|
+
|
1399
|
+
# Check if caption exists in raw form
|
1400
|
+
if caption_hint[:50] in page_text:
|
1401
|
+
LOGGER.info("✓ Caption hint found in raw page text!")
|
1402
|
+
else:
|
1403
|
+
LOGGER.info("✗ Caption hint NOT found in raw page text")
|
1404
|
+
|
1405
|
+
# Normalize texts for better matching
|
1406
|
+
def normalize_for_matching(text):
|
1407
|
+
# Remove extra whitespace, normalize spaces around punctuation
|
1408
|
+
text = ' '.join(text.split())
|
1409
|
+
# Normalize different dash types
|
1410
|
+
text = text.replace('–', '-').replace('—', '-')
|
1411
|
+
return text
|
1412
|
+
|
1413
|
+
normalized_hint = normalize_for_matching(caption_hint[:100]) # Use first 100 chars
|
1414
|
+
normalized_page = normalize_for_matching(page_text)
|
1415
|
+
|
1416
|
+
# Try to find the caption using fuzzy matching
|
1417
|
+
best_match_pos = -1
|
1418
|
+
best_match_score = 0
|
1419
|
+
match_found = False
|
1420
|
+
|
1421
|
+
# Slide through the page text looking for best match
|
1422
|
+
hint_len = len(normalized_hint)
|
1423
|
+
for i in range(len(normalized_page) - hint_len + 1):
|
1424
|
+
snippet = normalized_page[i:i + hint_len]
|
1425
|
+
# Simple character-based similarity
|
1426
|
+
matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
|
1427
|
+
score = matches / hint_len
|
1428
|
+
|
1429
|
+
if score > best_match_score and score > 0.8: # 80% similarity threshold
|
1430
|
+
best_match_score = score
|
1431
|
+
best_match_pos = i
|
1432
|
+
match_found = True
|
1433
|
+
|
1434
|
+
if match_found and best_match_pos >= 0:
|
1435
|
+
LOGGER.info("Found caption match in %s document on page %d with %.1f%% similarity",
|
1436
|
+
doc_name, page_number + 1, best_match_score * 100)
|
1437
|
+
|
1438
|
+
# Instead of complex position mapping, just search for the beginning of the caption
|
1439
|
+
# Use the first 30 chars which should be unique enough
|
1440
|
+
search_text = caption_hint[:30].strip()
|
1441
|
+
LOGGER.info("Searching for caption text: '%s'", search_text)
|
1442
|
+
caption_instances = page.search_for(search_text)
|
1443
|
+
LOGGER.info("Found %d caption instances", len(caption_instances) if caption_instances else 0)
|
1444
|
+
|
1445
|
+
if caption_instances:
|
1446
|
+
cap_rect = caption_instances[0]
|
1447
|
+
caption_found = True
|
1448
|
+
# Extract figure above this caption
|
1449
|
+
if extract_figure_only:
|
1450
|
+
LOGGER.info("Extracting figure area including caption for %s from %s document", ref, doc_name)
|
1451
|
+
LOGGER.info("Caption found at rect: %s on page %d", cap_rect, page_number + 1)
|
1452
|
+
page_rect = page.rect
|
1453
|
+
|
1454
|
+
# Include the caption in the extraction
|
1455
|
+
# Add some padding below the caption to ensure we get the full text
|
1456
|
+
caption_padding = 30 # pixels below caption
|
1457
|
+
figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
|
1458
|
+
LOGGER.info("Page rect: %s, Figure rect including caption: %s", page_rect, figure_rect)
|
1459
|
+
mat = fitz.Matrix(5.0, 5.0)
|
1460
|
+
pix = page.get_pixmap(matrix=mat, clip=figure_rect)
|
1461
|
+
pix = self._ensure_rgb_pixmap(pix)
|
1462
|
+
img_bytes = pix.tobytes("png")
|
1463
|
+
img_b64 = b64encode(img_bytes).decode('utf-8')
|
1464
|
+
self._figure_cache.put(cache_key, img_b64)
|
1465
|
+
LOGGER.info("Successfully extracted figure using caption hint for %s from %s document, page %d",
|
1466
|
+
ref, doc_name, page_number + 1)
|
1467
|
+
return img_b64
|
1468
|
+
else:
|
1469
|
+
LOGGER.info("No fuzzy match found for caption hint on page %d (best score: %.1f%%)",
|
1470
|
+
page_number + 1, best_match_score * 100)
|
1471
|
+
|
1472
|
+
# If caption hint didn't work or wasn't provided, fall back to pattern matching
|
1184
1473
|
# Look for figure caption pattern more flexibly
|
1185
1474
|
# Normalize the reference to handle variations
|
1186
|
-
figure_num = ref.replace('Figure', '').replace('figure', '').strip()
|
1475
|
+
figure_num = ref.replace('Figure', '').replace('figure', '').replace('Fig.', '').replace('Fig', '').strip()
|
1187
1476
|
|
1188
1477
|
# Extract main figure number from subfigure (e.g., "1C" -> "1")
|
1189
1478
|
main_figure_num = re.match(r'^(\d+)', figure_num)
|
@@ -1193,10 +1482,10 @@ class ReactionExtractor:
|
|
1193
1482
|
main_figure_num = figure_num
|
1194
1483
|
|
1195
1484
|
# Create a flexible pattern that handles various spacing and formatting
|
1196
|
-
# This pattern looks for "Figure" (case insensitive) followed by optional spaces
|
1197
|
-
# then the figure number, then any of: period, colon, space+capital letter, or end of line
|
1485
|
+
# This pattern looks for "Figure" or "Fig" (case insensitive) followed by optional spaces
|
1486
|
+
# then the figure number, then any of: period, colon, pipe, space+capital letter, or end of line
|
1198
1487
|
# Also match at the beginning of a line to catch captions
|
1199
|
-
flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(
|
1488
|
+
flexible_pattern = rf"(?i)(?:^|\n)\s*(?:figure|fig\.?)\s*{re.escape(main_figure_num)}(?:\.|:|\||\s+\||(?=\s+[A-Z])|\s*$)"
|
1200
1489
|
|
1201
1490
|
LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
|
1202
1491
|
main_figure_num, flexible_pattern)
|
@@ -1252,8 +1541,8 @@ class ReactionExtractor:
|
|
1252
1541
|
if text_instances:
|
1253
1542
|
cap_rect = text_instances[0]
|
1254
1543
|
caption_found = True
|
1255
|
-
LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
|
1256
|
-
ref, caption_text, remaining_text[:50])
|
1544
|
+
LOGGER.info("Found actual caption for %s in %s document on page %d: '%s' with following text: '%s...'",
|
1545
|
+
ref, doc_name, page_number + 1, caption_text, remaining_text[:50])
|
1257
1546
|
break
|
1258
1547
|
|
1259
1548
|
if not caption_found:
|
@@ -1274,18 +1563,18 @@ class ReactionExtractor:
|
|
1274
1563
|
continue
|
1275
1564
|
|
1276
1565
|
if extract_figure_only:
|
1277
|
-
# Extract
|
1278
|
-
|
1279
|
-
LOGGER.info("Extracting figure area above caption for %s", ref)
|
1566
|
+
# Extract the figure area including the caption
|
1567
|
+
LOGGER.info("Extracting figure area including caption for %s", ref)
|
1280
1568
|
|
1281
1569
|
# Get the page dimensions
|
1282
1570
|
page_rect = page.rect
|
1283
1571
|
|
1284
|
-
# Extract the area
|
1572
|
+
# Extract the area including the caption
|
1285
1573
|
if cap_rect:
|
1286
|
-
# Extract from top of page to
|
1287
|
-
|
1288
|
-
|
1574
|
+
# Extract from top of page to bottom of caption plus padding
|
1575
|
+
caption_padding = 30 # pixels below caption
|
1576
|
+
figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
|
1577
|
+
LOGGER.debug("Extracting figure area with caption: %s (caption ends at y=%f)", figure_rect, cap_rect.y1)
|
1289
1578
|
else:
|
1290
1579
|
# If no caption found, use top 80% of page
|
1291
1580
|
figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
|
@@ -1419,7 +1708,7 @@ class ReactionExtractor:
|
|
1419
1708
|
# Sort pages by document and page number
|
1420
1709
|
pages.sort(key=lambda x: (id(x[0]), x[1]))
|
1421
1710
|
|
1422
|
-
# Extract the range of pages including one page after
|
1711
|
+
# Extract the range of pages including one page after for tables
|
1423
1712
|
all_images = []
|
1424
1713
|
for i, (doc, page_num) in enumerate(pages):
|
1425
1714
|
# Add the current page
|
@@ -1428,7 +1717,16 @@ class ReactionExtractor:
|
|
1428
1717
|
pix = self._ensure_rgb_pixmap(pix)
|
1429
1718
|
all_images.append(pix)
|
1430
1719
|
|
1431
|
-
#
|
1720
|
+
# Add the next page as well for tables (in case data continues)
|
1721
|
+
next_page_num = page_num + 1
|
1722
|
+
if next_page_num < doc.page_count:
|
1723
|
+
try:
|
1724
|
+
next_pix = doc.load_page(next_page_num).get_pixmap(matrix=mat)
|
1725
|
+
next_pix = self._ensure_rgb_pixmap(next_pix)
|
1726
|
+
all_images.append(next_pix)
|
1727
|
+
LOGGER.info("Including next page (%d) for table %s", next_page_num + 1, ref)
|
1728
|
+
except Exception as e:
|
1729
|
+
LOGGER.warning("Failed to extract next page %d for %s: %s", next_page_num + 1, ref, e)
|
1432
1730
|
|
1433
1731
|
if not all_images:
|
1434
1732
|
return None
|
@@ -1533,21 +1831,9 @@ class ReactionExtractor:
|
|
1533
1831
|
|
1534
1832
|
def _validate_location_exists(self, ref: str) -> bool:
|
1535
1833
|
"""Verify that the referenced location actually exists in the document."""
|
1536
|
-
#
|
1537
|
-
|
1538
|
-
|
1539
|
-
docs_to_check.append(self.si_doc)
|
1540
|
-
|
1541
|
-
for doc in docs_to_check:
|
1542
|
-
for page_num in range(len(doc)):
|
1543
|
-
page = doc[page_num]
|
1544
|
-
text = page.get_text()
|
1545
|
-
|
1546
|
-
# Look for table references like "Table 1", "Table S1", etc.
|
1547
|
-
if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
|
1548
|
-
return True
|
1549
|
-
|
1550
|
-
return False
|
1834
|
+
# Use the caption index to check if location exists
|
1835
|
+
result = self._page_with_reference(ref)
|
1836
|
+
return result is not None
|
1551
1837
|
|
1552
1838
|
def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
|
1553
1839
|
"""Validate that the context contains meaningful content for extraction."""
|
@@ -1596,15 +1882,31 @@ class ReactionExtractor:
|
|
1596
1882
|
LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
|
1597
1883
|
return True
|
1598
1884
|
|
1599
|
-
def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
|
1600
|
-
"""Extract performance metrics for multiple enzymes from the identified location in batch.
|
1601
|
-
|
1602
|
-
|
1885
|
+
def extract_metrics_batch(self, enzyme_list: List[str], ref: Union[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
|
1886
|
+
"""Extract performance metrics for multiple enzymes from the identified location in batch.
|
1887
|
+
|
1888
|
+
Args:
|
1889
|
+
enzyme_list: List of enzyme names to extract metrics for
|
1890
|
+
ref: Either a string reference (e.g., "Fig. 3") or a location dict with 'location' and optionally 'caption'
|
1891
|
+
"""
|
1892
|
+
# Handle both string and dict inputs
|
1893
|
+
if isinstance(ref, dict):
|
1894
|
+
location_str = ref['location']
|
1895
|
+
caption_hint = ref.get('caption', '')
|
1896
|
+
document_hint = ref.get('document', '')
|
1897
|
+
LOGGER.info("extract_metrics_batch called with location='%s' (with caption hint, document=%s) for %d enzymes",
|
1898
|
+
location_str, document_hint, len(enzyme_list))
|
1899
|
+
else:
|
1900
|
+
location_str = ref
|
1901
|
+
caption_hint = ''
|
1902
|
+
document_hint = ''
|
1903
|
+
LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", location_str, len(enzyme_list))
|
1904
|
+
ref_lc = location_str.lower()
|
1603
1905
|
image_b64: Optional[str] = None
|
1604
1906
|
|
1605
1907
|
# First, validate that the location actually exists in the document
|
1606
|
-
if not self._validate_location_exists(
|
1607
|
-
LOGGER.warning("Location %s not found in document - skipping",
|
1908
|
+
if not self._validate_location_exists(location_str):
|
1909
|
+
LOGGER.warning("Location %s not found in document - skipping", location_str)
|
1608
1910
|
return []
|
1609
1911
|
|
1610
1912
|
# Add campaign context if available
|
@@ -1614,30 +1916,30 @@ class ReactionExtractor:
|
|
1614
1916
|
|
1615
1917
|
if self._TAB_RE.search(ref_lc):
|
1616
1918
|
# For tables, try to extract the page as an image first
|
1617
|
-
image_b64 = self._extract_page_png(
|
1919
|
+
image_b64 = self._extract_page_png(location_str, extract_figure_only=False, document_hint=document_hint)
|
1618
1920
|
if not image_b64:
|
1619
|
-
LOGGER.debug("No page image found for %s - using full page text",
|
1620
|
-
snippet = self._extract_table_context(
|
1921
|
+
LOGGER.debug("No page image found for %s - using full page text", location_str)
|
1922
|
+
snippet = self._extract_table_context(location_str)
|
1621
1923
|
elif self._FIG_RE.search(ref_lc):
|
1622
1924
|
# For figures, extract just the figure image (same logic as compound mapping)
|
1623
|
-
LOGGER.
|
1624
|
-
image_b64 = self._extract_page_png(
|
1925
|
+
LOGGER.info("Attempting to extract figure image for '%s'", location_str)
|
1926
|
+
image_b64 = self._extract_page_png(location_str, extract_figure_only=True, caption_hint=caption_hint, document_hint=document_hint)
|
1625
1927
|
if not image_b64:
|
1626
|
-
LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text",
|
1627
|
-
snippet = self._extract_figure_caption(
|
1928
|
+
LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", location_str)
|
1929
|
+
snippet = self._extract_figure_caption(location_str)
|
1628
1930
|
LOGGER.debug("Caption extraction result: %s",
|
1629
1931
|
f"'{snippet[:100]}...'" if snippet else "empty")
|
1630
1932
|
else:
|
1631
|
-
LOGGER.info("Successfully extracted figure image for '%s'",
|
1933
|
+
LOGGER.info("Successfully extracted figure image for '%s'", location_str)
|
1632
1934
|
# If figure is found, ignore text information - use image only
|
1633
1935
|
snippet = ""
|
1634
1936
|
else:
|
1635
|
-
snippet = self._page_with_reference(
|
1937
|
+
snippet = self._page_with_reference(location_str) or ""
|
1636
1938
|
|
1637
1939
|
# For figures with images, skip text validation and proceed with image extraction
|
1638
1940
|
if image_b64 and self._FIG_RE.search(ref_lc):
|
1639
|
-
LOGGER.info("Using figure image for %s - ignoring text context",
|
1640
|
-
elif not image_b64 and not self._validate_context(snippet, enzyme_list,
|
1941
|
+
LOGGER.info("Using figure image for %s - ignoring text context", location_str)
|
1942
|
+
elif not image_b64 and not self._validate_context(snippet, enzyme_list, location_str):
|
1641
1943
|
return []
|
1642
1944
|
|
1643
1945
|
# Create enhanced enzyme descriptions with parent/mutation context
|
@@ -1934,6 +2236,19 @@ Different campaigns may use different model reactions.
|
|
1934
2236
|
prompt = PROMPT_COMPOUND_MAPPING
|
1935
2237
|
if campaign_filter:
|
1936
2238
|
prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
|
2239
|
+
|
2240
|
+
# Add campaign info as hints
|
2241
|
+
if self.campaign_info:
|
2242
|
+
substrate_id = self.campaign_info.get('substrate_id', '')
|
2243
|
+
product_id = self.campaign_info.get('product_id', '')
|
2244
|
+
model_substrate = self.campaign_info.get('model_substrate', '')
|
2245
|
+
model_product = self.campaign_info.get('model_product', '')
|
2246
|
+
|
2247
|
+
if substrate_id and model_substrate:
|
2248
|
+
prompt += f"\n\nHINT: The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})"
|
2249
|
+
if product_id and model_product:
|
2250
|
+
prompt += f"\nHINT: The model product for this campaign is likely '{model_product}' (ID: {product_id})"
|
2251
|
+
|
1937
2252
|
if compound_ids:
|
1938
2253
|
prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
|
1939
2254
|
prompt += "\n\nTEXT:\n" + extraction_text
|
@@ -2011,6 +2326,22 @@ IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campa
|
|
2011
2326
|
{campaigns_warning}
|
2012
2327
|
Different campaigns may use different numbering systems for compounds.
|
2013
2328
|
Do NOT include compound information from other campaigns."""
|
2329
|
+
|
2330
|
+
# Add campaign info as hints
|
2331
|
+
if self.campaign_info:
|
2332
|
+
substrate_id = self.campaign_info.get('substrate_id', '')
|
2333
|
+
product_id = self.campaign_info.get('product_id', '')
|
2334
|
+
model_substrate = self.campaign_info.get('model_substrate', '')
|
2335
|
+
model_product = self.campaign_info.get('model_product', '')
|
2336
|
+
|
2337
|
+
hints = []
|
2338
|
+
if substrate_id and model_substrate:
|
2339
|
+
hints.append(f"The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})")
|
2340
|
+
if product_id and model_product:
|
2341
|
+
hints.append(f"The model product for this campaign is likely '{model_product}' (ID: {product_id})")
|
2342
|
+
|
2343
|
+
if hints:
|
2344
|
+
prompt += "\n\nHINTS FROM CAMPAIGN INFO:\n" + "\n".join(hints)
|
2014
2345
|
|
2015
2346
|
prompt += """
|
2016
2347
|
|
@@ -2268,41 +2599,17 @@ Do NOT include compound information from other campaigns.
|
|
2268
2599
|
if not mapping or not mapping.iupac_name:
|
2269
2600
|
missing_compounds.append(cid)
|
2270
2601
|
|
2271
|
-
# Tier 2 (skip directly to full search): Full manuscript + SI search
|
2602
|
+
# Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
|
2272
2603
|
if missing_compounds:
|
2273
|
-
LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
|
2604
|
+
LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
|
2274
2605
|
len(missing_compounds), sorted(missing_compounds))
|
2275
2606
|
|
2276
|
-
# Get all available figures for compound structure analysis
|
2277
|
-
figure_images = {}
|
2278
|
-
|
2279
|
-
# Extract main manuscript figures
|
2280
|
-
figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
|
2281
|
-
for ref in figure_refs:
|
2282
|
-
img_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
2283
|
-
if img_b64:
|
2284
|
-
figure_images[ref] = img_b64
|
2285
|
-
LOGGER.info("Retrieved %s for compound mapping", ref)
|
2286
|
-
|
2287
|
-
# Get SI figures
|
2288
|
-
si_figure_refs = []
|
2289
|
-
for page in self.si_pages[:10]: # Check first 10 SI pages
|
2290
|
-
matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
|
2291
|
-
si_figure_refs.extend(matches[:10]) # Limit to 10 figures
|
2292
|
-
|
2293
|
-
# Extract SI figures
|
2294
|
-
for ref in set(si_figure_refs):
|
2295
|
-
if ref not in figure_images:
|
2296
|
-
img_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
2297
|
-
if img_b64:
|
2298
|
-
figure_images[ref] = img_b64
|
2299
|
-
LOGGER.info("Extracted %s for compound mapping", ref)
|
2300
|
-
|
2301
2607
|
# Full text search including ALL pages (manuscript + SI)
|
2302
2608
|
full_text = "\n\n".join(self.all_pages) # Send everything
|
2303
2609
|
|
2304
|
-
|
2305
|
-
|
2610
|
+
# Use text-only extraction for Tier 2 (no images)
|
2611
|
+
final_mappings = self._extract_compound_mappings_from_text(
|
2612
|
+
full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
|
2306
2613
|
)
|
2307
2614
|
|
2308
2615
|
# Merge final mappings with better compound ID matching
|
@@ -2476,6 +2783,7 @@ Do NOT include compound information from other campaigns.
|
|
2476
2783
|
compound_mappings = {}
|
2477
2784
|
if compound_ids:
|
2478
2785
|
LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
|
2786
|
+
|
2479
2787
|
# Pass the IUPAC location hint if we have it
|
2480
2788
|
iupac_hint = locations.get("iupac_location") if locations else None
|
2481
2789
|
compound_mappings = self._extract_compound_mappings_adaptive(
|
@@ -2506,6 +2814,22 @@ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely differ
|
|
2506
2814
|
- Different campaigns may use similar enzyme names but different substrates
|
2507
2815
|
- Be extremely careful to only extract data for the {self.campaign_filter} campaign
|
2508
2816
|
- Ignore data from other campaigns even if they seem similar
|
2817
|
+
"""
|
2818
|
+
|
2819
|
+
# Add specific campaign info if available
|
2820
|
+
campaign_info_context = ""
|
2821
|
+
if self.campaign_info:
|
2822
|
+
campaign_info_context = f"""
|
2823
|
+
|
2824
|
+
KNOWN CAMPAIGN INFORMATION:
|
2825
|
+
- Campaign: {self.campaign_info.get('campaign_name', '')}
|
2826
|
+
- Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
|
2827
|
+
- Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
|
2828
|
+
- Known Data Locations: {', '.join(self.campaign_info.get('data_locations', []))}
|
2829
|
+
|
2830
|
+
IMPORTANT: Use this information to guide your extraction. The model reaction should involve:
|
2831
|
+
- Substrate ID: {self.campaign_info.get('substrate_id', '')}
|
2832
|
+
- Product ID: {self.campaign_info.get('product_id', '')}
|
2509
2833
|
"""
|
2510
2834
|
|
2511
2835
|
campaign_context = f"""
|
@@ -2515,6 +2839,7 @@ You are extracting the model reaction used specifically for these enzyme variant
|
|
2515
2839
|
|
2516
2840
|
These variants belong to campaign: {self.campaign_filter}
|
2517
2841
|
{campaigns_context}
|
2842
|
+
{campaign_info_context}
|
2518
2843
|
Focus on extracting the model reaction that was used to evaluate THESE specific variants.
|
2519
2844
|
Different campaigns may use different model reactions and substrates.
|
2520
2845
|
|
@@ -2625,34 +2950,6 @@ Different campaigns may use different model reactions and substrates.
|
|
2625
2950
|
LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
|
2626
2951
|
list(compound_mappings.keys()))
|
2627
2952
|
|
2628
|
-
# First, populate IUPAC lists directly from compound mappings based on compound_type
|
2629
|
-
substrate_iupacs_from_mappings = []
|
2630
|
-
product_iupacs_from_mappings = []
|
2631
|
-
|
2632
|
-
for mapping in compound_mappings.values():
|
2633
|
-
if mapping.iupac_name and mapping.compound_type:
|
2634
|
-
if mapping.compound_type.lower() == "substrate":
|
2635
|
-
substrate_iupacs_from_mappings.append(mapping.iupac_name)
|
2636
|
-
LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
|
2637
|
-
elif mapping.compound_type.lower() == "product":
|
2638
|
-
product_iupacs_from_mappings.append(mapping.iupac_name)
|
2639
|
-
LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
|
2640
|
-
|
2641
|
-
# Initialize or update the IUPAC lists with mapped compounds
|
2642
|
-
if substrate_iupacs_from_mappings:
|
2643
|
-
existing_substrates = data.get("substrate_iupac_list", []) or []
|
2644
|
-
if isinstance(existing_substrates, list):
|
2645
|
-
data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
|
2646
|
-
else:
|
2647
|
-
data["substrate_iupac_list"] = substrate_iupacs_from_mappings
|
2648
|
-
|
2649
|
-
if product_iupacs_from_mappings:
|
2650
|
-
existing_products = data.get("product_iupac_list", []) or []
|
2651
|
-
if isinstance(existing_products, list):
|
2652
|
-
data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
|
2653
|
-
else:
|
2654
|
-
data["product_iupac_list"] = product_iupacs_from_mappings
|
2655
|
-
|
2656
2953
|
# Try to map substrate/product lists through compound IDs
|
2657
2954
|
substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
|
2658
2955
|
if isinstance(substrate_list, list):
|
@@ -2790,7 +3087,7 @@ Different campaigns may use different model reactions and substrates.
|
|
2790
3087
|
best_location.get('confidence', 0))
|
2791
3088
|
|
2792
3089
|
# Extract metrics from the most confident source only
|
2793
|
-
metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location
|
3090
|
+
metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location)
|
2794
3091
|
|
2795
3092
|
# Filter to valid metrics
|
2796
3093
|
valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
|
@@ -3034,7 +3331,7 @@ Different campaigns may use different model reactions and substrates.
|
|
3034
3331
|
location['location'], location.get('confidence', 0))
|
3035
3332
|
|
3036
3333
|
# Extract metrics from this location
|
3037
|
-
metrics_rows = self.extract_metrics_batch(list(all_variants), location
|
3334
|
+
metrics_rows = self.extract_metrics_batch(list(all_variants), location)
|
3038
3335
|
|
3039
3336
|
# Filter to valid metrics
|
3040
3337
|
valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
|
@@ -3263,6 +3560,11 @@ def main() -> None:
|
|
3263
3560
|
LOGGER.info("Loading enzyme data from CSV…")
|
3264
3561
|
enzyme_df = pd.read_csv(args.lineage_csv)
|
3265
3562
|
|
3563
|
+
# Rename enzyme_id to enzyme if needed
|
3564
|
+
if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
|
3565
|
+
enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
|
3566
|
+
LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
|
3567
|
+
|
3266
3568
|
# Detect campaign information from the enzyme CSV
|
3267
3569
|
if 'campaign_id' in enzyme_df.columns:
|
3268
3570
|
all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
|
@@ -3277,10 +3579,27 @@ def main() -> None:
|
|
3277
3579
|
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3278
3580
|
LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
|
3279
3581
|
|
3582
|
+
# Load campaign info from campaigns.json if available
|
3583
|
+
campaign_info = None
|
3584
|
+
if args.debug_dir:
|
3585
|
+
from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
|
3586
|
+
campaigns_file = Path(args.debug_dir) / "campaigns.json"
|
3587
|
+
if campaigns_file.exists():
|
3588
|
+
campaigns = load_campaigns_from_file(campaigns_file)
|
3589
|
+
campaign_info = find_campaign_by_id(campaigns, campaign_filter)
|
3590
|
+
if campaign_info:
|
3591
|
+
LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign_filter)
|
3592
|
+
|
3280
3593
|
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3281
|
-
campaign_filter=campaign_filter, all_campaigns=all_campaigns
|
3594
|
+
campaign_filter=campaign_filter, all_campaigns=all_campaigns,
|
3595
|
+
campaign_info=campaign_info)
|
3282
3596
|
df_metrics = extractor.run(enzyme_df)
|
3283
3597
|
|
3598
|
+
# For single campaign, also merge with lineage data
|
3599
|
+
if not df_metrics.empty:
|
3600
|
+
df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
|
3601
|
+
LOGGER.info("Merged metrics with lineage data for single campaign")
|
3602
|
+
|
3284
3603
|
elif len(all_campaigns) > 1:
|
3285
3604
|
LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
|
3286
3605
|
all_results = []
|
@@ -3304,8 +3623,20 @@ def main() -> None:
|
|
3304
3623
|
campaign_debug_dir.mkdir(parents=True, exist_ok=True)
|
3305
3624
|
LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
|
3306
3625
|
|
3626
|
+
# Load campaign info from campaigns.json if available
|
3627
|
+
campaign_info = None
|
3628
|
+
if args.debug_dir:
|
3629
|
+
from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
|
3630
|
+
campaigns_file = Path(args.debug_dir) / "campaigns.json"
|
3631
|
+
if campaigns_file.exists():
|
3632
|
+
campaigns = load_campaigns_from_file(campaigns_file)
|
3633
|
+
campaign_info = find_campaign_by_id(campaigns, campaign)
|
3634
|
+
if campaign_info:
|
3635
|
+
LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign)
|
3636
|
+
|
3307
3637
|
extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
|
3308
|
-
campaign_filter=campaign, all_campaigns=all_campaigns
|
3638
|
+
campaign_filter=campaign, all_campaigns=all_campaigns,
|
3639
|
+
campaign_info=campaign_info)
|
3309
3640
|
|
3310
3641
|
# Run extraction for this campaign
|
3311
3642
|
campaign_metrics = extractor.run(campaign_df)
|
@@ -3319,6 +3650,10 @@ def main() -> None:
|
|
3319
3650
|
# Merge campaign metrics with lineage data
|
3320
3651
|
campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
|
3321
3652
|
|
3653
|
+
# Rename aa_seq to protein_sequence for consistency
|
3654
|
+
if 'aa_seq' in campaign_final.columns:
|
3655
|
+
campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
|
3656
|
+
|
3322
3657
|
# Save campaign-specific file immediately
|
3323
3658
|
output_dir = args.output.parent
|
3324
3659
|
base_name = args.output.stem
|
@@ -3335,6 +3670,10 @@ def main() -> None:
|
|
3335
3670
|
# Still save an empty campaign file with lineage data
|
3336
3671
|
campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
|
3337
3672
|
if not campaign_lineage.empty:
|
3673
|
+
# Rename aa_seq to protein_sequence for consistency
|
3674
|
+
if 'aa_seq' in campaign_lineage.columns:
|
3675
|
+
campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
|
3676
|
+
|
3338
3677
|
output_dir = args.output.parent
|
3339
3678
|
base_name = args.output.stem
|
3340
3679
|
campaign_file = output_dir / f"{base_name}_{campaign}.csv"
|
@@ -3365,6 +3704,11 @@ def main() -> None:
|
|
3365
3704
|
df_final = df_metrics
|
3366
3705
|
LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
|
3367
3706
|
|
3707
|
+
# Rename aa_seq to protein_sequence for consistency
|
3708
|
+
if df_final is not None and 'aa_seq' in df_final.columns:
|
3709
|
+
df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
|
3710
|
+
LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
|
3711
|
+
|
3368
3712
|
df_final.to_csv(args.output, index=False)
|
3369
3713
|
LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
|
3370
3714
|
|