debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,7 +37,16 @@ from dataclasses import dataclass, field
37
37
  from functools import lru_cache
38
38
  from pathlib import Path
39
39
  from textwrap import dedent
40
- from typing import Any, Dict, List, Optional, Tuple
40
+ from typing import Any, Dict, List, Optional, Tuple, Union
41
+
42
+ # Import universal caption pattern
43
+ try:
44
+ from .caption_pattern import get_universal_caption_pattern
45
+ from .campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
46
+ except ImportError:
47
+ # Fallback if running as standalone script
48
+ from caption_pattern import get_universal_caption_pattern
49
+ from campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
41
50
 
42
51
  import fitz # PyMuPDF - for image extraction
43
52
  import google.generativeai as genai # type: ignore
@@ -433,7 +442,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
433
442
  You are an expert reader of protein engineering manuscripts.
434
443
  Given the following article captions and section titles, identify most promising locations
435
444
  (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
436
- activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
445
+ activity, etc.) for enzyme variants.
446
+
447
+ CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
448
+ - Look for locations showing data for ALL enzyme variants in the evolution lineage
449
+ - Prioritize sources that show the complete evolutionary progression (parent → child variants)
450
+ - Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
451
+ - Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
437
452
 
438
453
  IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
439
454
  performance data locations. Pay careful attention to:
@@ -441,8 +456,13 @@ performance data locations. Pay careful attention to:
441
456
  - Enzyme name prefixes that indicate different campaigns
442
457
  - Different substrate/product types mentioned in captions
443
458
 
459
+ IMPORTANT FIGURE REFERENCE RULES:
460
+ - For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
461
+ - The extraction system will handle retrieving the entire figure including all sub-panels
462
+ - For tables, return the complete reference as it appears
463
+
444
464
  Respond with a JSON array where each element contains:
445
- - "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
465
+ - "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
446
466
  - "type": one of "table", "figure"
447
467
  - "confidence": your confidence score (0-100)
448
468
  - "caption": the exact caption text for this location
@@ -450,14 +470,29 @@ Respond with a JSON array where each element contains:
450
470
  - "lineage_hint": any indication of which enzyme group this data is for (or null)
451
471
  - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
452
472
 
453
- Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
473
+ PRIORITIZATION RULES:
474
+ - HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
475
+ - MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
476
+ - LOWEST PRIORITY: Sources showing data for individual variants only
477
+
478
+ Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
479
+
480
+ IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
481
+
454
482
  Do not include too much sources, just return 2 or 3 sources.
455
483
  Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
456
484
  When returning confidence scores, be more accurate and avoid scores that are too close together.
485
+
486
+ CRITICAL:
487
+ - Return "location" EXACTLY as the first reference identifier appears in the actual caption text
488
+ - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
489
+ - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
490
+ - Include "document" field to specify which PDF contains this location: "manuscript" or "supplementary"
491
+
457
492
  Respond ONLY with **minified JSON**. NO markdown fences.
458
493
 
459
- Example:
460
- [{"location": "Table S1", "type": "table", "confidence": 95, "caption": "Table S1. Detailed information...", "reason": "Complete performance metrics", "lineage_hint": "first enzyme family", "campaign_clues": "PYS lineage, pyrrolidine synthesis"}]
494
+ Format:
495
+ [{"location": "", "type": "", "document": "", "confidence": 0, "caption": "", "reason": "", "lineage_hint": "", "campaign_clues": ""}]
461
496
  """)
462
497
 
463
498
  PROMPT_EXTRACT_METRICS = dedent("""
@@ -484,6 +519,13 @@ IMPORTANT:
484
519
  - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
485
520
  - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
486
521
 
522
+ CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
523
+ - Yield (%) measures how much product was formed (0-100%)
524
+ - Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
525
+ - TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
526
+ - These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
527
+ - Be extremely careful when extracting from tables/figures with multiple columns or data series
528
+
487
529
  Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
488
530
  """)
489
531
 
@@ -511,6 +553,17 @@ STEP 4: Extract values for each matched variant
511
553
  - CRITICAL: Read actual scale values from the axis labels and tick marks
512
554
  - Verify: taller bars should have higher values, higher dots should have higher values
513
555
 
556
+ CRITICAL DATA ACCURACY REQUIREMENTS:
557
+ - DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
558
+ - Yield is typically shown as percentage (0-100%)
559
+ - Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
560
+ - TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
561
+ - Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
562
+ - Carefully match each bar/dot to its corresponding enzyme label on the X-axis
563
+ - If looking at grouped bars, ensure you're reading the correct bar for each metric
564
+ - Double-check that variant A's yield is not confused with variant B's yield
565
+ - If values are unclear or ambiguous, return null rather than guessing
566
+
514
567
  Target enzymes to find and extract:
515
568
  {enzyme_names}
516
569
 
@@ -734,7 +787,8 @@ class ReactionExtractor:
734
787
  _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
735
788
 
736
789
  def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
737
- campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
790
+ campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None,
791
+ campaign_info: Optional[Dict[str, Any]] = None):
738
792
  self.manuscript = manuscript
739
793
  self.si = si
740
794
  self.cfg = cfg
@@ -742,6 +796,7 @@ class ReactionExtractor:
742
796
  self.debug_dir = debug_dir
743
797
  self.campaign_filter = campaign_filter # Filter for specific campaign
744
798
  self.all_campaigns = all_campaigns or [] # List of all campaigns for context
799
+ self.campaign_info = campaign_info # Detailed campaign information from campaigns.json
745
800
 
746
801
  # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
747
802
  self._figure_cache = LRUCache(maxsize=100) # Figures are large, so smaller cache
@@ -778,10 +833,8 @@ class ReactionExtractor:
778
833
  # ------------------------------------------------------------------
779
834
 
780
835
  def _collect_captions_and_titles(self) -> str:
781
- # Pattern to match Table or Figure with optional leading whitespace and page numbers
782
- # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
783
- # Also handles cases where there's whitespace or page numbers before the caption
784
- cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
836
+ # Use universal caption pattern that handles all common formats
837
+ cap_pattern = get_universal_caption_pattern()
785
838
  captions: List[str] = []
786
839
 
787
840
  # Process each page individually to avoid TOC entries
@@ -861,7 +914,34 @@ class ReactionExtractor:
861
914
 
862
915
  # Add campaign context - always provide context to help model understanding
863
916
  campaign_context = ""
864
- if self.campaign_filter:
917
+
918
+ # If we have detailed campaign info, use it to provide specific guidance
919
+ if self.campaign_info:
920
+ location_hints = get_location_hints_for_campaign(self.campaign_info)
921
+ campaign_context = f"""
922
+ IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
923
+
924
+ CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
925
+ - Campaign ID: {self.campaign_info.get('campaign_id', '')}
926
+ - Name: {self.campaign_info.get('campaign_name', '')}
927
+ - Description: {self.campaign_info.get('description', '')}
928
+ - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
929
+ - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
930
+ - Notes: {self.campaign_info.get('notes', '')}
931
+
932
+ KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
933
+ These locations are known to contain relevant data - prioritize them highly.
934
+
935
+ CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
936
+ - Look for data showing the entire evolutionary progression of enzyme variants
937
+ - Prioritize locations that show performance data for ALL variants in the lineage
938
+ - The campaign description and notes above provide context about the evolution strategy used
939
+
940
+ {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
941
+
942
+ CRITICAL: Only return locations that contain data for this specific campaign.
943
+ """
944
+ elif self.campaign_filter:
865
945
  campaigns_warning = ""
866
946
  if self.all_campaigns:
867
947
  campaigns_warning = f"""
@@ -1103,11 +1183,114 @@ class ReactionExtractor:
1103
1183
 
1104
1184
  return toc_score >= 2
1105
1185
 
1186
+ def _build_caption_index(self) -> Dict[str, Dict[str, Any]]:
1187
+ """Build an index of all captions for quick lookup."""
1188
+ if hasattr(self, '_caption_index'):
1189
+ return self._caption_index
1190
+
1191
+ cap_pattern = get_universal_caption_pattern()
1192
+ caption_index = {}
1193
+
1194
+ for idx, page in enumerate(self.all_pages):
1195
+ source = "manuscript" if idx < len(self.ms_pages) else "supplementary"
1196
+ page_num = idx + 1 if idx < len(self.ms_pages) else idx - len(self.ms_pages) + 1
1197
+
1198
+ for match in cap_pattern.finditer(page):
1199
+ caption_text = match.group(0).strip()
1200
+ # Extract a normalized key (e.g., "table 5", "figure 3")
1201
+ caption_lower = caption_text.lower()
1202
+
1203
+ # Store multiple access patterns for the same caption
1204
+ caption_info = {
1205
+ 'full_caption': caption_text,
1206
+ 'page_content': page,
1207
+ 'page_idx': idx,
1208
+ 'source': source,
1209
+ 'page_num': page_num,
1210
+ 'match_start': match.start()
1211
+ }
1212
+
1213
+ # Create multiple keys for flexible matching
1214
+ # Key 1: Full caption text (first 100 chars)
1215
+ key1 = caption_text[:100].lower().strip()
1216
+ caption_index[key1] = caption_info
1217
+
1218
+ # Key 2: Simplified reference (e.g., "table 5", "figure s3")
1219
+ ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
1220
+ if ref_match:
1221
+ key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
1222
+ caption_index[key2] = caption_info
1223
+
1224
+ # Also store with 's' prefix if in SI
1225
+ if source == "supplementary" and 's' not in key2:
1226
+ key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
1227
+ caption_index[key3] = caption_info
1228
+
1229
+ self._caption_index = caption_index
1230
+ return caption_index
1231
+
1106
1232
  def _page_with_reference(self, ref_id: str) -> Optional[str]:
1107
- for page in self.all_pages:
1108
- if ref_id.lower() in page.lower():
1109
- return page
1110
- return None
1233
+ """Find page(s) containing a reference using flexible matching."""
1234
+ caption_index = self._build_caption_index()
1235
+ ref_lower = ref_id.lower().strip()
1236
+
1237
+ # Try multiple matching strategies
1238
+ matches = []
1239
+
1240
+ # Strategy 1: Direct key lookup
1241
+ if ref_lower in caption_index:
1242
+ matches.append(caption_index[ref_lower])
1243
+
1244
+ # Strategy 2: Normalized reference lookup (e.g., "table 5", "figure s3")
1245
+ ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
1246
+ if ref_match:
1247
+ ref_type, ref_num = ref_match.groups()
1248
+ if ref_type == 'fig':
1249
+ ref_type = 'figure'
1250
+
1251
+ # Try different key formats
1252
+ keys_to_try = [
1253
+ f"{ref_type} {ref_num}",
1254
+ f"{ref_type} s{ref_num}",
1255
+ f"table {ref_num}", # Sometimes figures are mislabeled
1256
+ f"fig {ref_num}",
1257
+ f"figure {ref_num}"
1258
+ ]
1259
+
1260
+ for key in keys_to_try:
1261
+ if key in caption_index and caption_index[key] not in matches:
1262
+ matches.append(caption_index[key])
1263
+
1264
+ # Strategy 3: Fuzzy matching on caption text
1265
+ if not matches:
1266
+ # Look for any caption containing the reference number
1267
+ for key, info in caption_index.items():
1268
+ if ref_match and ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
1269
+ if info not in matches:
1270
+ matches.append(info)
1271
+
1272
+ # Return results
1273
+ if not matches:
1274
+ LOGGER.warning(f"No matches found for reference '{ref_id}'")
1275
+ # Last resort: simple text search
1276
+ for page in self.all_pages:
1277
+ if ref_lower in page.lower():
1278
+ return page
1279
+ return None
1280
+
1281
+ # If single match, return it
1282
+ if len(matches) == 1:
1283
+ return matches[0]['page_content']
1284
+
1285
+ # Multiple matches: combine them with source annotations
1286
+ LOGGER.info(f"Found {len(matches)} potential matches for '{ref_id}'")
1287
+ combined_pages = []
1288
+ for match in matches:
1289
+ header = f"\n\n=== {match['source'].upper()} PAGE {match['page_num']} ===\n"
1290
+ header += f"Caption: {match['full_caption'][:200]}...\n"
1291
+ combined_pages.append(header + match['page_content'])
1292
+
1293
+ return "\n".join(combined_pages)
1111
1294
 
1112
1295
  # ---- Table text helper - now returns full page ----
1113
1296
  def _extract_table_context(self, ref: str) -> str:
@@ -1140,19 +1323,29 @@ class ReactionExtractor:
1140
1323
  return pix
1141
1324
 
1142
1325
  # ---- NEW: Page image helper for both figures and tables ----
1143
- def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
1326
+ def _extract_page_png(self, ref: str, extract_figure_only: bool = True, caption_hint: str = "", document_hint: str = "") -> Optional[str]:
1144
1327
  """Export the page containing the reference as PNG.
1145
1328
  If extract_figure_only=True, extracts just the figure above the caption.
1146
1329
  If False, extracts the entire page (useful for tables).
1147
- Returns a base64-encoded PNG or None."""
1148
- LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
1330
+ Returns a base64-encoded PNG or None.
1149
1331
 
1150
- # Check cache first
1151
- cache_key = f"{ref}_{extract_figure_only}"
1332
+ Args:
1333
+ ref: The reference string (e.g., "Fig. 3")
1334
+ extract_figure_only: Whether to extract just the figure or the entire page
1335
+ caption_hint: Optional caption text from location data to help find the exact figure
1336
+ document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
1337
+ """
1338
+ LOGGER.info("_extract_page_png called with ref='%s', extract_figure_only=%s, caption_hint='%s', document_hint='%s'",
1339
+ ref, extract_figure_only, caption_hint[:50] + "..." if caption_hint else "EMPTY", document_hint)
1340
+
1341
+ # Check cache first - include document hint in key to avoid cross-document contamination
1342
+ cache_key = f"{ref}_{extract_figure_only}_{document_hint}" if document_hint else f"{ref}_{extract_figure_only}"
1152
1343
  cached_result = self._figure_cache.get(cache_key)
1153
1344
  if cached_result is not None:
1154
- LOGGER.debug("Using cached figure for %s", ref)
1345
+ LOGGER.info("Using cached figure for %s (cache key: %s)", ref, cache_key)
1155
1346
  return cached_result
1347
+ else:
1348
+ LOGGER.info("Cache miss for %s (cache key: %s)", ref, cache_key)
1156
1349
 
1157
1350
  # For table extraction, use multi-page approach
1158
1351
  if not extract_figure_only:
@@ -1162,13 +1355,28 @@ class ReactionExtractor:
1162
1355
  return self._extract_multiple_pages_png(pages_with_ref, ref)
1163
1356
  return None
1164
1357
 
1165
- # For figure extraction, search both documents for actual figure captions
1166
- docs = list(filter(None, [self.ms_doc, self.si_doc]))
1167
- LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
1358
+ # For figure extraction, prioritize based on document hint
1359
+ if document_hint == "manuscript" and self.ms_doc:
1360
+ # Search manuscript first, then SI as fallback
1361
+ docs = list(filter(None, [self.ms_doc, self.si_doc]))
1362
+ LOGGER.info("Prioritizing manuscript document for '%s' (hint: %s)", ref, document_hint)
1363
+ LOGGER.info("Search order: 1) Manuscript, 2) SI (fallback)")
1364
+ elif document_hint == "supplementary" and self.si_doc:
1365
+ # Search SI first, then manuscript as fallback
1366
+ docs = list(filter(None, [self.si_doc, self.ms_doc]))
1367
+ LOGGER.info("Prioritizing supplementary document for '%s' (hint: %s)", ref, document_hint)
1368
+ LOGGER.info("Search order: 1) SI, 2) Manuscript (fallback)")
1369
+ else:
1370
+ # Default behavior - search both in order
1371
+ docs = list(filter(None, [self.ms_doc, self.si_doc]))
1372
+ LOGGER.info("Searching for '%s' in %d documents (no document hint)", ref, len(docs))
1373
+ LOGGER.info("Search order: 1) Manuscript, 2) SI (default order)")
1168
1374
 
1169
1375
  for doc_idx, doc in enumerate(docs):
1170
- doc_name = "MS" if doc_idx == 0 else "SI"
1171
- LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
1376
+ # Determine document name based on actual document, not position
1377
+ doc_name = "MS" if doc == self.ms_doc else "SI"
1378
+ LOGGER.info("Searching document %d/%d: %s (has %d pages)",
1379
+ doc_idx + 1, len(docs), doc_name, doc.page_count)
1172
1380
 
1173
1381
  for page_number in range(doc.page_count):
1174
1382
  page = doc.load_page(page_number)
@@ -1181,9 +1389,90 @@ class ReactionExtractor:
1181
1389
  LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
1182
1390
  continue
1183
1391
 
1392
+ # If we have a caption hint, try to find it using fuzzy matching
1393
+ if caption_hint:
1394
+ LOGGER.info("=== CAPTION HINT SEARCH ===")
1395
+ LOGGER.info("Caption hint provided: %s", caption_hint[:100])
1396
+ LOGGER.info("Searching in %s document, page %d", doc_name, page_number + 1)
1397
+ LOGGER.info("Page text length: %d chars", len(page_text))
1398
+
1399
+ # Check if caption exists in raw form
1400
+ if caption_hint[:50] in page_text:
1401
+ LOGGER.info("✓ Caption hint found in raw page text!")
1402
+ else:
1403
+ LOGGER.info("✗ Caption hint NOT found in raw page text")
1404
+
1405
+ # Normalize texts for better matching
1406
+ def normalize_for_matching(text):
1407
+ # Remove extra whitespace, normalize spaces around punctuation
1408
+ text = ' '.join(text.split())
1409
+ # Normalize different dash types
1410
+ text = text.replace('–', '-').replace('—', '-')
1411
+ return text
1412
+
1413
+ normalized_hint = normalize_for_matching(caption_hint[:100]) # Use first 100 chars
1414
+ normalized_page = normalize_for_matching(page_text)
1415
+
1416
+ # Try to find the caption using fuzzy matching
1417
+ best_match_pos = -1
1418
+ best_match_score = 0
1419
+ match_found = False
1420
+
1421
+ # Slide through the page text looking for best match
1422
+ hint_len = len(normalized_hint)
1423
+ for i in range(len(normalized_page) - hint_len + 1):
1424
+ snippet = normalized_page[i:i + hint_len]
1425
+ # Simple character-based similarity
1426
+ matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
1427
+ score = matches / hint_len
1428
+
1429
+ if score > best_match_score and score > 0.8: # 80% similarity threshold
1430
+ best_match_score = score
1431
+ best_match_pos = i
1432
+ match_found = True
1433
+
1434
+ if match_found and best_match_pos >= 0:
1435
+ LOGGER.info("Found caption match in %s document on page %d with %.1f%% similarity",
1436
+ doc_name, page_number + 1, best_match_score * 100)
1437
+
1438
+ # Instead of complex position mapping, just search for the beginning of the caption
1439
+ # Use the first 30 chars which should be unique enough
1440
+ search_text = caption_hint[:30].strip()
1441
+ LOGGER.info("Searching for caption text: '%s'", search_text)
1442
+ caption_instances = page.search_for(search_text)
1443
+ LOGGER.info("Found %d caption instances", len(caption_instances) if caption_instances else 0)
1444
+
1445
+ if caption_instances:
1446
+ cap_rect = caption_instances[0]
1447
+ caption_found = True
1448
+ # Extract figure above this caption
1449
+ if extract_figure_only:
1450
+ LOGGER.info("Extracting figure area including caption for %s from %s document", ref, doc_name)
1451
+ LOGGER.info("Caption found at rect: %s on page %d", cap_rect, page_number + 1)
1452
+ page_rect = page.rect
1453
+
1454
+ # Include the caption in the extraction
1455
+ # Add some padding below the caption to ensure we get the full text
1456
+ caption_padding = 30 # pixels below caption
1457
+ figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
1458
+ LOGGER.info("Page rect: %s, Figure rect including caption: %s", page_rect, figure_rect)
1459
+ mat = fitz.Matrix(5.0, 5.0)
1460
+ pix = page.get_pixmap(matrix=mat, clip=figure_rect)
1461
+ pix = self._ensure_rgb_pixmap(pix)
1462
+ img_bytes = pix.tobytes("png")
1463
+ img_b64 = b64encode(img_bytes).decode('utf-8')
1464
+ self._figure_cache.put(cache_key, img_b64)
1465
+ LOGGER.info("Successfully extracted figure using caption hint for %s from %s document, page %d",
1466
+ ref, doc_name, page_number + 1)
1467
+ return img_b64
1468
+ else:
1469
+ LOGGER.info("No fuzzy match found for caption hint on page %d (best score: %.1f%%)",
1470
+ page_number + 1, best_match_score * 100)
1471
+
1472
+ # If caption hint didn't work or wasn't provided, fall back to pattern matching
1184
1473
  # Look for figure caption pattern more flexibly
1185
1474
  # Normalize the reference to handle variations
1186
- figure_num = ref.replace('Figure', '').replace('figure', '').strip()
1475
+ figure_num = ref.replace('Figure', '').replace('figure', '').replace('Fig.', '').replace('Fig', '').strip()
1187
1476
 
1188
1477
  # Extract main figure number from subfigure (e.g., "1C" -> "1")
1189
1478
  main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1193,10 +1482,10 @@ class ReactionExtractor:
1193
1482
  main_figure_num = figure_num
1194
1483
 
1195
1484
  # Create a flexible pattern that handles various spacing and formatting
1196
- # This pattern looks for "Figure" (case insensitive) followed by optional spaces
1197
- # then the figure number, then any of: period, colon, space+capital letter, or end of line
1485
+ # This pattern looks for "Figure" or "Fig" (case insensitive) followed by optional spaces
1486
+ # then the figure number, then any of: period, colon, pipe, space+capital letter, or end of line
1198
1487
  # Also match at the beginning of a line to catch captions
1199
- flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
1488
+ flexible_pattern = rf"(?i)(?:^|\n)\s*(?:figure|fig\.?)\s*{re.escape(main_figure_num)}(?:\.|:|\||\s+\||(?=\s+[A-Z])|\s*$)"
1200
1489
 
1201
1490
  LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
1202
1491
  main_figure_num, flexible_pattern)
@@ -1252,8 +1541,8 @@ class ReactionExtractor:
1252
1541
  if text_instances:
1253
1542
  cap_rect = text_instances[0]
1254
1543
  caption_found = True
1255
- LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
1256
- ref, caption_text, remaining_text[:50])
1544
+ LOGGER.info("Found actual caption for %s in %s document on page %d: '%s' with following text: '%s...'",
1545
+ ref, doc_name, page_number + 1, caption_text, remaining_text[:50])
1257
1546
  break
1258
1547
 
1259
1548
  if not caption_found:
@@ -1274,18 +1563,18 @@ class ReactionExtractor:
1274
1563
  continue
1275
1564
 
1276
1565
  if extract_figure_only:
1277
- # Extract only the area above the caption (the actual figure)
1278
- # This excludes caption text and focuses on visual elements
1279
- LOGGER.info("Extracting figure area above caption for %s", ref)
1566
+ # Extract the figure area including the caption
1567
+ LOGGER.info("Extracting figure area including caption for %s", ref)
1280
1568
 
1281
1569
  # Get the page dimensions
1282
1570
  page_rect = page.rect
1283
1571
 
1284
- # Extract the area above the caption
1572
+ # Extract the area including the caption
1285
1573
  if cap_rect:
1286
- # Extract from top of page to top of caption
1287
- figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
1288
- LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
1574
+ # Extract from top of page to bottom of caption plus padding
1575
+ caption_padding = 30 # pixels below caption
1576
+ figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
1577
+ LOGGER.debug("Extracting figure area with caption: %s (caption ends at y=%f)", figure_rect, cap_rect.y1)
1289
1578
  else:
1290
1579
  # If no caption found, use top 80% of page
1291
1580
  figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
@@ -1419,7 +1708,7 @@ class ReactionExtractor:
1419
1708
  # Sort pages by document and page number
1420
1709
  pages.sort(key=lambda x: (id(x[0]), x[1]))
1421
1710
 
1422
- # Extract the range of pages including one page after
1711
+ # Extract the range of pages including one page after for tables
1423
1712
  all_images = []
1424
1713
  for i, (doc, page_num) in enumerate(pages):
1425
1714
  # Add the current page
@@ -1428,7 +1717,16 @@ class ReactionExtractor:
1428
1717
  pix = self._ensure_rgb_pixmap(pix)
1429
1718
  all_images.append(pix)
1430
1719
 
1431
- # Only extract the page containing the reference (removed next page logic)
1720
+ # Add the next page as well for tables (in case data continues)
1721
+ next_page_num = page_num + 1
1722
+ if next_page_num < doc.page_count:
1723
+ try:
1724
+ next_pix = doc.load_page(next_page_num).get_pixmap(matrix=mat)
1725
+ next_pix = self._ensure_rgb_pixmap(next_pix)
1726
+ all_images.append(next_pix)
1727
+ LOGGER.info("Including next page (%d) for table %s", next_page_num + 1, ref)
1728
+ except Exception as e:
1729
+ LOGGER.warning("Failed to extract next page %d for %s: %s", next_page_num + 1, ref, e)
1432
1730
 
1433
1731
  if not all_images:
1434
1732
  return None
@@ -1533,21 +1831,9 @@ class ReactionExtractor:
1533
1831
 
1534
1832
  def _validate_location_exists(self, ref: str) -> bool:
1535
1833
  """Verify that the referenced location actually exists in the document."""
1536
- # Search for the actual reference in both manuscript and SI documents
1537
- docs_to_check = [self.ms_doc]
1538
- if self.si_doc:
1539
- docs_to_check.append(self.si_doc)
1540
-
1541
- for doc in docs_to_check:
1542
- for page_num in range(len(doc)):
1543
- page = doc[page_num]
1544
- text = page.get_text()
1545
-
1546
- # Look for table references like "Table 1", "Table S1", etc.
1547
- if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
1548
- return True
1549
-
1550
- return False
1834
+ # Use the caption index to check if location exists
1835
+ result = self._page_with_reference(ref)
1836
+ return result is not None
1551
1837
 
1552
1838
  def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
1553
1839
  """Validate that the context contains meaningful content for extraction."""
@@ -1596,15 +1882,31 @@ class ReactionExtractor:
1596
1882
  LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
1597
1883
  return True
1598
1884
 
1599
- def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
1600
- """Extract performance metrics for multiple enzymes from the identified location in batch."""
1601
- LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
1602
- ref_lc = ref.lower()
1885
+ def extract_metrics_batch(self, enzyme_list: List[str], ref: Union[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
1886
+ """Extract performance metrics for multiple enzymes from the identified location in batch.
1887
+
1888
+ Args:
1889
+ enzyme_list: List of enzyme names to extract metrics for
1890
+ ref: Either a string reference (e.g., "Fig. 3") or a location dict with 'location' and optionally 'caption'
1891
+ """
1892
+ # Handle both string and dict inputs
1893
+ if isinstance(ref, dict):
1894
+ location_str = ref['location']
1895
+ caption_hint = ref.get('caption', '')
1896
+ document_hint = ref.get('document', '')
1897
+ LOGGER.info("extract_metrics_batch called with location='%s' (with caption hint, document=%s) for %d enzymes",
1898
+ location_str, document_hint, len(enzyme_list))
1899
+ else:
1900
+ location_str = ref
1901
+ caption_hint = ''
1902
+ document_hint = ''
1903
+ LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", location_str, len(enzyme_list))
1904
+ ref_lc = location_str.lower()
1603
1905
  image_b64: Optional[str] = None
1604
1906
 
1605
1907
  # First, validate that the location actually exists in the document
1606
- if not self._validate_location_exists(ref):
1607
- LOGGER.warning("Location %s not found in document - skipping", ref)
1908
+ if not self._validate_location_exists(location_str):
1909
+ LOGGER.warning("Location %s not found in document - skipping", location_str)
1608
1910
  return []
1609
1911
 
1610
1912
  # Add campaign context if available
@@ -1614,30 +1916,30 @@ class ReactionExtractor:
1614
1916
 
1615
1917
  if self._TAB_RE.search(ref_lc):
1616
1918
  # For tables, try to extract the page as an image first
1617
- image_b64 = self._extract_page_png(ref, extract_figure_only=False)
1919
+ image_b64 = self._extract_page_png(location_str, extract_figure_only=False, document_hint=document_hint)
1618
1920
  if not image_b64:
1619
- LOGGER.debug("No page image found for %s - using full page text", ref)
1620
- snippet = self._extract_table_context(ref)
1921
+ LOGGER.debug("No page image found for %s - using full page text", location_str)
1922
+ snippet = self._extract_table_context(location_str)
1621
1923
  elif self._FIG_RE.search(ref_lc):
1622
1924
  # For figures, extract just the figure image (same logic as compound mapping)
1623
- LOGGER.debug("Attempting to extract figure image for '%s'", ref)
1624
- image_b64 = self._extract_page_png(ref, extract_figure_only=True)
1925
+ LOGGER.info("Attempting to extract figure image for '%s'", location_str)
1926
+ image_b64 = self._extract_page_png(location_str, extract_figure_only=True, caption_hint=caption_hint, document_hint=document_hint)
1625
1927
  if not image_b64:
1626
- LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
1627
- snippet = self._extract_figure_caption(ref)
1928
+ LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", location_str)
1929
+ snippet = self._extract_figure_caption(location_str)
1628
1930
  LOGGER.debug("Caption extraction result: %s",
1629
1931
  f"'{snippet[:100]}...'" if snippet else "empty")
1630
1932
  else:
1631
- LOGGER.info("Successfully extracted figure image for '%s'", ref)
1933
+ LOGGER.info("Successfully extracted figure image for '%s'", location_str)
1632
1934
  # If figure is found, ignore text information - use image only
1633
1935
  snippet = ""
1634
1936
  else:
1635
- snippet = self._page_with_reference(ref) or ""
1937
+ snippet = self._page_with_reference(location_str) or ""
1636
1938
 
1637
1939
  # For figures with images, skip text validation and proceed with image extraction
1638
1940
  if image_b64 and self._FIG_RE.search(ref_lc):
1639
- LOGGER.info("Using figure image for %s - ignoring text context", ref)
1640
- elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
1941
+ LOGGER.info("Using figure image for %s - ignoring text context", location_str)
1942
+ elif not image_b64 and not self._validate_context(snippet, enzyme_list, location_str):
1641
1943
  return []
1642
1944
 
1643
1945
  # Create enhanced enzyme descriptions with parent/mutation context
@@ -1934,6 +2236,19 @@ Different campaigns may use different model reactions.
1934
2236
  prompt = PROMPT_COMPOUND_MAPPING
1935
2237
  if campaign_filter:
1936
2238
  prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
2239
+
2240
+ # Add campaign info as hints
2241
+ if self.campaign_info:
2242
+ substrate_id = self.campaign_info.get('substrate_id', '')
2243
+ product_id = self.campaign_info.get('product_id', '')
2244
+ model_substrate = self.campaign_info.get('model_substrate', '')
2245
+ model_product = self.campaign_info.get('model_product', '')
2246
+
2247
+ if substrate_id and model_substrate:
2248
+ prompt += f"\n\nHINT: The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})"
2249
+ if product_id and model_product:
2250
+ prompt += f"\nHINT: The model product for this campaign is likely '{model_product}' (ID: {product_id})"
2251
+
1937
2252
  if compound_ids:
1938
2253
  prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
1939
2254
  prompt += "\n\nTEXT:\n" + extraction_text
@@ -2011,6 +2326,22 @@ IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campa
2011
2326
  {campaigns_warning}
2012
2327
  Different campaigns may use different numbering systems for compounds.
2013
2328
  Do NOT include compound information from other campaigns."""
2329
+
2330
+ # Add campaign info as hints
2331
+ if self.campaign_info:
2332
+ substrate_id = self.campaign_info.get('substrate_id', '')
2333
+ product_id = self.campaign_info.get('product_id', '')
2334
+ model_substrate = self.campaign_info.get('model_substrate', '')
2335
+ model_product = self.campaign_info.get('model_product', '')
2336
+
2337
+ hints = []
2338
+ if substrate_id and model_substrate:
2339
+ hints.append(f"The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})")
2340
+ if product_id and model_product:
2341
+ hints.append(f"The model product for this campaign is likely '{model_product}' (ID: {product_id})")
2342
+
2343
+ if hints:
2344
+ prompt += "\n\nHINTS FROM CAMPAIGN INFO:\n" + "\n".join(hints)
2014
2345
 
2015
2346
  prompt += """
2016
2347
 
@@ -2268,41 +2599,17 @@ Do NOT include compound information from other campaigns.
2268
2599
  if not mapping or not mapping.iupac_name:
2269
2600
  missing_compounds.append(cid)
2270
2601
 
2271
- # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
2602
+ # Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
2272
2603
  if missing_compounds:
2273
- LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
2604
+ LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
2274
2605
  len(missing_compounds), sorted(missing_compounds))
2275
2606
 
2276
- # Get all available figures for compound structure analysis
2277
- figure_images = {}
2278
-
2279
- # Extract main manuscript figures
2280
- figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
2281
- for ref in figure_refs:
2282
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2283
- if img_b64:
2284
- figure_images[ref] = img_b64
2285
- LOGGER.info("Retrieved %s for compound mapping", ref)
2286
-
2287
- # Get SI figures
2288
- si_figure_refs = []
2289
- for page in self.si_pages[:10]: # Check first 10 SI pages
2290
- matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
2291
- si_figure_refs.extend(matches[:10]) # Limit to 10 figures
2292
-
2293
- # Extract SI figures
2294
- for ref in set(si_figure_refs):
2295
- if ref not in figure_images:
2296
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2297
- if img_b64:
2298
- figure_images[ref] = img_b64
2299
- LOGGER.info("Extracted %s for compound mapping", ref)
2300
-
2301
2607
  # Full text search including ALL pages (manuscript + SI)
2302
2608
  full_text = "\n\n".join(self.all_pages) # Send everything
2303
2609
 
2304
- final_mappings = self._extract_compound_mappings_with_figures(
2305
- full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
2610
+ # Use text-only extraction for Tier 2 (no images)
2611
+ final_mappings = self._extract_compound_mappings_from_text(
2612
+ full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
2306
2613
  )
2307
2614
 
2308
2615
  # Merge final mappings with better compound ID matching
@@ -2476,6 +2783,7 @@ Do NOT include compound information from other campaigns.
2476
2783
  compound_mappings = {}
2477
2784
  if compound_ids:
2478
2785
  LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
2786
+
2479
2787
  # Pass the IUPAC location hint if we have it
2480
2788
  iupac_hint = locations.get("iupac_location") if locations else None
2481
2789
  compound_mappings = self._extract_compound_mappings_adaptive(
@@ -2506,6 +2814,22 @@ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely differ
2506
2814
  - Different campaigns may use similar enzyme names but different substrates
2507
2815
  - Be extremely careful to only extract data for the {self.campaign_filter} campaign
2508
2816
  - Ignore data from other campaigns even if they seem similar
2817
+ """
2818
+
2819
+ # Add specific campaign info if available
2820
+ campaign_info_context = ""
2821
+ if self.campaign_info:
2822
+ campaign_info_context = f"""
2823
+
2824
+ KNOWN CAMPAIGN INFORMATION:
2825
+ - Campaign: {self.campaign_info.get('campaign_name', '')}
2826
+ - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
2827
+ - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
2828
+ - Known Data Locations: {', '.join(self.campaign_info.get('data_locations', []))}
2829
+
2830
+ IMPORTANT: Use this information to guide your extraction. The model reaction should involve:
2831
+ - Substrate ID: {self.campaign_info.get('substrate_id', '')}
2832
+ - Product ID: {self.campaign_info.get('product_id', '')}
2509
2833
  """
2510
2834
 
2511
2835
  campaign_context = f"""
@@ -2515,6 +2839,7 @@ You are extracting the model reaction used specifically for these enzyme variant
2515
2839
 
2516
2840
  These variants belong to campaign: {self.campaign_filter}
2517
2841
  {campaigns_context}
2842
+ {campaign_info_context}
2518
2843
  Focus on extracting the model reaction that was used to evaluate THESE specific variants.
2519
2844
  Different campaigns may use different model reactions and substrates.
2520
2845
 
@@ -2625,34 +2950,6 @@ Different campaigns may use different model reactions and substrates.
2625
2950
  LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
2626
2951
  list(compound_mappings.keys()))
2627
2952
 
2628
- # First, populate IUPAC lists directly from compound mappings based on compound_type
2629
- substrate_iupacs_from_mappings = []
2630
- product_iupacs_from_mappings = []
2631
-
2632
- for mapping in compound_mappings.values():
2633
- if mapping.iupac_name and mapping.compound_type:
2634
- if mapping.compound_type.lower() == "substrate":
2635
- substrate_iupacs_from_mappings.append(mapping.iupac_name)
2636
- LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
2637
- elif mapping.compound_type.lower() == "product":
2638
- product_iupacs_from_mappings.append(mapping.iupac_name)
2639
- LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
2640
-
2641
- # Initialize or update the IUPAC lists with mapped compounds
2642
- if substrate_iupacs_from_mappings:
2643
- existing_substrates = data.get("substrate_iupac_list", []) or []
2644
- if isinstance(existing_substrates, list):
2645
- data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
2646
- else:
2647
- data["substrate_iupac_list"] = substrate_iupacs_from_mappings
2648
-
2649
- if product_iupacs_from_mappings:
2650
- existing_products = data.get("product_iupac_list", []) or []
2651
- if isinstance(existing_products, list):
2652
- data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
2653
- else:
2654
- data["product_iupac_list"] = product_iupacs_from_mappings
2655
-
2656
2953
  # Try to map substrate/product lists through compound IDs
2657
2954
  substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
2658
2955
  if isinstance(substrate_list, list):
@@ -2790,7 +3087,7 @@ Different campaigns may use different model reactions and substrates.
2790
3087
  best_location.get('confidence', 0))
2791
3088
 
2792
3089
  # Extract metrics from the most confident source only
2793
- metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
3090
+ metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location)
2794
3091
 
2795
3092
  # Filter to valid metrics
2796
3093
  valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
@@ -3034,7 +3331,7 @@ Different campaigns may use different model reactions and substrates.
3034
3331
  location['location'], location.get('confidence', 0))
3035
3332
 
3036
3333
  # Extract metrics from this location
3037
- metrics_rows = self.extract_metrics_batch(list(all_variants), location['location'])
3334
+ metrics_rows = self.extract_metrics_batch(list(all_variants), location)
3038
3335
 
3039
3336
  # Filter to valid metrics
3040
3337
  valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
@@ -3263,6 +3560,11 @@ def main() -> None:
3263
3560
  LOGGER.info("Loading enzyme data from CSV…")
3264
3561
  enzyme_df = pd.read_csv(args.lineage_csv)
3265
3562
 
3563
+ # Rename enzyme_id to enzyme if needed
3564
+ if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
3565
+ enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
3566
+ LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
3567
+
3266
3568
  # Detect campaign information from the enzyme CSV
3267
3569
  if 'campaign_id' in enzyme_df.columns:
3268
3570
  all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
@@ -3277,10 +3579,27 @@ def main() -> None:
3277
3579
  campaign_debug_dir.mkdir(parents=True, exist_ok=True)
3278
3580
  LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
3279
3581
 
3582
+ # Load campaign info from campaigns.json if available
3583
+ campaign_info = None
3584
+ if args.debug_dir:
3585
+ from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
3586
+ campaigns_file = Path(args.debug_dir) / "campaigns.json"
3587
+ if campaigns_file.exists():
3588
+ campaigns = load_campaigns_from_file(campaigns_file)
3589
+ campaign_info = find_campaign_by_id(campaigns, campaign_filter)
3590
+ if campaign_info:
3591
+ LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign_filter)
3592
+
3280
3593
  extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
3281
- campaign_filter=campaign_filter, all_campaigns=all_campaigns)
3594
+ campaign_filter=campaign_filter, all_campaigns=all_campaigns,
3595
+ campaign_info=campaign_info)
3282
3596
  df_metrics = extractor.run(enzyme_df)
3283
3597
 
3598
+ # For single campaign, also merge with lineage data
3599
+ if not df_metrics.empty:
3600
+ df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
3601
+ LOGGER.info("Merged metrics with lineage data for single campaign")
3602
+
3284
3603
  elif len(all_campaigns) > 1:
3285
3604
  LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
3286
3605
  all_results = []
@@ -3304,8 +3623,20 @@ def main() -> None:
3304
3623
  campaign_debug_dir.mkdir(parents=True, exist_ok=True)
3305
3624
  LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
3306
3625
 
3626
+ # Load campaign info from campaigns.json if available
3627
+ campaign_info = None
3628
+ if args.debug_dir:
3629
+ from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
3630
+ campaigns_file = Path(args.debug_dir) / "campaigns.json"
3631
+ if campaigns_file.exists():
3632
+ campaigns = load_campaigns_from_file(campaigns_file)
3633
+ campaign_info = find_campaign_by_id(campaigns, campaign)
3634
+ if campaign_info:
3635
+ LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign)
3636
+
3307
3637
  extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
3308
- campaign_filter=campaign, all_campaigns=all_campaigns)
3638
+ campaign_filter=campaign, all_campaigns=all_campaigns,
3639
+ campaign_info=campaign_info)
3309
3640
 
3310
3641
  # Run extraction for this campaign
3311
3642
  campaign_metrics = extractor.run(campaign_df)
@@ -3319,6 +3650,10 @@ def main() -> None:
3319
3650
  # Merge campaign metrics with lineage data
3320
3651
  campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
3321
3652
 
3653
+ # Rename aa_seq to protein_sequence for consistency
3654
+ if 'aa_seq' in campaign_final.columns:
3655
+ campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
3656
+
3322
3657
  # Save campaign-specific file immediately
3323
3658
  output_dir = args.output.parent
3324
3659
  base_name = args.output.stem
@@ -3335,6 +3670,10 @@ def main() -> None:
3335
3670
  # Still save an empty campaign file with lineage data
3336
3671
  campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
3337
3672
  if not campaign_lineage.empty:
3673
+ # Rename aa_seq to protein_sequence for consistency
3674
+ if 'aa_seq' in campaign_lineage.columns:
3675
+ campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
3676
+
3338
3677
  output_dir = args.output.parent
3339
3678
  base_name = args.output.stem
3340
3679
  campaign_file = output_dir / f"{base_name}_{campaign}.csv"
@@ -3365,6 +3704,11 @@ def main() -> None:
3365
3704
  df_final = df_metrics
3366
3705
  LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
3367
3706
 
3707
+ # Rename aa_seq to protein_sequence for consistency
3708
+ if df_final is not None and 'aa_seq' in df_final.columns:
3709
+ df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
3710
+ LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
3711
+
3368
3712
  df_final.to_csv(args.output, index=False)
3369
3713
  LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
3370
3714