debase 0.6.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -37,7 +37,16 @@ from dataclasses import dataclass, field
37
37
  from functools import lru_cache
38
38
  from pathlib import Path
39
39
  from textwrap import dedent
40
- from typing import Any, Dict, List, Optional, Tuple
40
+ from typing import Any, Dict, List, Optional, Tuple, Union
41
+
42
+ # Import universal caption pattern
43
+ try:
44
+ from .caption_pattern import get_universal_caption_pattern
45
+ from .campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
46
+ except ImportError:
47
+ # Fallback if running as standalone script
48
+ from caption_pattern import get_universal_caption_pattern
49
+ from campaign_utils import enhance_prompt_with_campaign, get_location_hints_for_campaign
41
50
 
42
51
  import fitz # PyMuPDF - for image extraction
43
52
  import google.generativeai as genai # type: ignore
@@ -451,13 +460,23 @@ Respond with a JSON array where each element contains:
451
460
  - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
452
461
 
453
462
  Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
463
+
464
+ IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
465
+
454
466
  Do not include too much sources, just return 2 or 3 sources.
455
467
  Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
456
468
  When returning confidence scores, be more accurate and avoid scores that are too close together.
469
+
470
+ CRITICAL:
471
+ - Return "location" EXACTLY as the first reference identifier appears in the actual caption text
472
+ - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
473
+ - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
474
+ - Include "document" field to specify which PDF contains this location: "manuscript" or "supplementary"
475
+
457
476
  Respond ONLY with **minified JSON**. NO markdown fences.
458
477
 
459
- Example:
460
- [{"location": "Table S1", "type": "table", "confidence": 95, "caption": "Table S1. Detailed information...", "reason": "Complete performance metrics", "lineage_hint": "first enzyme family", "campaign_clues": "PYS lineage, pyrrolidine synthesis"}]
478
+ Format:
479
+ [{"location": "", "type": "", "document": "", "confidence": 0, "caption": "", "reason": "", "lineage_hint": "", "campaign_clues": ""}]
461
480
  """)
462
481
 
463
482
  PROMPT_EXTRACT_METRICS = dedent("""
@@ -734,7 +753,8 @@ class ReactionExtractor:
734
753
  _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
735
754
 
736
755
  def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
737
- campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
756
+ campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None,
757
+ campaign_info: Optional[Dict[str, Any]] = None):
738
758
  self.manuscript = manuscript
739
759
  self.si = si
740
760
  self.cfg = cfg
@@ -742,6 +762,7 @@ class ReactionExtractor:
742
762
  self.debug_dir = debug_dir
743
763
  self.campaign_filter = campaign_filter # Filter for specific campaign
744
764
  self.all_campaigns = all_campaigns or [] # List of all campaigns for context
765
+ self.campaign_info = campaign_info # Detailed campaign information from campaigns.json
745
766
 
746
767
  # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
747
768
  self._figure_cache = LRUCache(maxsize=100) # Figures are large, so smaller cache
@@ -778,10 +799,8 @@ class ReactionExtractor:
778
799
  # ------------------------------------------------------------------
779
800
 
780
801
  def _collect_captions_and_titles(self) -> str:
781
- # Pattern to match Table or Figure with optional leading whitespace and page numbers
782
- # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
783
- # Also handles cases where there's whitespace or page numbers before the caption
784
- cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
802
+ # Use universal caption pattern that handles all common formats
803
+ cap_pattern = get_universal_caption_pattern()
785
804
  captions: List[str] = []
786
805
 
787
806
  # Process each page individually to avoid TOC entries
@@ -861,7 +880,27 @@ class ReactionExtractor:
861
880
 
862
881
  # Add campaign context - always provide context to help model understanding
863
882
  campaign_context = ""
864
- if self.campaign_filter:
883
+
884
+ # If we have detailed campaign info, use it to provide specific guidance
885
+ if self.campaign_info:
886
+ location_hints = get_location_hints_for_campaign(self.campaign_info)
887
+ campaign_context = f"""
888
+ IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
889
+
890
+ Campaign Details:
891
+ - Name: {self.campaign_info.get('campaign_name', '')}
892
+ - Description: {self.campaign_info.get('description', '')}
893
+ - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
894
+ - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
895
+
896
+ KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
897
+ These locations are known to contain relevant data - prioritize them highly.
898
+
899
+ {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
900
+
901
+ CRITICAL: Only return locations that contain data for this specific campaign.
902
+ """
903
+ elif self.campaign_filter:
865
904
  campaigns_warning = ""
866
905
  if self.all_campaigns:
867
906
  campaigns_warning = f"""
@@ -1103,11 +1142,114 @@ class ReactionExtractor:
1103
1142
 
1104
1143
  return toc_score >= 2
1105
1144
 
1145
+ def _build_caption_index(self) -> Dict[str, Dict[str, Any]]:
1146
+ """Build an index of all captions for quick lookup."""
1147
+ if hasattr(self, '_caption_index'):
1148
+ return self._caption_index
1149
+
1150
+ cap_pattern = get_universal_caption_pattern()
1151
+ caption_index = {}
1152
+
1153
+ for idx, page in enumerate(self.all_pages):
1154
+ source = "manuscript" if idx < len(self.ms_pages) else "supplementary"
1155
+ page_num = idx + 1 if idx < len(self.ms_pages) else idx - len(self.ms_pages) + 1
1156
+
1157
+ for match in cap_pattern.finditer(page):
1158
+ caption_text = match.group(0).strip()
1159
+ # Extract a normalized key (e.g., "table 5", "figure 3")
1160
+ caption_lower = caption_text.lower()
1161
+
1162
+ # Store multiple access patterns for the same caption
1163
+ caption_info = {
1164
+ 'full_caption': caption_text,
1165
+ 'page_content': page,
1166
+ 'page_idx': idx,
1167
+ 'source': source,
1168
+ 'page_num': page_num,
1169
+ 'match_start': match.start()
1170
+ }
1171
+
1172
+ # Create multiple keys for flexible matching
1173
+ # Key 1: Full caption text (first 100 chars)
1174
+ key1 = caption_text[:100].lower().strip()
1175
+ caption_index[key1] = caption_info
1176
+
1177
+ # Key 2: Simplified reference (e.g., "table 5", "figure s3")
1178
+ ref_match = re.search(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', caption_lower)
1179
+ if ref_match:
1180
+ key2 = f"{ref_match.group(1)} {ref_match.group(2)}"
1181
+ caption_index[key2] = caption_info
1182
+
1183
+ # Also store with 's' prefix if in SI
1184
+ if source == "supplementary" and 's' not in key2:
1185
+ key3 = f"{ref_match.group(1)} s{ref_match.group(2)}"
1186
+ caption_index[key3] = caption_info
1187
+
1188
+ self._caption_index = caption_index
1189
+ return caption_index
1190
+
1106
1191
  def _page_with_reference(self, ref_id: str) -> Optional[str]:
1107
- for page in self.all_pages:
1108
- if ref_id.lower() in page.lower():
1109
- return page
1110
- return None
1192
+ """Find page(s) containing a reference using flexible matching."""
1193
+ caption_index = self._build_caption_index()
1194
+ ref_lower = ref_id.lower().strip()
1195
+
1196
+ # Try multiple matching strategies
1197
+ matches = []
1198
+
1199
+ # Strategy 1: Direct key lookup
1200
+ if ref_lower in caption_index:
1201
+ matches.append(caption_index[ref_lower])
1202
+
1203
+ # Strategy 2: Normalized reference lookup (e.g., "table 5", "figure s3")
1204
+ ref_match = re.match(r'(table|figure|fig|scheme)\s*s?(\d+[a-z]?)', ref_lower, re.I)
1205
+ if ref_match:
1206
+ ref_type, ref_num = ref_match.groups()
1207
+ if ref_type == 'fig':
1208
+ ref_type = 'figure'
1209
+
1210
+ # Try different key formats
1211
+ keys_to_try = [
1212
+ f"{ref_type} {ref_num}",
1213
+ f"{ref_type} s{ref_num}",
1214
+ f"table {ref_num}", # Sometimes figures are mislabeled
1215
+ f"fig {ref_num}",
1216
+ f"figure {ref_num}"
1217
+ ]
1218
+
1219
+ for key in keys_to_try:
1220
+ if key in caption_index and caption_index[key] not in matches:
1221
+ matches.append(caption_index[key])
1222
+
1223
+ # Strategy 3: Fuzzy matching on caption text
1224
+ if not matches:
1225
+ # Look for any caption containing the reference number
1226
+ for key, info in caption_index.items():
1227
+ if ref_match and ref_num in key and any(t in key for t in ['table', 'figure', 'fig', 'scheme']):
1228
+ if info not in matches:
1229
+ matches.append(info)
1230
+
1231
+ # Return results
1232
+ if not matches:
1233
+ LOGGER.warning(f"No matches found for reference '{ref_id}'")
1234
+ # Last resort: simple text search
1235
+ for page in self.all_pages:
1236
+ if ref_lower in page.lower():
1237
+ return page
1238
+ return None
1239
+
1240
+ # If single match, return it
1241
+ if len(matches) == 1:
1242
+ return matches[0]['page_content']
1243
+
1244
+ # Multiple matches: combine them with source annotations
1245
+ LOGGER.info(f"Found {len(matches)} potential matches for '{ref_id}'")
1246
+ combined_pages = []
1247
+ for match in matches:
1248
+ header = f"\n\n=== {match['source'].upper()} PAGE {match['page_num']} ===\n"
1249
+ header += f"Caption: {match['full_caption'][:200]}...\n"
1250
+ combined_pages.append(header + match['page_content'])
1251
+
1252
+ return "\n".join(combined_pages)
1111
1253
 
1112
1254
  # ---- Table text helper - now returns full page ----
1113
1255
  def _extract_table_context(self, ref: str) -> str:
@@ -1140,19 +1282,29 @@ class ReactionExtractor:
1140
1282
  return pix
1141
1283
 
1142
1284
  # ---- NEW: Page image helper for both figures and tables ----
1143
- def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
1285
+ def _extract_page_png(self, ref: str, extract_figure_only: bool = True, caption_hint: str = "", document_hint: str = "") -> Optional[str]:
1144
1286
  """Export the page containing the reference as PNG.
1145
1287
  If extract_figure_only=True, extracts just the figure above the caption.
1146
1288
  If False, extracts the entire page (useful for tables).
1147
- Returns a base64-encoded PNG or None."""
1148
- LOGGER.debug("_extract_page_png called with ref='%s', extract_figure_only=%s", ref, extract_figure_only)
1289
+ Returns a base64-encoded PNG or None.
1149
1290
 
1150
- # Check cache first
1151
- cache_key = f"{ref}_{extract_figure_only}"
1291
+ Args:
1292
+ ref: The reference string (e.g., "Fig. 3")
1293
+ extract_figure_only: Whether to extract just the figure or the entire page
1294
+ caption_hint: Optional caption text from location data to help find the exact figure
1295
+ document_hint: Optional hint about which document to search ("manuscript" or "supplementary")
1296
+ """
1297
+ LOGGER.info("_extract_page_png called with ref='%s', extract_figure_only=%s, caption_hint='%s', document_hint='%s'",
1298
+ ref, extract_figure_only, caption_hint[:50] + "..." if caption_hint else "EMPTY", document_hint)
1299
+
1300
+ # Check cache first - include document hint in key to avoid cross-document contamination
1301
+ cache_key = f"{ref}_{extract_figure_only}_{document_hint}" if document_hint else f"{ref}_{extract_figure_only}"
1152
1302
  cached_result = self._figure_cache.get(cache_key)
1153
1303
  if cached_result is not None:
1154
- LOGGER.debug("Using cached figure for %s", ref)
1304
+ LOGGER.info("Using cached figure for %s (cache key: %s)", ref, cache_key)
1155
1305
  return cached_result
1306
+ else:
1307
+ LOGGER.info("Cache miss for %s (cache key: %s)", ref, cache_key)
1156
1308
 
1157
1309
  # For table extraction, use multi-page approach
1158
1310
  if not extract_figure_only:
@@ -1162,13 +1314,28 @@ class ReactionExtractor:
1162
1314
  return self._extract_multiple_pages_png(pages_with_ref, ref)
1163
1315
  return None
1164
1316
 
1165
- # For figure extraction, search both documents for actual figure captions
1166
- docs = list(filter(None, [self.ms_doc, self.si_doc]))
1167
- LOGGER.debug("Searching for '%s' in %d documents", ref, len(docs))
1317
+ # For figure extraction, prioritize based on document hint
1318
+ if document_hint == "manuscript" and self.ms_doc:
1319
+ # Search manuscript first, then SI as fallback
1320
+ docs = list(filter(None, [self.ms_doc, self.si_doc]))
1321
+ LOGGER.info("Prioritizing manuscript document for '%s' (hint: %s)", ref, document_hint)
1322
+ LOGGER.info("Search order: 1) Manuscript, 2) SI (fallback)")
1323
+ elif document_hint == "supplementary" and self.si_doc:
1324
+ # Search SI first, then manuscript as fallback
1325
+ docs = list(filter(None, [self.si_doc, self.ms_doc]))
1326
+ LOGGER.info("Prioritizing supplementary document for '%s' (hint: %s)", ref, document_hint)
1327
+ LOGGER.info("Search order: 1) SI, 2) Manuscript (fallback)")
1328
+ else:
1329
+ # Default behavior - search both in order
1330
+ docs = list(filter(None, [self.ms_doc, self.si_doc]))
1331
+ LOGGER.info("Searching for '%s' in %d documents (no document hint)", ref, len(docs))
1332
+ LOGGER.info("Search order: 1) Manuscript, 2) SI (default order)")
1168
1333
 
1169
1334
  for doc_idx, doc in enumerate(docs):
1170
- doc_name = "MS" if doc_idx == 0 else "SI"
1171
- LOGGER.debug("Searching in %s document with %d pages", doc_name, doc.page_count)
1335
+ # Determine document name based on actual document, not position
1336
+ doc_name = "MS" if doc == self.ms_doc else "SI"
1337
+ LOGGER.info("Searching document %d/%d: %s (has %d pages)",
1338
+ doc_idx + 1, len(docs), doc_name, doc.page_count)
1172
1339
 
1173
1340
  for page_number in range(doc.page_count):
1174
1341
  page = doc.load_page(page_number)
@@ -1181,9 +1348,90 @@ class ReactionExtractor:
1181
1348
  LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
1182
1349
  continue
1183
1350
 
1351
+ # If we have a caption hint, try to find it using fuzzy matching
1352
+ if caption_hint:
1353
+ LOGGER.info("=== CAPTION HINT SEARCH ===")
1354
+ LOGGER.info("Caption hint provided: %s", caption_hint[:100])
1355
+ LOGGER.info("Searching in %s document, page %d", doc_name, page_number + 1)
1356
+ LOGGER.info("Page text length: %d chars", len(page_text))
1357
+
1358
+ # Check if caption exists in raw form
1359
+ if caption_hint[:50] in page_text:
1360
+ LOGGER.info("✓ Caption hint found in raw page text!")
1361
+ else:
1362
+ LOGGER.info("✗ Caption hint NOT found in raw page text")
1363
+
1364
+ # Normalize texts for better matching
1365
+ def normalize_for_matching(text):
1366
+ # Remove extra whitespace, normalize spaces around punctuation
1367
+ text = ' '.join(text.split())
1368
+ # Normalize different dash types
1369
+ text = text.replace('–', '-').replace('—', '-')
1370
+ return text
1371
+
1372
+ normalized_hint = normalize_for_matching(caption_hint[:100]) # Use first 100 chars
1373
+ normalized_page = normalize_for_matching(page_text)
1374
+
1375
+ # Try to find the caption using fuzzy matching
1376
+ best_match_pos = -1
1377
+ best_match_score = 0
1378
+ match_found = False
1379
+
1380
+ # Slide through the page text looking for best match
1381
+ hint_len = len(normalized_hint)
1382
+ for i in range(len(normalized_page) - hint_len + 1):
1383
+ snippet = normalized_page[i:i + hint_len]
1384
+ # Simple character-based similarity
1385
+ matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
1386
+ score = matches / hint_len
1387
+
1388
+ if score > best_match_score and score > 0.8: # 80% similarity threshold
1389
+ best_match_score = score
1390
+ best_match_pos = i
1391
+ match_found = True
1392
+
1393
+ if match_found and best_match_pos >= 0:
1394
+ LOGGER.info("Found caption match in %s document on page %d with %.1f%% similarity",
1395
+ doc_name, page_number + 1, best_match_score * 100)
1396
+
1397
+ # Instead of complex position mapping, just search for the beginning of the caption
1398
+ # Use the first 30 chars which should be unique enough
1399
+ search_text = caption_hint[:30].strip()
1400
+ LOGGER.info("Searching for caption text: '%s'", search_text)
1401
+ caption_instances = page.search_for(search_text)
1402
+ LOGGER.info("Found %d caption instances", len(caption_instances) if caption_instances else 0)
1403
+
1404
+ if caption_instances:
1405
+ cap_rect = caption_instances[0]
1406
+ caption_found = True
1407
+ # Extract figure above this caption
1408
+ if extract_figure_only:
1409
+ LOGGER.info("Extracting figure area including caption for %s from %s document", ref, doc_name)
1410
+ LOGGER.info("Caption found at rect: %s on page %d", cap_rect, page_number + 1)
1411
+ page_rect = page.rect
1412
+
1413
+ # Include the caption in the extraction
1414
+ # Add some padding below the caption to ensure we get the full text
1415
+ caption_padding = 30 # pixels below caption
1416
+ figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
1417
+ LOGGER.info("Page rect: %s, Figure rect including caption: %s", page_rect, figure_rect)
1418
+ mat = fitz.Matrix(5.0, 5.0)
1419
+ pix = page.get_pixmap(matrix=mat, clip=figure_rect)
1420
+ pix = self._ensure_rgb_pixmap(pix)
1421
+ img_bytes = pix.tobytes("png")
1422
+ img_b64 = b64encode(img_bytes).decode('utf-8')
1423
+ self._figure_cache.put(cache_key, img_b64)
1424
+ LOGGER.info("Successfully extracted figure using caption hint for %s from %s document, page %d",
1425
+ ref, doc_name, page_number + 1)
1426
+ return img_b64
1427
+ else:
1428
+ LOGGER.info("No fuzzy match found for caption hint on page %d (best score: %.1f%%)",
1429
+ page_number + 1, best_match_score * 100)
1430
+
1431
+ # If caption hint didn't work or wasn't provided, fall back to pattern matching
1184
1432
  # Look for figure caption pattern more flexibly
1185
1433
  # Normalize the reference to handle variations
1186
- figure_num = ref.replace('Figure', '').replace('figure', '').strip()
1434
+ figure_num = ref.replace('Figure', '').replace('figure', '').replace('Fig.', '').replace('Fig', '').strip()
1187
1435
 
1188
1436
  # Extract main figure number from subfigure (e.g., "1C" -> "1")
1189
1437
  main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1193,10 +1441,10 @@ class ReactionExtractor:
1193
1441
  main_figure_num = figure_num
1194
1442
 
1195
1443
  # Create a flexible pattern that handles various spacing and formatting
1196
- # This pattern looks for "Figure" (case insensitive) followed by optional spaces
1197
- # then the figure number, then any of: period, colon, space+capital letter, or end of line
1444
+ # This pattern looks for "Figure" or "Fig" (case insensitive) followed by optional spaces
1445
+ # then the figure number, then any of: period, colon, pipe, space+capital letter, or end of line
1198
1446
  # Also match at the beginning of a line to catch captions
1199
- flexible_pattern = rf"(?i)(?:^|\n)\s*figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
1447
+ flexible_pattern = rf"(?i)(?:^|\n)\s*(?:figure|fig\.?)\s*{re.escape(main_figure_num)}(?:\.|:|\||\s+\||(?=\s+[A-Z])|\s*$)"
1200
1448
 
1201
1449
  LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
1202
1450
  main_figure_num, flexible_pattern)
@@ -1252,8 +1500,8 @@ class ReactionExtractor:
1252
1500
  if text_instances:
1253
1501
  cap_rect = text_instances[0]
1254
1502
  caption_found = True
1255
- LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
1256
- ref, caption_text, remaining_text[:50])
1503
+ LOGGER.info("Found actual caption for %s in %s document on page %d: '%s' with following text: '%s...'",
1504
+ ref, doc_name, page_number + 1, caption_text, remaining_text[:50])
1257
1505
  break
1258
1506
 
1259
1507
  if not caption_found:
@@ -1274,18 +1522,18 @@ class ReactionExtractor:
1274
1522
  continue
1275
1523
 
1276
1524
  if extract_figure_only:
1277
- # Extract only the area above the caption (the actual figure)
1278
- # This excludes caption text and focuses on visual elements
1279
- LOGGER.info("Extracting figure area above caption for %s", ref)
1525
+ # Extract the figure area including the caption
1526
+ LOGGER.info("Extracting figure area including caption for %s", ref)
1280
1527
 
1281
1528
  # Get the page dimensions
1282
1529
  page_rect = page.rect
1283
1530
 
1284
- # Extract the area above the caption
1531
+ # Extract the area including the caption
1285
1532
  if cap_rect:
1286
- # Extract from top of page to top of caption
1287
- figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
1288
- LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
1533
+ # Extract from top of page to bottom of caption plus padding
1534
+ caption_padding = 30 # pixels below caption
1535
+ figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y1 + caption_padding)
1536
+ LOGGER.debug("Extracting figure area with caption: %s (caption ends at y=%f)", figure_rect, cap_rect.y1)
1289
1537
  else:
1290
1538
  # If no caption found, use top 80% of page
1291
1539
  figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
@@ -1419,7 +1667,7 @@ class ReactionExtractor:
1419
1667
  # Sort pages by document and page number
1420
1668
  pages.sort(key=lambda x: (id(x[0]), x[1]))
1421
1669
 
1422
- # Extract the range of pages including one page after
1670
+ # Extract the range of pages including one page after for tables
1423
1671
  all_images = []
1424
1672
  for i, (doc, page_num) in enumerate(pages):
1425
1673
  # Add the current page
@@ -1428,7 +1676,16 @@ class ReactionExtractor:
1428
1676
  pix = self._ensure_rgb_pixmap(pix)
1429
1677
  all_images.append(pix)
1430
1678
 
1431
- # Only extract the page containing the reference (removed next page logic)
1679
+ # Add the next page as well for tables (in case data continues)
1680
+ next_page_num = page_num + 1
1681
+ if next_page_num < doc.page_count:
1682
+ try:
1683
+ next_pix = doc.load_page(next_page_num).get_pixmap(matrix=mat)
1684
+ next_pix = self._ensure_rgb_pixmap(next_pix)
1685
+ all_images.append(next_pix)
1686
+ LOGGER.info("Including next page (%d) for table %s", next_page_num + 1, ref)
1687
+ except Exception as e:
1688
+ LOGGER.warning("Failed to extract next page %d for %s: %s", next_page_num + 1, ref, e)
1432
1689
 
1433
1690
  if not all_images:
1434
1691
  return None
@@ -1533,21 +1790,9 @@ class ReactionExtractor:
1533
1790
 
1534
1791
  def _validate_location_exists(self, ref: str) -> bool:
1535
1792
  """Verify that the referenced location actually exists in the document."""
1536
- # Search for the actual reference in both manuscript and SI documents
1537
- docs_to_check = [self.ms_doc]
1538
- if self.si_doc:
1539
- docs_to_check.append(self.si_doc)
1540
-
1541
- for doc in docs_to_check:
1542
- for page_num in range(len(doc)):
1543
- page = doc[page_num]
1544
- text = page.get_text()
1545
-
1546
- # Look for table references like "Table 1", "Table S1", etc.
1547
- if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
1548
- return True
1549
-
1550
- return False
1793
+ # Use the caption index to check if location exists
1794
+ result = self._page_with_reference(ref)
1795
+ return result is not None
1551
1796
 
1552
1797
  def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
1553
1798
  """Validate that the context contains meaningful content for extraction."""
@@ -1596,15 +1841,31 @@ class ReactionExtractor:
1596
1841
  LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
1597
1842
  return True
1598
1843
 
1599
- def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
1600
- """Extract performance metrics for multiple enzymes from the identified location in batch."""
1601
- LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", ref, len(enzyme_list))
1602
- ref_lc = ref.lower()
1844
+ def extract_metrics_batch(self, enzyme_list: List[str], ref: Union[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
1845
+ """Extract performance metrics for multiple enzymes from the identified location in batch.
1846
+
1847
+ Args:
1848
+ enzyme_list: List of enzyme names to extract metrics for
1849
+ ref: Either a string reference (e.g., "Fig. 3") or a location dict with 'location' and optionally 'caption'
1850
+ """
1851
+ # Handle both string and dict inputs
1852
+ if isinstance(ref, dict):
1853
+ location_str = ref['location']
1854
+ caption_hint = ref.get('caption', '')
1855
+ document_hint = ref.get('document', '')
1856
+ LOGGER.info("extract_metrics_batch called with location='%s' (with caption hint, document=%s) for %d enzymes",
1857
+ location_str, document_hint, len(enzyme_list))
1858
+ else:
1859
+ location_str = ref
1860
+ caption_hint = ''
1861
+ document_hint = ''
1862
+ LOGGER.info("extract_metrics_batch called with ref='%s' for %d enzymes", location_str, len(enzyme_list))
1863
+ ref_lc = location_str.lower()
1603
1864
  image_b64: Optional[str] = None
1604
1865
 
1605
1866
  # First, validate that the location actually exists in the document
1606
- if not self._validate_location_exists(ref):
1607
- LOGGER.warning("Location %s not found in document - skipping", ref)
1867
+ if not self._validate_location_exists(location_str):
1868
+ LOGGER.warning("Location %s not found in document - skipping", location_str)
1608
1869
  return []
1609
1870
 
1610
1871
  # Add campaign context if available
@@ -1614,30 +1875,30 @@ class ReactionExtractor:
1614
1875
 
1615
1876
  if self._TAB_RE.search(ref_lc):
1616
1877
  # For tables, try to extract the page as an image first
1617
- image_b64 = self._extract_page_png(ref, extract_figure_only=False)
1878
+ image_b64 = self._extract_page_png(location_str, extract_figure_only=False, document_hint=document_hint)
1618
1879
  if not image_b64:
1619
- LOGGER.debug("No page image found for %s - using full page text", ref)
1620
- snippet = self._extract_table_context(ref)
1880
+ LOGGER.debug("No page image found for %s - using full page text", location_str)
1881
+ snippet = self._extract_table_context(location_str)
1621
1882
  elif self._FIG_RE.search(ref_lc):
1622
1883
  # For figures, extract just the figure image (same logic as compound mapping)
1623
- LOGGER.debug("Attempting to extract figure image for '%s'", ref)
1624
- image_b64 = self._extract_page_png(ref, extract_figure_only=True)
1884
+ LOGGER.info("Attempting to extract figure image for '%s'", location_str)
1885
+ image_b64 = self._extract_page_png(location_str, extract_figure_only=True, caption_hint=caption_hint, document_hint=document_hint)
1625
1886
  if not image_b64:
1626
- LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", ref)
1627
- snippet = self._extract_figure_caption(ref)
1887
+ LOGGER.warning("Failed to extract figure image for '%s' - falling back to caption text", location_str)
1888
+ snippet = self._extract_figure_caption(location_str)
1628
1889
  LOGGER.debug("Caption extraction result: %s",
1629
1890
  f"'{snippet[:100]}...'" if snippet else "empty")
1630
1891
  else:
1631
- LOGGER.info("Successfully extracted figure image for '%s'", ref)
1892
+ LOGGER.info("Successfully extracted figure image for '%s'", location_str)
1632
1893
  # If figure is found, ignore text information - use image only
1633
1894
  snippet = ""
1634
1895
  else:
1635
- snippet = self._page_with_reference(ref) or ""
1896
+ snippet = self._page_with_reference(location_str) or ""
1636
1897
 
1637
1898
  # For figures with images, skip text validation and proceed with image extraction
1638
1899
  if image_b64 and self._FIG_RE.search(ref_lc):
1639
- LOGGER.info("Using figure image for %s - ignoring text context", ref)
1640
- elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
1900
+ LOGGER.info("Using figure image for %s - ignoring text context", location_str)
1901
+ elif not image_b64 and not self._validate_context(snippet, enzyme_list, location_str):
1641
1902
  return []
1642
1903
 
1643
1904
  # Create enhanced enzyme descriptions with parent/mutation context
@@ -1934,6 +2195,19 @@ Different campaigns may use different model reactions.
1934
2195
  prompt = PROMPT_COMPOUND_MAPPING
1935
2196
  if campaign_filter:
1936
2197
  prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
2198
+
2199
+ # Add campaign info as hints
2200
+ if self.campaign_info:
2201
+ substrate_id = self.campaign_info.get('substrate_id', '')
2202
+ product_id = self.campaign_info.get('product_id', '')
2203
+ model_substrate = self.campaign_info.get('model_substrate', '')
2204
+ model_product = self.campaign_info.get('model_product', '')
2205
+
2206
+ if substrate_id and model_substrate:
2207
+ prompt += f"\n\nHINT: The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})"
2208
+ if product_id and model_product:
2209
+ prompt += f"\nHINT: The model product for this campaign is likely '{model_product}' (ID: {product_id})"
2210
+
1937
2211
  if compound_ids:
1938
2212
  prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
1939
2213
  prompt += "\n\nTEXT:\n" + extraction_text
@@ -2011,6 +2285,22 @@ IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campa
2011
2285
  {campaigns_warning}
2012
2286
  Different campaigns may use different numbering systems for compounds.
2013
2287
  Do NOT include compound information from other campaigns."""
2288
+
2289
+ # Add campaign info as hints
2290
+ if self.campaign_info:
2291
+ substrate_id = self.campaign_info.get('substrate_id', '')
2292
+ product_id = self.campaign_info.get('product_id', '')
2293
+ model_substrate = self.campaign_info.get('model_substrate', '')
2294
+ model_product = self.campaign_info.get('model_product', '')
2295
+
2296
+ hints = []
2297
+ if substrate_id and model_substrate:
2298
+ hints.append(f"The model substrate for this campaign is likely '{model_substrate}' (ID: {substrate_id})")
2299
+ if product_id and model_product:
2300
+ hints.append(f"The model product for this campaign is likely '{model_product}' (ID: {product_id})")
2301
+
2302
+ if hints:
2303
+ prompt += "\n\nHINTS FROM CAMPAIGN INFO:\n" + "\n".join(hints)
2014
2304
 
2015
2305
  prompt += """
2016
2306
 
@@ -2476,6 +2766,7 @@ Do NOT include compound information from other campaigns.
2476
2766
  compound_mappings = {}
2477
2767
  if compound_ids:
2478
2768
  LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
2769
+
2479
2770
  # Pass the IUPAC location hint if we have it
2480
2771
  iupac_hint = locations.get("iupac_location") if locations else None
2481
2772
  compound_mappings = self._extract_compound_mappings_adaptive(
@@ -2506,6 +2797,22 @@ CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely differ
2506
2797
  - Different campaigns may use similar enzyme names but different substrates
2507
2798
  - Be extremely careful to only extract data for the {self.campaign_filter} campaign
2508
2799
  - Ignore data from other campaigns even if they seem similar
2800
+ """
2801
+
2802
+ # Add specific campaign info if available
2803
+ campaign_info_context = ""
2804
+ if self.campaign_info:
2805
+ campaign_info_context = f"""
2806
+
2807
+ KNOWN CAMPAIGN INFORMATION:
2808
+ - Campaign: {self.campaign_info.get('campaign_name', '')}
2809
+ - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
2810
+ - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
2811
+ - Known Data Locations: {', '.join(self.campaign_info.get('data_locations', []))}
2812
+
2813
+ IMPORTANT: Use this information to guide your extraction. The model reaction should involve:
2814
+ - Substrate ID: {self.campaign_info.get('substrate_id', '')}
2815
+ - Product ID: {self.campaign_info.get('product_id', '')}
2509
2816
  """
2510
2817
 
2511
2818
  campaign_context = f"""
@@ -2515,6 +2822,7 @@ You are extracting the model reaction used specifically for these enzyme variant
2515
2822
 
2516
2823
  These variants belong to campaign: {self.campaign_filter}
2517
2824
  {campaigns_context}
2825
+ {campaign_info_context}
2518
2826
  Focus on extracting the model reaction that was used to evaluate THESE specific variants.
2519
2827
  Different campaigns may use different model reactions and substrates.
2520
2828
 
@@ -2790,7 +3098,7 @@ Different campaigns may use different model reactions and substrates.
2790
3098
  best_location.get('confidence', 0))
2791
3099
 
2792
3100
  # Extract metrics from the most confident source only
2793
- metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
3101
+ metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location)
2794
3102
 
2795
3103
  # Filter to valid metrics
2796
3104
  valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
@@ -3034,7 +3342,7 @@ Different campaigns may use different model reactions and substrates.
3034
3342
  location['location'], location.get('confidence', 0))
3035
3343
 
3036
3344
  # Extract metrics from this location
3037
- metrics_rows = self.extract_metrics_batch(list(all_variants), location['location'])
3345
+ metrics_rows = self.extract_metrics_batch(list(all_variants), location)
3038
3346
 
3039
3347
  # Filter to valid metrics
3040
3348
  valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
@@ -3277,8 +3585,20 @@ def main() -> None:
3277
3585
  campaign_debug_dir.mkdir(parents=True, exist_ok=True)
3278
3586
  LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
3279
3587
 
3588
+ # Load campaign info from campaigns.json if available
3589
+ campaign_info = None
3590
+ if args.debug_dir:
3591
+ from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
3592
+ campaigns_file = Path(args.debug_dir) / "campaigns.json"
3593
+ if campaigns_file.exists():
3594
+ campaigns = load_campaigns_from_file(campaigns_file)
3595
+ campaign_info = find_campaign_by_id(campaigns, campaign_filter)
3596
+ if campaign_info:
3597
+ LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign_filter)
3598
+
3280
3599
  extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
3281
- campaign_filter=campaign_filter, all_campaigns=all_campaigns)
3600
+ campaign_filter=campaign_filter, all_campaigns=all_campaigns,
3601
+ campaign_info=campaign_info)
3282
3602
  df_metrics = extractor.run(enzyme_df)
3283
3603
 
3284
3604
  elif len(all_campaigns) > 1:
@@ -3304,8 +3624,20 @@ def main() -> None:
3304
3624
  campaign_debug_dir.mkdir(parents=True, exist_ok=True)
3305
3625
  LOGGER.info("Campaign debug directory: %s", campaign_debug_dir)
3306
3626
 
3627
+ # Load campaign info from campaigns.json if available
3628
+ campaign_info = None
3629
+ if args.debug_dir:
3630
+ from .campaign_utils import load_campaigns_from_file, find_campaign_by_id
3631
+ campaigns_file = Path(args.debug_dir) / "campaigns.json"
3632
+ if campaigns_file.exists():
3633
+ campaigns = load_campaigns_from_file(campaigns_file)
3634
+ campaign_info = find_campaign_by_id(campaigns, campaign)
3635
+ if campaign_info:
3636
+ LOGGER.info("Loaded campaign info for %s from campaigns.json", campaign)
3637
+
3307
3638
  extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=campaign_debug_dir,
3308
- campaign_filter=campaign, all_campaigns=all_campaigns)
3639
+ campaign_filter=campaign, all_campaigns=all_campaigns,
3640
+ campaign_info=campaign_info)
3309
3641
 
3310
3642
  # Run extraction for this campaign
3311
3643
  campaign_metrics = extractor.run(campaign_df)