debase 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only; it reflects the changes between package versions as they appear in their respective public registries.
@@ -83,6 +83,7 @@ class ScopeEntry:
83
83
  # Metadata
84
84
  data_location: Optional[str] = None
85
85
  data_source_type: Dict[str, str] = field(default_factory=dict)
86
+ campaign_id: Optional[str] = None
86
87
 
87
88
  # Lineage information (populated during merge)
88
89
  parent_id: Optional[str] = None
@@ -312,24 +313,23 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
312
313
  # Extract just the figure with its caption, avoiding excessive white space
313
314
  page_rect = page.rect
314
315
 
315
- # Calculate the figure region on current page only
316
- # Extract from top of page to just below the caption
316
+ # Extract the entire page containing the identified location
317
317
  fig_top = 0 # Start from top of page
318
- fig_bottom = min(caption_rect.y0 + 200, page_rect.height) # 200px below caption, but not more than page height
318
+ fig_bottom = page_rect.height # Full page height
319
319
  fig_left = 0 # Full width
320
320
  fig_right = page_rect.width
321
321
 
322
- # Extract only the figure region (no additional pages to avoid white space)
322
+ # Extract the entire page
323
323
  clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
324
324
  mat = fitz.Matrix(2, 2) # 2x zoom for better quality
325
325
  pix = page.get_pixmap(clip=clip_rect, matrix=mat)
326
326
 
327
- log.info("Extracted figure region: %.0fx%.0f pixels from page %d",
327
+ log.info("Extracted entire page: %.0fx%.0f pixels from page %d",
328
328
  pix.width, pix.height, page_num + 1)
329
329
 
330
330
  # Convert to PNG
331
331
  img_bytes = pix.tobytes("png")
332
- log.info("Extracted figure region: %dx%d pixels from page %d",
332
+ log.info("Converted to PNG: %dx%d pixels from page %d",
333
333
  pix.width, pix.height, page_num + 1)
334
334
 
335
335
  return b64encode(img_bytes).decode()
@@ -974,25 +974,73 @@ Return as JSON:
974
974
 
975
975
  # ---- 6.2 Helper functions -------------------------------------------------
976
976
 
977
- def identify_scope_locations(
977
+
978
+
979
+ def identify_scope_locations_for_campaign(
978
980
  text: str,
979
981
  model,
982
+ campaign_id: str,
983
+ enzyme_ids: List[str],
980
984
  *,
981
985
  max_results: int = 5,
982
986
  debug_dir: str | Path | None = None,
983
987
  ) -> List[dict]:
984
- """Ask Gemini where substrate scope data is located."""
985
- prompt = _SCOPE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + text[:15_000]
988
+ """Ask Gemini where substrate scope data is located for a specific campaign."""
989
+
990
+ # Simple model reaction context
991
+ model_reactions_context = """
992
+ IMPORTANT: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
993
+ Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
994
+ """
995
+
996
+ # Create campaign-specific prompt
997
+ campaign_prompt = f"""
998
+ You are an expert reader of biocatalysis manuscripts.
999
+ Analyze this paper and identify all locations containing substrate scope data for the specific campaign: "{campaign_id}".
1000
+
1001
+ CAMPAIGN CONTEXT:
1002
+ - Campaign ID: {campaign_id}
1003
+ - Target enzymes: {', '.join(enzyme_ids)}
1004
+
1005
+ {model_reactions_context}
1006
+
1007
+ Your task is to:
1008
+ 1. Identify locations (tables, figures, text) containing substrate scope reaction data specifically for this campaign
1009
+ 2. Focus only on substrate scope studies involving the enzymes: {', '.join(enzyme_ids)}
1010
+ 3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
1011
+ - Model reactions are those used to evolve/optimize the enzymes
1012
+ - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
1013
+ 4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
1014
+ 5. Determine which enzyme variants from this campaign were tested in substrate scope studies
1015
+
1016
+ Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
1017
+ [
1018
+ {{
1019
+ "location": "Description of where the data is found",
1020
+ "type": "table|figure|text",
1021
+ "confidence": 0.0-1.0,
1022
+ "enzyme_variants": ["list of enzyme IDs found"],
1023
+ "substrates_tested": ["list of substrates if identifiable"],
1024
+ "campaign_match": true/false,
1025
+ "is_substrate_scope": true/false,
1026
+ "model_reaction_excluded": "reason why this is not a model reaction"
1027
+ }}
1028
+ ]
1029
+
1030
+ Important: Only return locations that contain TRUE substrate scope data (not model reactions) for the specified campaign and enzymes. If no substrate scope data exists for this campaign, return an empty array.
1031
+ """
1032
+
1033
+ prompt = campaign_prompt + "\n\nTEXT:\n" + text[:15_000]
986
1034
  locs: List[dict] = []
987
1035
  try:
988
1036
  locs = generate_json_with_retry(
989
1037
  model,
990
1038
  prompt,
991
1039
  debug_dir=debug_dir,
992
- tag="scope_locate",
1040
+ tag=f"scope_locate_{campaign_id}",
993
1041
  )
994
1042
  except Exception as exc: # pragma: no cover
995
- log.warning("identify_scope_locations(): %s", exc)
1043
+ log.warning("identify_scope_locations_for_campaign(%s): %s", campaign_id, exc)
996
1044
  return locs if isinstance(locs, list) else []
997
1045
 
998
1046
  def identify_iupac_sections(
@@ -1679,16 +1727,18 @@ def extract_compound_mappings(
1679
1727
  log.info("Total compound mappings extracted: %d", len(mappings))
1680
1728
  return mappings
1681
1729
 
1682
- def extract_all_substrate_scope_data(
1730
+ def extract_substrate_scope_entries_for_campaign(
1683
1731
  text: str,
1684
1732
  model,
1685
1733
  locations: List[dict],
1734
+ campaign_id: str,
1735
+ enzyme_ids: List[str],
1686
1736
  *,
1687
1737
  pdf_paths: List[Path] = None,
1688
- figure_images: Dict[str, str] = None,
1689
1738
  debug_dir: str | Path | None = None,
1690
1739
  ) -> List[dict]:
1691
- """Extract all substrate scope data at once from all primary sources."""
1740
+ """Extract substrate scope data specifically for a campaign."""
1741
+
1692
1742
  extraction_hints = ""
1693
1743
  all_refs = []
1694
1744
 
@@ -1700,124 +1750,189 @@ def extract_all_substrate_scope_data(
1700
1750
  location_strs.append(loc_str)
1701
1751
  all_refs.append(loc_str)
1702
1752
 
1703
- extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
1704
-
1705
- # Collect all enzyme variants
1706
- all_variants = []
1707
- for loc in locations:
1708
- variants = loc.get('enzyme_variants_tested', [])
1709
- all_variants.extend(variants)
1753
+ extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"
1710
1754
 
1711
- if all_variants:
1712
- unique_variants = list(set(all_variants))
1713
- extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
1755
+ # Focus on campaign-specific enzyme variants
1756
+ extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
1714
1757
 
1715
- # Extract text from ALL identified locations
1758
+ # Extract text from ALL identified locations (like the original function did)
1716
1759
  extraction_texts = []
1760
+ figure_images = {}
1717
1761
 
1718
1762
  for ref in all_refs:
1719
1763
  if ref and pdf_paths:
1720
1764
  ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
1721
1765
  if ref_text:
1722
- # Add figure image notation if available
1723
- if figure_images and ref in figure_images:
1724
- ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
1725
1766
  extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
1767
+
1768
+ # Extract figure images for this reference (crop page around figure)
1769
+ try:
1770
+ fig_base64 = extract_figure_image(pdf_paths, ref)
1771
+ if fig_base64:
1772
+ figure_images[ref] = fig_base64
1773
+ log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
1774
+
1775
+ # Save the figure image to debug folder
1776
+ if debug_dir:
1777
+ debug_path = Path(debug_dir)
1778
+ debug_path.mkdir(parents=True, exist_ok=True)
1779
+ # Clean ref for filename
1780
+ safe_ref = re.sub(r'[^\w\s-]', '', ref).strip().replace(' ', '_')
1781
+ image_file = debug_path / f"figure_{safe_ref}_{campaign_id}.png"
1782
+
1783
+ # Decode and save the image
1784
+ import base64
1785
+ with open(image_file, 'wb') as f:
1786
+ f.write(base64.b64decode(fig_base64))
1787
+ log.info("Campaign %s - saved figure image to %s", campaign_id, image_file)
1788
+ except Exception as e:
1789
+ log.warning("Campaign %s - failed to extract figure for %s: %s", campaign_id, ref, e)
1726
1790
 
1727
1791
  if not extraction_texts:
1728
1792
  extraction_texts = [text[:50_000]]
1729
1793
 
1730
1794
  extraction_text = "\n\n".join(extraction_texts)
1731
1795
 
1732
- prompt = _SUBSTRATE_SCOPE_PROMPT.format(extraction_hints=extraction_hints)
1733
- prompt += "\n\nTEXT:\n" + extraction_text
1734
-
1735
- # Prepare multimodal content with images
1736
- content_parts = [prompt]
1796
+ # Simple model reaction context
1797
+ model_reactions_context = """
1798
+ CRITICAL: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
1799
+ Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
1800
+ """
1737
1801
 
1738
- # Add figure images to the prompt
1739
- if figure_images:
1740
- import PIL.Image
1741
- import io
1742
- import base64
1743
-
1744
- for fig_ref, fig_base64 in figure_images.items():
1745
- try:
1746
- # Convert base64 to PIL Image
1747
- img_bytes = base64.b64decode(fig_base64)
1748
- image = PIL.Image.open(io.BytesIO(img_bytes))
1749
- content_parts.append(f"\n[Figure: {fig_ref}]")
1750
- content_parts.append(image)
1751
- log.info("Added figure %s to multimodal prompt", fig_ref)
1752
- except Exception as e:
1753
- log.warning("Failed to add figure %s: %s", fig_ref, e)
1802
+ # Create campaign-specific prompt
1803
+ campaign_prompt = f"""
1804
+ You are an expert reader of biocatalysis manuscripts.
1805
+ Extract ALL substrate scope reaction data specifically for campaign: "{campaign_id}".
1806
+
1807
+ CAMPAIGN CONTEXT:
1808
+ - Campaign ID: {campaign_id}
1809
+ - Target enzymes: {', '.join(enzyme_ids)}
1810
+
1811
+ {model_reactions_context}
1812
+
1813
+ IMPORTANT INSTRUCTIONS:
1814
+ 1. Focus ONLY on substrate scope data for the specified campaign and enzymes
1815
+ 2. Extract reactions involving enzymes: {', '.join(enzyme_ids)}
1816
+ 3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
1817
+ - Model reactions are those used to evolve/optimize the enzymes (listed above)
1818
+ - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
1819
+ - DO NOT include model reactions in substrate scope data
1820
+ 4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
1821
+ 5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
1822
+
1823
+ {extraction_hints}
1824
+
1825
+ Return your analysis as JSON in this format:
1826
+ {{
1827
+ "substrate_scope_data": [
1828
+ {{
1829
+ "enzyme_id": "enzyme identifier",
1830
+ "substrate_ids": ["substrate identifiers"],
1831
+ "product_ids": ["product identifiers"],
1832
+ "substrate_names": ["substrate names"],
1833
+ "product_names": ["product names"],
1834
+ "yield_percent": number or null,
1835
+ "ee": number or null,
1836
+ "ttn": number or null,
1837
+ "temperature": "temperature" or null,
1838
+ "ph": "pH" or null,
1839
+ "buffer": "buffer" or null,
1840
+ "substrate_concentration": "concentration" or null,
1841
+ "data_location": "where this data was found",
1842
+ "campaign_id": "{campaign_id}",
1843
+ "is_substrate_scope": true,
1844
+ "model_reaction_excluded": "reason why this is not a model reaction"
1845
+ }}
1846
+ ]
1847
+ }}
1848
+
1849
+ Important: Only return TRUE substrate scope data (not model reactions) for the specified campaign. If no substrate scope data exists for this campaign, return {{"substrate_scope_data": []}}.
1850
+ """
1754
1851
 
1755
1852
  try:
1756
- # Use multimodal content if we have images
1757
- if len(content_parts) > 1:
1758
- # Log multimodal API call
1759
- log.info("=== GEMINI MULTIMODAL API CALL: SUBSTRATE_SCOPE_WITH_FIGURES ===")
1760
- log.info("Text prompt length: %d characters", len(prompt))
1761
- log.info("Number of images: %d", len(content_parts) - 1)
1762
- log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
1853
+ # Use multimodal extraction if we have figure images
1854
+ if figure_images:
1855
+ log.info("Campaign %s - using multimodal extraction with %d figure images", campaign_id, len(figure_images))
1763
1856
 
1764
- # Save prompt and image info to debug directory
1857
+ # Prepare multimodal content
1858
+ import PIL.Image
1859
+ import io
1860
+ import base64
1861
+
1862
+ content_parts = [campaign_prompt + "\n\nTEXT:\n" + extraction_text]
1863
+
1864
+ for fig_ref, fig_base64 in figure_images.items():
1865
+ try:
1866
+ # Convert base64 to PIL Image
1867
+ img_bytes = base64.b64decode(fig_base64)
1868
+ image = PIL.Image.open(io.BytesIO(img_bytes))
1869
+ content_parts.append(f"\n[Figure: {fig_ref}]")
1870
+ content_parts.append(image)
1871
+ log.info("Campaign %s - added figure %s to multimodal prompt", campaign_id, fig_ref)
1872
+ except Exception as e:
1873
+ log.warning("Campaign %s - failed to add figure %s: %s", campaign_id, fig_ref, e)
1874
+
1875
+ # Save debug info
1765
1876
  if debug_dir:
1766
1877
  debug_path = Path(debug_dir)
1767
1878
  debug_path.mkdir(parents=True, exist_ok=True)
1768
- prompt_file = debug_path / f"substrate_scope_multimodal_prompt_{int(time.time())}.txt"
1879
+ prompt_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_prompt.txt"
1769
1880
 
1770
- # Build prompt info including image references
1771
- prompt_info = f"=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
1881
+ prompt_info = f"=== CAMPAIGN {campaign_id} MULTIMODAL PROMPT ===\n"
1772
1882
  prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
1773
- prompt_info += f"Text length: {len(prompt)} characters\n"
1774
- prompt_info += f"Images included: {len(content_parts) - 1}\n"
1883
+ prompt_info += f"Text length: {len(extraction_text)} characters\n"
1884
+ prompt_info += f"Images included: {len(figure_images)}\n"
1775
1885
  for fig_ref in figure_images.keys():
1776
1886
  prompt_info += f" - {fig_ref}\n"
1777
1887
  prompt_info += "="*80 + "\n\n"
1778
- prompt_info += prompt
1888
+ prompt_info += campaign_prompt + "\n\nTEXT:\n" + extraction_text
1779
1889
 
1780
- _dump(prompt_info, prompt_file)
1781
- log.info("Full prompt saved to: %s", prompt_file)
1890
+ with open(prompt_file, 'w') as f:
1891
+ f.write(prompt_info)
1892
+ log.info("Campaign %s - prompt saved to: %s", campaign_id, prompt_file)
1782
1893
 
1783
- log.info("Calling Gemini Multimodal API...")
1894
+ # Call multimodal API
1784
1895
  response = model.generate_content(content_parts)
1785
- raw_text = _extract_text(response).strip()
1786
-
1787
- # Log and save response
1788
- log.info("Gemini multimodal response length: %d characters", len(raw_text))
1789
- log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
1896
+ raw_text = response.text.strip()
1790
1897
 
1898
+ # Save response
1791
1899
  if debug_dir:
1792
- debug_path = Path(debug_dir)
1793
- response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
1900
+ response_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_response.txt"
1794
1901
  with open(response_file, 'w') as f:
1795
- f.write(f"=== RESPONSE FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n")
1902
+ f.write(f"=== CAMPAIGN {campaign_id} MULTIMODAL RESPONSE ===\n")
1796
1903
  f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
1797
1904
  f.write(f"Length: {len(raw_text)} characters\n")
1798
1905
  f.write("="*80 + "\n\n")
1799
1906
  f.write(raw_text)
1800
- log.info("Full response saved to: %s", response_file)
1907
+ log.info("Campaign %s - response saved to: %s", campaign_id, response_file)
1801
1908
 
1802
- # Parse JSON from response
1909
+ # Parse JSON
1803
1910
  import json
1804
1911
  data = json.loads(raw_text.strip('```json').strip('```').strip())
1805
1912
  else:
1913
+ log.info("Campaign %s - using text-only extraction", campaign_id)
1806
1914
  data = generate_json_with_retry(
1807
1915
  model,
1808
- prompt,
1916
+ campaign_prompt + "\n\nTEXT:\n" + extraction_text,
1809
1917
  debug_dir=debug_dir,
1810
- tag="substrate_scope",
1918
+ tag=f"substrate_scope_{campaign_id}",
1811
1919
  )
1812
1920
 
1813
1921
  scope_data = data.get("substrate_scope_data", [])
1814
- log.info("Extracted %d substrate scope entries", len(scope_data))
1922
+
1923
+ # Add campaign_id to each entry if not present
1924
+ for entry in scope_data:
1925
+ if "campaign_id" not in entry:
1926
+ entry["campaign_id"] = campaign_id
1927
+
1928
+ log.info("Campaign %s - extracted %d substrate scope entries", campaign_id, len(scope_data))
1815
1929
  return scope_data
1816
1930
 
1817
1931
  except Exception as exc:
1818
- log.error("Failed to extract substrate scope data: %s", exc)
1932
+ log.error("Failed to extract substrate scope data for campaign %s: %s", campaign_id, exc)
1819
1933
  return []
1820
1934
 
1935
+
1821
1936
  def _extract_single_reaction(
1822
1937
  text: str,
1823
1938
  model,
@@ -1871,7 +1986,7 @@ def _extract_single_reaction(
1871
1986
  log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
1872
1987
  return None
1873
1988
 
1874
- def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
1989
+ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping], campaign_id: Optional[str] = None) -> List[ScopeEntry]:
1875
1990
  """Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
1876
1991
  entries: List[ScopeEntry] = []
1877
1992
 
@@ -1980,6 +2095,7 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
1980
2095
  conditions=conditions,
1981
2096
  data_location=item.get("data_location", ""),
1982
2097
  data_source_type={"all": "text/figure"},
2098
+ campaign_id=campaign_id or item.get("campaign_id", ""),
1983
2099
  notes=item.get("notes", "")
1984
2100
  )
1985
2101
 
@@ -2010,7 +2126,10 @@ def get_substrate_scope(
2010
2126
  5. Extract individual reactions with context
2011
2127
  """
2012
2128
  # Step 1: Find locations using captions
2013
- locations = identify_scope_locations(caption_text, model, debug_dir=debug_dir)
2129
+ # For backward compatibility, use campaign-specific function with generic parameters
2130
+ locations = identify_scope_locations_for_campaign(
2131
+ caption_text, model, "general", ["all"], debug_dir=debug_dir
2132
+ )
2014
2133
  if locations:
2015
2134
  location_summary = []
2016
2135
  for loc in locations[:3]:
@@ -2071,10 +2190,13 @@ def get_substrate_scope(
2071
2190
  log.warning("Failed to extract %s image for %s", location_type, figure_ref)
2072
2191
 
2073
2192
  # Extract all substrate scope data in one call
2074
- raw_entries = extract_all_substrate_scope_data(
2193
+ # Note: This function is now deprecated in favor of campaign-specific extraction
2194
+ # For backward compatibility, we'll use a generic campaign approach
2195
+ raw_entries = extract_substrate_scope_entries_for_campaign(
2075
2196
  full_text, model, locations,
2197
+ campaign_id="general",
2198
+ enzyme_ids=["all"],
2076
2199
  pdf_paths=pdf_paths,
2077
- figure_images=figure_images,
2078
2200
  debug_dir=debug_dir
2079
2201
  )
2080
2202
 
@@ -2118,6 +2240,96 @@ def get_substrate_scope(
2118
2240
 
2119
2241
  return entries
2120
2242
 
2243
+
2244
+ def get_substrate_scope_for_campaign(
2245
+ caption_text: str,
2246
+ full_text: str,
2247
+ model,
2248
+ *,
2249
+ campaign_id: str,
2250
+ enzyme_ids: List[str],
2251
+ pdf_paths: Optional[List[Path]] = None,
2252
+ debug_dir: str | Path | None = None,
2253
+ ) -> List[ScopeEntry]:
2254
+ """
2255
+ Campaign-specific substrate scope extraction.
2256
+
2257
+ Like get_substrate_scope but focuses on a specific campaign and its enzymes.
2258
+ Tells Gemini about the specific campaign and that it's okay to return null if
2259
+ no substrate scope data exists for this campaign.
2260
+ """
2261
+ log.info("Starting campaign-specific substrate scope extraction for: %s", campaign_id)
2262
+ log.info("Target enzymes: %s", enzyme_ids)
2263
+
2264
+ # Step 1: Find locations using captions with campaign context
2265
+ locations = identify_scope_locations_for_campaign(
2266
+ caption_text, model, campaign_id, enzyme_ids, debug_dir=debug_dir
2267
+ )
2268
+
2269
+ if not locations:
2270
+ log.info("No substrate scope locations identified for campaign %s", campaign_id)
2271
+ return []
2272
+
2273
+ location_summary = []
2274
+ for loc in locations[:3]:
2275
+ location_summary.append(
2276
+ f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
2277
+ f"confidence: {loc.get('confidence', 0)})"
2278
+ )
2279
+ log.info("Campaign %s - identified %d substrate scope locations: %s",
2280
+ campaign_id, len(locations), ", ".join(location_summary))
2281
+
2282
+ # Step 2: Identify IUPAC sections from SI TOC (reuse existing logic)
2283
+ iupac_sections = identify_iupac_sections(caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2284
+ log.info("Campaign %s - identified %d IUPAC sections", campaign_id, len(iupac_sections))
2285
+
2286
+ # Step 3: Extract raw entries with campaign context
2287
+ raw_entries = extract_substrate_scope_entries_for_campaign(
2288
+ full_text, model, locations, campaign_id, enzyme_ids,
2289
+ pdf_paths=pdf_paths, debug_dir=debug_dir
2290
+ )
2291
+
2292
+ if not raw_entries:
2293
+ log.info("No substrate scope entries extracted for campaign %s", campaign_id)
2294
+ return []
2295
+
2296
+ log.info("Campaign %s - extracted %d raw substrate scope entries", campaign_id, len(raw_entries))
2297
+
2298
+ # Step 4: Extract compound mappings (reuse existing logic)
2299
+ figure_images = []
2300
+ if pdf_paths:
2301
+ for pdf_path in pdf_paths:
2302
+ try:
2303
+ figure_images.extend(extract_figure_images(pdf_path))
2304
+ except Exception as e:
2305
+ log.warning("Failed to extract figure images from %s: %s", pdf_path, e)
2306
+
2307
+ # Collect all compound IDs from raw entries
2308
+ all_compound_ids = set()
2309
+ for entry in raw_entries:
2310
+ substrate_ids = entry.get("substrate_ids", [])
2311
+ product_ids = entry.get("product_ids", [])
2312
+ for sid in substrate_ids:
2313
+ all_compound_ids.add(str(sid))
2314
+ for pid in product_ids:
2315
+ all_compound_ids.add(str(pid))
2316
+
2317
+ log.info("Campaign %s - found %d unique compound IDs to map", campaign_id, len(all_compound_ids))
2318
+
2319
+ # Extract compound mappings (reuse existing function)
2320
+ compound_mappings = extract_compound_mappings(full_text, model,
2321
+ pdf_paths=pdf_paths,
2322
+ iupac_sections=iupac_sections,
2323
+ compound_ids=list(all_compound_ids),
2324
+ primary_locations=locations,
2325
+ debug_dir=debug_dir)
2326
+
2327
+ # Step 5: Parse all entries with compound mappings
2328
+ entries = _parse_scope_entries(raw_entries, compound_mappings, campaign_id)
2329
+ log.info("Campaign %s - successfully parsed %d substrate scope entries", campaign_id, len(entries))
2330
+
2331
+ return entries
2332
+
2121
2333
  # === 7. VALIDATION & MERGE ===
2122
2334
  """Validation, duplicate detection, and merging with lineage data."""
2123
2335
 
@@ -2313,6 +2525,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
2313
2525
  'parent_enzyme_id': entry.parent_id or '',
2314
2526
  'mutations': entry.mutations or '',
2315
2527
  'generation': entry.generation if entry.generation is not None else '',
2528
+ 'campaign_id': entry.campaign_id or '',
2316
2529
  'protein_sequence': entry.aa_seq or '',
2317
2530
  'nucleotide_sequence': entry.dna_seq or '',
2318
2531
  'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
@@ -2345,7 +2558,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
2345
2558
 
2346
2559
  # Define column order
2347
2560
  column_order = [
2348
- 'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
2561
+ 'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation', 'campaign_id',
2349
2562
  'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
2350
2563
  'substrate_list', 'substrate_iupac_list',
2351
2564
  'product_list', 'product_iupac_list',
@@ -2407,23 +2620,83 @@ def run_pipeline(
2407
2620
  # 2. Connect to Gemini -----------------------------------------------------
2408
2621
  model = get_model()
2409
2622
 
2410
- # 3. Extract substrate scope -----------------------------------------------
2411
- entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2623
+ # 3. Check for campaign-based extraction -----------------------------------
2624
+ all_entries = []
2625
+
2626
+ if lineage_csv:
2627
+ import pandas as pd
2628
+ lineage_df = pd.read_csv(lineage_csv)
2629
+
2630
+ # Check if we have campaign_id column - if so, process each campaign separately
2631
+ if 'campaign_id' in lineage_df.columns:
2632
+ campaigns = lineage_df['campaign_id'].unique()
2633
+ log.info("Detected %d campaigns in lineage data - processing each separately", len(campaigns))
2634
+ log.info("Campaigns: %s", campaigns.tolist())
2635
+
2636
+ # Simple campaign context for model reaction awareness
2637
+ campaigns_context_text = f"All campaigns: {campaigns.tolist()}"
2638
+ identify_scope_locations_for_campaign._all_campaigns_context = campaigns_context_text
2639
+ extract_substrate_scope_entries_for_campaign._all_campaigns_context = campaigns_context_text
2640
+
2641
+ for campaign_id in campaigns:
2642
+ log.info("\n" + "="*60)
2643
+ log.info("Processing campaign: %s", campaign_id)
2644
+ log.info("="*60)
2645
+
2646
+ # Get enzymes for this campaign
2647
+ campaign_enzymes = lineage_df[lineage_df['campaign_id'] == campaign_id]
2648
+ if 'enzyme_id' in campaign_enzymes.columns:
2649
+ enzyme_ids = campaign_enzymes['enzyme_id'].tolist()
2650
+ elif 'enzyme' in campaign_enzymes.columns:
2651
+ enzyme_ids = campaign_enzymes['enzyme'].tolist()
2652
+ elif 'variant_id' in campaign_enzymes.columns:
2653
+ enzyme_ids = campaign_enzymes['variant_id'].tolist()
2654
+ else:
2655
+ raise ValueError("No enzyme ID column found in lineage data")
2656
+
2657
+ log.info("Campaign %s has %d enzymes: %s", campaign_id, len(enzyme_ids), enzyme_ids)
2658
+
2659
+ # Create campaign-specific debug dir
2660
+ campaign_debug_dir = Path(debug_dir) / campaign_id if debug_dir else None
2661
+
2662
+ # Extract substrate scope for this campaign
2663
+ campaign_entries = get_substrate_scope_for_campaign(
2664
+ caption_text, full_text, model,
2665
+ campaign_id=campaign_id,
2666
+ enzyme_ids=enzyme_ids,
2667
+ pdf_paths=pdf_paths,
2668
+ debug_dir=campaign_debug_dir
2669
+ )
2670
+
2671
+ if campaign_entries:
2672
+ log.info("Extracted %d substrate scope entries for campaign %s", len(campaign_entries), campaign_id)
2673
+ all_entries.extend(campaign_entries)
2674
+ else:
2675
+ log.info("No substrate scope data found for campaign %s", campaign_id)
2676
+ else:
2677
+ # Original single extraction
2678
+ entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2679
+ all_entries = entries
2680
+ else:
2681
+ # No lineage data - single extraction
2682
+ entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2683
+ all_entries = entries
2412
2684
 
2413
- if not entries:
2414
- raise RuntimeError("Pipeline aborted: failed to extract any substrate scope data")
2685
+ if not all_entries:
2686
+ log.warning("No substrate scope data extracted from any campaign")
2687
+ all_entries = [] # Allow empty results
2415
2688
 
2416
2689
  # 4. Merge with lineage if available ---------------------------------------
2417
- if lineage_csv:
2418
- entries = merge_with_lineage(entries, Path(lineage_csv), model)
2690
+ if lineage_csv and all_entries:
2691
+ all_entries = merge_with_lineage(all_entries, Path(lineage_csv), model)
2419
2692
 
2420
2693
  # 5. Validate entries ------------------------------------------------------
2421
- warnings = validate_scope_entries(entries)
2694
+ warnings = validate_scope_entries(all_entries)
2422
2695
  if warnings:
2423
2696
  log.warning("Found %d validation warnings", len(warnings))
2424
2697
 
2425
2698
  # 6. Convert to DataFrame --------------------------------------------------
2426
- df_final = _entries_to_dataframe(entries)
2699
+ df_final = _entries_to_dataframe(all_entries)
2427
2700
 
2428
2701
  # 7. Write CSV if requested ------------------------------------------------
2429
2702
  if output_csv: