debase 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -83,6 +83,7 @@ class ScopeEntry:
83
83
  # Metadata
84
84
  data_location: Optional[str] = None
85
85
  data_source_type: Dict[str, str] = field(default_factory=dict)
86
+ campaign_id: Optional[str] = None
86
87
 
87
88
  # Lineage information (populated during merge)
88
89
  parent_id: Optional[str] = None
@@ -309,68 +310,27 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
309
310
 
310
311
  log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)
311
312
 
312
- # Extract multi-page region including the figure and content below
313
- # The figure should be between the top of the viewable area and extend to subsequent pages
313
+ # Render the whole page that contains the figure caption (no cropping to the figure region)
314
314
  page_rect = page.rect
315
315
 
316
- # Define the region to extract
317
- # Extract everything above the caption plus additional content from subsequent pages
318
- top_margin = 0 # Start from the very top of the page
319
- additional_pages = 2 # Number of additional pages to include
320
- left_margin = 0 # Use full page width
321
- right_margin = 0
322
-
323
- # Calculate the figure region for the first page
324
- fig_top = top_margin
325
- fig_bottom = max(caption_rect.y0 + 150, page_rect.height) # At least 150px below caption or full page
326
- fig_left = left_margin
327
- fig_right = page_rect.width - right_margin
328
-
329
- # Create list to store all page images
330
- page_images = []
316
+ # Extract the entire page containing the identified location
317
+ fig_top = 0 # Start from top of page
318
+ fig_bottom = page_rect.height # Full page height
319
+ fig_left = 0 # Full width
320
+ fig_right = page_rect.width
331
321
 
332
- # Extract first page (from top to bottom)
322
+ # Extract the entire page
333
323
  clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
334
324
  mat = fitz.Matrix(2, 2) # 2x zoom for better quality
335
325
  pix = page.get_pixmap(clip=clip_rect, matrix=mat)
336
- page_images.append(pix)
337
-
338
- # Extract additional pages if they exist
339
- for additional_page_offset in range(1, additional_pages + 1):
340
- next_page_num = page_num + additional_page_offset
341
- if next_page_num < doc.page_count:
342
- next_page = doc.load_page(next_page_num)
343
- next_page_rect = next_page.rect
344
-
345
- # Extract full page for additional pages
346
- next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
347
- next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
348
- page_images.append(next_pix)
349
- log.info("Added page %d to multi-page extraction", next_page_num + 1)
350
326
 
351
- # Combine all page images vertically
352
- if len(page_images) == 1:
353
- # Single page extraction
354
- combined_pix = page_images[0]
355
- else:
356
- # Multi-page extraction - combine vertically
357
- total_width = max(pix.width for pix in page_images)
358
- total_height = sum(pix.height for pix in page_images)
359
-
360
- # Create a new pixmap to hold the combined image
361
- combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
362
- combined_pix.clear_with(255) # White background
363
-
364
- current_y = 0
365
- for pix in page_images:
366
- # Copy each page image to the combined image
367
- combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
368
- current_y += pix.height
327
+ log.info("Extracted entire page: %.0fx%.0f pixels from page %d",
328
+ pix.width, pix.height, page_num + 1)
369
329
 
370
330
  # Convert to PNG
371
- img_bytes = combined_pix.tobytes("png")
372
- log.info("Extracted multi-page figure region: %dx%d pixels from %d pages starting at page %d",
373
- combined_pix.width, combined_pix.height, len(page_images), page_num + 1)
331
+ img_bytes = pix.tobytes("png")
332
+ log.info("Converted to PNG: %dx%d pixels from page %d",
333
+ pix.width, pix.height, page_num + 1)
374
334
 
375
335
  return b64encode(img_bytes).decode()
376
336
 
@@ -1014,25 +974,73 @@ Return as JSON:
1014
974
 
1015
975
  # ---- 6.2 Helper functions -------------------------------------------------
1016
976
 
1017
def identify_scope_locations_for_campaign(
    text: str,
    model,
    campaign_id: str,
    enzyme_ids: List[str],
    *,
    max_results: int = 5,
    debug_dir: str | Path | None = None,
) -> List[dict]:
    """Ask Gemini where substrate scope data is located for a specific campaign.

    Args:
        text: Manuscript text to analyse; only the first 15,000 characters
            are appended to the prompt.
        model: Model handle forwarded unchanged to ``generate_json_with_retry``.
        campaign_id: Campaign identifier embedded in the prompt and in the
            debug tag (``scope_locate_<campaign_id>``).
        enzyme_ids: Identifiers of the enzymes that belong to the campaign.
            Items are converted with ``str()`` before being joined, so
            non-string IDs (ints, NaN read from a CSV) do not raise.
        max_results: Maximum number of locations requested in the prompt.
        debug_dir: Forwarded unchanged to ``generate_json_with_retry``.

    Returns:
        The list parsed from the model's JSON reply (the prompt asks for one
        dict per location). An empty list is returned when the call raises
        or when the reply is not a list.
    """
    # Stringify once: str.join() raises TypeError on non-str items, and the
    # joined list is interpolated into the prompt in two places.
    enzyme_list = ", ".join(str(enzyme_id) for enzyme_id in enzyme_ids)

    # Simple model reaction context
    model_reactions_context = """
IMPORTANT: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
"""

    # Create campaign-specific prompt
    campaign_prompt = f"""
You are an expert reader of biocatalysis manuscripts.
Analyze this paper and identify all locations containing substrate scope data for the specific campaign: "{campaign_id}".

CAMPAIGN CONTEXT:
- Campaign ID: {campaign_id}
- Target enzymes: {enzyme_list}

{model_reactions_context}

Your task is to:
1. Identify locations (tables, figures, text) containing substrate scope reaction data specifically for this campaign
2. Focus only on substrate scope studies involving the enzymes: {enzyme_list}
3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
- Model reactions are those used to evolve/optimize the enzymes
- Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
5. Determine which enzyme variants from this campaign were tested in substrate scope studies

Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
[
{{
"location": "Description of where the data is found",
"type": "table|figure|text",
"confidence": 0.0-1.0,
"enzyme_variants": ["list of enzyme IDs found"],
"substrates_tested": ["list of substrates if identifiable"],
"campaign_match": true/false,
"is_substrate_scope": true/false,
"model_reaction_excluded": "reason why this is not a model reaction"
}}
]

Important: Only return locations that contain TRUE substrate scope data (not model reactions) for the specified campaign and enzymes. If no substrate scope data exists for this campaign, return an empty array.
"""

    prompt = campaign_prompt + "\n\nTEXT:\n" + text[:15_000]
    locs: List[dict] = []
    try:
        locs = generate_json_with_retry(
            model,
            prompt,
            debug_dir=debug_dir,
            tag=f"scope_locate_{campaign_id}",
        )
    except Exception as exc:  # pragma: no cover
        # Deliberately best-effort: a failed lookup is logged and reported
        # to the caller as "no locations" instead of aborting the pipeline.
        log.warning("identify_scope_locations_for_campaign(%s): %s", campaign_id, exc)
    return locs if isinstance(locs, list) else []
1037
1045
 
1038
1046
  def identify_iupac_sections(
@@ -1719,16 +1727,18 @@ def extract_compound_mappings(
1719
1727
  log.info("Total compound mappings extracted: %d", len(mappings))
1720
1728
  return mappings
1721
1729
 
1722
- def extract_all_substrate_scope_data(
1730
+ def extract_substrate_scope_entries_for_campaign(
1723
1731
  text: str,
1724
1732
  model,
1725
1733
  locations: List[dict],
1734
+ campaign_id: str,
1735
+ enzyme_ids: List[str],
1726
1736
  *,
1727
1737
  pdf_paths: List[Path] = None,
1728
- figure_images: Dict[str, str] = None,
1729
1738
  debug_dir: str | Path | None = None,
1730
1739
  ) -> List[dict]:
1731
- """Extract all substrate scope data at once from all primary sources."""
1740
+ """Extract substrate scope data specifically for a campaign."""
1741
+
1732
1742
  extraction_hints = ""
1733
1743
  all_refs = []
1734
1744
 
@@ -1740,124 +1750,189 @@ def extract_all_substrate_scope_data(
1740
1750
  location_strs.append(loc_str)
1741
1751
  all_refs.append(loc_str)
1742
1752
 
1743
- extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
1744
-
1745
- # Collect all enzyme variants
1746
- all_variants = []
1747
- for loc in locations:
1748
- variants = loc.get('enzyme_variants_tested', [])
1749
- all_variants.extend(variants)
1753
+ extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"
1750
1754
 
1751
- if all_variants:
1752
- unique_variants = list(set(all_variants))
1753
- extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
1755
+ # Focus on campaign-specific enzyme variants
1756
+ extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"
1754
1757
 
1755
- # Extract text from ALL identified locations
1758
+ # Extract text from ALL identified locations (like the original function did)
1756
1759
  extraction_texts = []
1760
+ figure_images = {}
1757
1761
 
1758
1762
  for ref in all_refs:
1759
1763
  if ref and pdf_paths:
1760
1764
  ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
1761
1765
  if ref_text:
1762
- # Add figure image notation if available
1763
- if figure_images and ref in figure_images:
1764
- ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
1765
1766
  extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
1767
+
1768
+ # Extract figure images for this reference (crop page around figure)
1769
+ try:
1770
+ fig_base64 = extract_figure_image(pdf_paths, ref)
1771
+ if fig_base64:
1772
+ figure_images[ref] = fig_base64
1773
+ log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
1774
+
1775
+ # Save the figure image to debug folder
1776
+ if debug_dir:
1777
+ debug_path = Path(debug_dir)
1778
+ debug_path.mkdir(parents=True, exist_ok=True)
1779
+ # Clean ref for filename
1780
+ safe_ref = re.sub(r'[^\w\s-]', '', ref).strip().replace(' ', '_')
1781
+ image_file = debug_path / f"figure_{safe_ref}_{campaign_id}.png"
1782
+
1783
+ # Decode and save the image
1784
+ import base64
1785
+ with open(image_file, 'wb') as f:
1786
+ f.write(base64.b64decode(fig_base64))
1787
+ log.info("Campaign %s - saved figure image to %s", campaign_id, image_file)
1788
+ except Exception as e:
1789
+ log.warning("Campaign %s - failed to extract figure for %s: %s", campaign_id, ref, e)
1766
1790
 
1767
1791
  if not extraction_texts:
1768
1792
  extraction_texts = [text[:50_000]]
1769
1793
 
1770
1794
  extraction_text = "\n\n".join(extraction_texts)
1771
1795
 
1772
- prompt = _SUBSTRATE_SCOPE_PROMPT.format(extraction_hints=extraction_hints)
1773
- prompt += "\n\nTEXT:\n" + extraction_text
1774
-
1775
- # Prepare multimodal content with images
1776
- content_parts = [prompt]
1796
+ # Simple model reaction context
1797
+ model_reactions_context = """
1798
+ CRITICAL: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
1799
+ Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
1800
+ """
1777
1801
 
1778
- # Add figure images to the prompt
1779
- if figure_images:
1780
- import PIL.Image
1781
- import io
1782
- import base64
1783
-
1784
- for fig_ref, fig_base64 in figure_images.items():
1785
- try:
1786
- # Convert base64 to PIL Image
1787
- img_bytes = base64.b64decode(fig_base64)
1788
- image = PIL.Image.open(io.BytesIO(img_bytes))
1789
- content_parts.append(f"\n[Figure: {fig_ref}]")
1790
- content_parts.append(image)
1791
- log.info("Added figure %s to multimodal prompt", fig_ref)
1792
- except Exception as e:
1793
- log.warning("Failed to add figure %s: %s", fig_ref, e)
1802
+ # Create campaign-specific prompt
1803
+ campaign_prompt = f"""
1804
+ You are an expert reader of biocatalysis manuscripts.
1805
+ Extract ALL substrate scope reaction data specifically for campaign: "{campaign_id}".
1806
+
1807
+ CAMPAIGN CONTEXT:
1808
+ - Campaign ID: {campaign_id}
1809
+ - Target enzymes: {', '.join(enzyme_ids)}
1810
+
1811
+ {model_reactions_context}
1812
+
1813
+ IMPORTANT INSTRUCTIONS:
1814
+ 1. Focus ONLY on substrate scope data for the specified campaign and enzymes
1815
+ 2. Extract reactions involving enzymes: {', '.join(enzyme_ids)}
1816
+ 3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
1817
+ - Model reactions are those used to evolve/optimize the enzymes (listed above)
1818
+ - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
1819
+ - DO NOT include model reactions in substrate scope data
1820
+ 4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
1821
+ 5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
1822
+
1823
+ {extraction_hints}
1824
+
1825
+ Return your analysis as JSON in this format:
1826
+ {{
1827
+ "substrate_scope_data": [
1828
+ {{
1829
+ "enzyme_id": "enzyme identifier",
1830
+ "substrate_ids": ["substrate identifiers"],
1831
+ "product_ids": ["product identifiers"],
1832
+ "substrate_names": ["substrate names"],
1833
+ "product_names": ["product names"],
1834
+ "yield_percent": number or null,
1835
+ "ee": number or null,
1836
+ "ttn": number or null,
1837
+ "temperature": "temperature" or null,
1838
+ "ph": "pH" or null,
1839
+ "buffer": "buffer" or null,
1840
+ "substrate_concentration": "concentration" or null,
1841
+ "data_location": "where this data was found",
1842
+ "campaign_id": "{campaign_id}",
1843
+ "is_substrate_scope": true,
1844
+ "model_reaction_excluded": "reason why this is not a model reaction"
1845
+ }}
1846
+ ]
1847
+ }}
1848
+
1849
+ Important: Only return TRUE substrate scope data (not model reactions) for the specified campaign. If no substrate scope data exists for this campaign, return {{"substrate_scope_data": []}}.
1850
+ """
1794
1851
 
1795
1852
  try:
1796
- # Use multimodal content if we have images
1797
- if len(content_parts) > 1:
1798
- # Log multimodal API call
1799
- log.info("=== GEMINI MULTIMODAL API CALL: SUBSTRATE_SCOPE_WITH_FIGURES ===")
1800
- log.info("Text prompt length: %d characters", len(prompt))
1801
- log.info("Number of images: %d", len(content_parts) - 1)
1802
- log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
1853
+ # Use multimodal extraction if we have figure images
1854
+ if figure_images:
1855
+ log.info("Campaign %s - using multimodal extraction with %d figure images", campaign_id, len(figure_images))
1856
+
1857
+ # Prepare multimodal content
1858
+ import PIL.Image
1859
+ import io
1860
+ import base64
1861
+
1862
+ content_parts = [campaign_prompt + "\n\nTEXT:\n" + extraction_text]
1863
+
1864
+ for fig_ref, fig_base64 in figure_images.items():
1865
+ try:
1866
+ # Convert base64 to PIL Image
1867
+ img_bytes = base64.b64decode(fig_base64)
1868
+ image = PIL.Image.open(io.BytesIO(img_bytes))
1869
+ content_parts.append(f"\n[Figure: {fig_ref}]")
1870
+ content_parts.append(image)
1871
+ log.info("Campaign %s - added figure %s to multimodal prompt", campaign_id, fig_ref)
1872
+ except Exception as e:
1873
+ log.warning("Campaign %s - failed to add figure %s: %s", campaign_id, fig_ref, e)
1803
1874
 
1804
- # Save prompt and image info to debug directory
1875
+ # Save debug info
1805
1876
  if debug_dir:
1806
1877
  debug_path = Path(debug_dir)
1807
1878
  debug_path.mkdir(parents=True, exist_ok=True)
1808
- prompt_file = debug_path / f"substrate_scope_multimodal_prompt_{int(time.time())}.txt"
1879
+ prompt_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_prompt.txt"
1809
1880
 
1810
- # Build prompt info including image references
1811
- prompt_info = f"=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
1881
+ prompt_info = f"=== CAMPAIGN {campaign_id} MULTIMODAL PROMPT ===\n"
1812
1882
  prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
1813
- prompt_info += f"Text length: {len(prompt)} characters\n"
1814
- prompt_info += f"Images included: {len(content_parts) - 1}\n"
1883
+ prompt_info += f"Text length: {len(extraction_text)} characters\n"
1884
+ prompt_info += f"Images included: {len(figure_images)}\n"
1815
1885
  for fig_ref in figure_images.keys():
1816
1886
  prompt_info += f" - {fig_ref}\n"
1817
1887
  prompt_info += "="*80 + "\n\n"
1818
- prompt_info += prompt
1888
+ prompt_info += campaign_prompt + "\n\nTEXT:\n" + extraction_text
1819
1889
 
1820
- _dump(prompt_info, prompt_file)
1821
- log.info("Full prompt saved to: %s", prompt_file)
1890
+ with open(prompt_file, 'w') as f:
1891
+ f.write(prompt_info)
1892
+ log.info("Campaign %s - prompt saved to: %s", campaign_id, prompt_file)
1822
1893
 
1823
- log.info("Calling Gemini Multimodal API...")
1894
+ # Call multimodal API
1824
1895
  response = model.generate_content(content_parts)
1825
- raw_text = _extract_text(response).strip()
1826
-
1827
- # Log and save response
1828
- log.info("Gemini multimodal response length: %d characters", len(raw_text))
1829
- log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
1896
+ raw_text = response.text.strip()
1830
1897
 
1898
+ # Save response
1831
1899
  if debug_dir:
1832
- debug_path = Path(debug_dir)
1833
- response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
1900
+ response_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_response.txt"
1834
1901
  with open(response_file, 'w') as f:
1835
- f.write(f"=== RESPONSE FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n")
1902
+ f.write(f"=== CAMPAIGN {campaign_id} MULTIMODAL RESPONSE ===\n")
1836
1903
  f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
1837
1904
  f.write(f"Length: {len(raw_text)} characters\n")
1838
1905
  f.write("="*80 + "\n\n")
1839
1906
  f.write(raw_text)
1840
- log.info("Full response saved to: %s", response_file)
1907
+ log.info("Campaign %s - response saved to: %s", campaign_id, response_file)
1841
1908
 
1842
- # Parse JSON from response
1909
+ # Parse JSON
1843
1910
  import json
1844
1911
  data = json.loads(raw_text.strip('```json').strip('```').strip())
1845
1912
  else:
1913
+ log.info("Campaign %s - using text-only extraction", campaign_id)
1846
1914
  data = generate_json_with_retry(
1847
1915
  model,
1848
- prompt,
1916
+ campaign_prompt + "\n\nTEXT:\n" + extraction_text,
1849
1917
  debug_dir=debug_dir,
1850
- tag="substrate_scope",
1918
+ tag=f"substrate_scope_{campaign_id}",
1851
1919
  )
1852
1920
 
1853
1921
  scope_data = data.get("substrate_scope_data", [])
1854
- log.info("Extracted %d substrate scope entries", len(scope_data))
1922
+
1923
+ # Add campaign_id to each entry if not present
1924
+ for entry in scope_data:
1925
+ if "campaign_id" not in entry:
1926
+ entry["campaign_id"] = campaign_id
1927
+
1928
+ log.info("Campaign %s - extracted %d substrate scope entries", campaign_id, len(scope_data))
1855
1929
  return scope_data
1856
1930
 
1857
1931
  except Exception as exc:
1858
- log.error("Failed to extract substrate scope data: %s", exc)
1932
+ log.error("Failed to extract substrate scope data for campaign %s: %s", campaign_id, exc)
1859
1933
  return []
1860
1934
 
1935
+
1861
1936
  def _extract_single_reaction(
1862
1937
  text: str,
1863
1938
  model,
@@ -1911,7 +1986,7 @@ def _extract_single_reaction(
1911
1986
  log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
1912
1987
  return None
1913
1988
 
1914
- def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
1989
+ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping], campaign_id: Optional[str] = None) -> List[ScopeEntry]:
1915
1990
  """Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
1916
1991
  entries: List[ScopeEntry] = []
1917
1992
 
@@ -2020,6 +2095,7 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
2020
2095
  conditions=conditions,
2021
2096
  data_location=item.get("data_location", ""),
2022
2097
  data_source_type={"all": "text/figure"},
2098
+ campaign_id=campaign_id or item.get("campaign_id", ""),
2023
2099
  notes=item.get("notes", "")
2024
2100
  )
2025
2101
 
@@ -2050,7 +2126,10 @@ def get_substrate_scope(
2050
2126
  5. Extract individual reactions with context
2051
2127
  """
2052
2128
  # Step 1: Find locations using captions
2053
- locations = identify_scope_locations(caption_text, model, debug_dir=debug_dir)
2129
+ # For backward compatibility, use campaign-specific function with generic parameters
2130
+ locations = identify_scope_locations_for_campaign(
2131
+ caption_text, model, "general", ["all"], debug_dir=debug_dir
2132
+ )
2054
2133
  if locations:
2055
2134
  location_summary = []
2056
2135
  for loc in locations[:3]:
@@ -2111,10 +2190,13 @@ def get_substrate_scope(
2111
2190
  log.warning("Failed to extract %s image for %s", location_type, figure_ref)
2112
2191
 
2113
2192
  # Extract all substrate scope data in one call
2114
- raw_entries = extract_all_substrate_scope_data(
2193
+ # Note: This function is now deprecated in favor of campaign-specific extraction
2194
+ # For backward compatibility, we'll use a generic campaign approach
2195
+ raw_entries = extract_substrate_scope_entries_for_campaign(
2115
2196
  full_text, model, locations,
2197
+ campaign_id="general",
2198
+ enzyme_ids=["all"],
2116
2199
  pdf_paths=pdf_paths,
2117
- figure_images=figure_images,
2118
2200
  debug_dir=debug_dir
2119
2201
  )
2120
2202
 
@@ -2158,6 +2240,96 @@ def get_substrate_scope(
2158
2240
 
2159
2241
  return entries
2160
2242
 
2243
+
2244
def get_substrate_scope_for_campaign(
    caption_text: str,
    full_text: str,
    model,
    *,
    campaign_id: str,
    enzyme_ids: List[str],
    pdf_paths: Optional[List[Path]] = None,
    debug_dir: str | Path | None = None,
) -> List[ScopeEntry]:
    """
    Campaign-specific substrate scope extraction.

    Like get_substrate_scope but focuses on a specific campaign and its enzymes.
    Tells Gemini about the specific campaign and that it's okay to return null if
    no substrate scope data exists for this campaign.

    Steps:
        1. Locate substrate scope data for the campaign from the captions.
        2. Identify IUPAC sections.
        3. Extract raw entries for the campaign.
        4. Collect the compound IDs in those entries and extract their mappings.
        5. Parse the raw entries into ``ScopeEntry`` objects.

    Args:
        caption_text: Caption text passed to the location and IUPAC-section steps.
        full_text: Full manuscript text passed to the extraction steps.
        model: Model handle forwarded to every helper.
        campaign_id: Campaign to extract data for.
        enzyme_ids: Enzyme identifiers belonging to the campaign.
        pdf_paths: Optional PDF paths forwarded to the helpers.
        debug_dir: Optional debug directory forwarded to the helpers.

    Returns:
        The parsed entries, or an empty list when no locations are identified
        or no raw entries are extracted.
    """
    log.info("Starting campaign-specific substrate scope extraction for: %s", campaign_id)
    log.info("Target enzymes: %s", enzyme_ids)

    # Step 1: Find locations using captions with campaign context
    locations = identify_scope_locations_for_campaign(
        caption_text, model, campaign_id, enzyme_ids, debug_dir=debug_dir
    )

    if not locations:
        log.info("No substrate scope locations identified for campaign %s", campaign_id)
        return []

    # Only the first three locations are summarised in the log line.
    location_summary = [
        f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
        f"confidence: {loc.get('confidence', 0)})"
        for loc in locations[:3]
    ]
    log.info("Campaign %s - identified %d substrate scope locations: %s",
             campaign_id, len(locations), ", ".join(location_summary))

    # Step 2: Identify IUPAC sections from SI TOC (reuse existing logic)
    iupac_sections = identify_iupac_sections(
        caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir
    )
    log.info("Campaign %s - identified %d IUPAC sections", campaign_id, len(iupac_sections))

    # Step 3: Extract raw entries with campaign context
    raw_entries = extract_substrate_scope_entries_for_campaign(
        full_text, model, locations, campaign_id, enzyme_ids,
        pdf_paths=pdf_paths, debug_dir=debug_dir
    )

    if not raw_entries:
        log.info("No substrate scope entries extracted for campaign %s", campaign_id)
        return []

    log.info("Campaign %s - extracted %d raw substrate scope entries", campaign_id, len(raw_entries))

    # Step 4: Collect all compound IDs from raw entries.
    # NOTE: the earlier per-PDF figure-image extraction was dropped here; its
    # result was never read by any later step of this function.
    all_compound_ids = set()
    for entry in raw_entries:
        for key in ("substrate_ids", "product_ids"):
            # The model may emit null or a bare scalar instead of a list.
            ids = entry.get(key) or []
            if isinstance(ids, (str, int, float)):
                ids = [ids]
            all_compound_ids.update(str(compound_id) for compound_id in ids)

    log.info("Campaign %s - found %d unique compound IDs to map", campaign_id, len(all_compound_ids))

    # Extract compound mappings (reuse existing function). The IDs are sorted
    # so the helper receives them in a reproducible order across runs.
    compound_mappings = extract_compound_mappings(
        full_text,
        model,
        pdf_paths=pdf_paths,
        iupac_sections=iupac_sections,
        compound_ids=sorted(all_compound_ids),
        primary_locations=locations,
        debug_dir=debug_dir,
    )

    # Step 5: Parse all entries with compound mappings
    entries = _parse_scope_entries(raw_entries, compound_mappings, campaign_id)
    log.info("Campaign %s - successfully parsed %d substrate scope entries", campaign_id, len(entries))

    return entries
2332
+
2161
2333
  # === 7. VALIDATION & MERGE ===
2162
2334
  """Validation, duplicate detection, and merging with lineage data."""
2163
2335
 
@@ -2353,6 +2525,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
2353
2525
  'parent_enzyme_id': entry.parent_id or '',
2354
2526
  'mutations': entry.mutations or '',
2355
2527
  'generation': entry.generation if entry.generation is not None else '',
2528
+ 'campaign_id': entry.campaign_id or '',
2356
2529
  'protein_sequence': entry.aa_seq or '',
2357
2530
  'nucleotide_sequence': entry.dna_seq or '',
2358
2531
  'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
@@ -2385,7 +2558,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
2385
2558
 
2386
2559
  # Define column order
2387
2560
  column_order = [
2388
- 'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
2561
+ 'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation', 'campaign_id',
2389
2562
  'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
2390
2563
  'substrate_list', 'substrate_iupac_list',
2391
2564
  'product_list', 'product_iupac_list',
@@ -2447,23 +2620,83 @@ def run_pipeline(
2447
2620
  # 2. Connect to Gemini -----------------------------------------------------
2448
2621
  model = get_model()
2449
2622
 
2450
- # 3. Extract substrate scope -----------------------------------------------
2451
- entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2623
+ # 3. Check for campaign-based extraction -----------------------------------
2624
+ all_entries = []
2625
+
2626
+ if lineage_csv:
2627
+ import pandas as pd
2628
+ lineage_df = pd.read_csv(lineage_csv)
2629
+
2630
+ # Check if we have campaign_id column - if so, process each campaign separately
2631
+ if 'campaign_id' in lineage_df.columns:
2632
+ campaigns = lineage_df['campaign_id'].unique()
2633
+ log.info("Detected %d campaigns in lineage data - processing each separately", len(campaigns))
2634
+ log.info("Campaigns: %s", campaigns.tolist())
2635
+
2636
+ # Simple campaign context for model reaction awareness
2637
+ campaigns_context_text = f"All campaigns: {campaigns.tolist()}"
2638
+ identify_scope_locations_for_campaign._all_campaigns_context = campaigns_context_text
2639
+ extract_substrate_scope_entries_for_campaign._all_campaigns_context = campaigns_context_text
2640
+
2641
+ for campaign_id in campaigns:
2642
+ log.info("\n" + "="*60)
2643
+ log.info("Processing campaign: %s", campaign_id)
2644
+ log.info("="*60)
2645
+
2646
+ # Get enzymes for this campaign
2647
+ campaign_enzymes = lineage_df[lineage_df['campaign_id'] == campaign_id]
2648
+ if 'enzyme_id' in campaign_enzymes.columns:
2649
+ enzyme_ids = campaign_enzymes['enzyme_id'].tolist()
2650
+ elif 'enzyme' in campaign_enzymes.columns:
2651
+ enzyme_ids = campaign_enzymes['enzyme'].tolist()
2652
+ elif 'variant_id' in campaign_enzymes.columns:
2653
+ enzyme_ids = campaign_enzymes['variant_id'].tolist()
2654
+ else:
2655
+ raise ValueError("No enzyme ID column found in lineage data")
2656
+
2657
+ log.info("Campaign %s has %d enzymes: %s", campaign_id, len(enzyme_ids), enzyme_ids)
2658
+
2659
+ # Create campaign-specific debug dir
2660
+ campaign_debug_dir = Path(debug_dir) / campaign_id if debug_dir else None
2661
+
2662
+ # Extract substrate scope for this campaign
2663
+ campaign_entries = get_substrate_scope_for_campaign(
2664
+ caption_text, full_text, model,
2665
+ campaign_id=campaign_id,
2666
+ enzyme_ids=enzyme_ids,
2667
+ pdf_paths=pdf_paths,
2668
+ debug_dir=campaign_debug_dir
2669
+ )
2670
+
2671
+ if campaign_entries:
2672
+ log.info("Extracted %d substrate scope entries for campaign %s", len(campaign_entries), campaign_id)
2673
+ all_entries.extend(campaign_entries)
2674
+ else:
2675
+ log.info("No substrate scope data found for campaign %s", campaign_id)
2676
+ else:
2677
+ # Original single extraction
2678
+ entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2679
+ all_entries = entries
2680
+ else:
2681
+ # No lineage data - single extraction
2682
+ entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
2683
+ all_entries = entries
2452
2684
 
2453
- if not entries:
2454
- raise RuntimeError("Pipeline aborted: failed to extract any substrate scope data")
2685
+ if not all_entries:
2686
+ log.warning("No substrate scope data extracted from any campaign")
2687
+ all_entries = [] # Allow empty results
2455
2688
 
2456
2689
  # 4. Merge with lineage if available ---------------------------------------
2457
- if lineage_csv:
2458
- entries = merge_with_lineage(entries, Path(lineage_csv), model)
2690
+ if lineage_csv and all_entries:
2691
+ all_entries = merge_with_lineage(all_entries, Path(lineage_csv), model)
2459
2692
 
2460
2693
  # 5. Validate entries ------------------------------------------------------
2461
- warnings = validate_scope_entries(entries)
2694
+ warnings = validate_scope_entries(all_entries)
2462
2695
  if warnings:
2463
2696
  log.warning("Found %d validation warnings", len(warnings))
2464
2697
 
2465
2698
  # 6. Convert to DataFrame --------------------------------------------------
2466
- df_final = _entries_to_dataframe(entries)
2699
+ df_final = _entries_to_dataframe(all_entries)
2467
2700
 
2468
2701
  # 7. Write CSV if requested ------------------------------------------------
2469
2702
  if output_csv: