debase 0.4.0-py3-none-any.whl → 0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +14 -8
- debase/lineage_format.py +335 -56
- debase/reaction_info_extractor.py +60 -32
- debase/substrate_scope_extractor.py +373 -140
- debase/wrapper.py +37 -11
- {debase-0.4.0.dist-info → debase-0.4.2.dist-info}/METADATA +1 -1
- debase-0.4.2.dist-info/RECORD +16 -0
- debase-0.4.0.dist-info/RECORD +0 -16
- {debase-0.4.0.dist-info → debase-0.4.2.dist-info}/WHEEL +0 -0
- {debase-0.4.0.dist-info → debase-0.4.2.dist-info}/entry_points.txt +0 -0
- {debase-0.4.0.dist-info → debase-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.0.dist-info → debase-0.4.2.dist-info}/top_level.txt +0 -0
debase/substrate_scope_extractor.py

@@ -83,6 +83,7 @@ class ScopeEntry:
     # Metadata
     data_location: Optional[str] = None
     data_source_type: Dict[str, str] = field(default_factory=dict)
+    campaign_id: Optional[str] = None

     # Lineage information (populated during merge)
     parent_id: Optional[str] = None
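In context, the new field slots in beside the other metadata defaults on the ScopeEntry dataclass. A minimal sketch of that fragment (only the fields visible in this hunk; the rest of the class is elided):

import_sketch.py:

    from dataclasses import dataclass, field
    from typing import Dict, Optional

    @dataclass
    class ScopeEntry:
        # ... identity, substrate, and performance fields elided ...

        # Metadata
        data_location: Optional[str] = None
        data_source_type: Dict[str, str] = field(default_factory=dict)
        campaign_id: Optional[str] = None  # new in 0.4.2: ties an entry to its evolution campaign

        # Lineage information (populated during merge)
        parent_id: Optional[str] = None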
@@ -309,68 +310,27 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str

         log.info("Found figure caption on page %d at y=%.0f", page_num + 1, caption_rect.y0)

-        # Extract
-        # The figure should be between the top of the viewable area and extend to subsequent pages
+        # Extract just the figure with its caption, avoiding excessive white space
         page_rect = page.rect

-        #
-
-
-
-
-        right_margin = 0
-
-        # Calculate the figure region for the first page
-        fig_top = top_margin
-        fig_bottom = max(caption_rect.y0 + 150, page_rect.height)  # At least 150px below caption or full page
-        fig_left = left_margin
-        fig_right = page_rect.width - right_margin
-
-        # Create list to store all page images
-        page_images = []
+        # Extract the entire page containing the identified location
+        fig_top = 0  # Start from top of page
+        fig_bottom = page_rect.height  # Full page height
+        fig_left = 0  # Full width
+        fig_right = page_rect.width

-        # Extract
+        # Extract the entire page
         clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
         mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
         pix = page.get_pixmap(clip=clip_rect, matrix=mat)
-        page_images.append(pix)
-
-        # Extract additional pages if they exist
-        for additional_page_offset in range(1, additional_pages + 1):
-            next_page_num = page_num + additional_page_offset
-            if next_page_num < doc.page_count:
-                next_page = doc.load_page(next_page_num)
-                next_page_rect = next_page.rect
-
-                # Extract full page for additional pages
-                next_clip_rect = fitz.Rect(0, 0, next_page_rect.width, next_page_rect.height)
-                next_pix = next_page.get_pixmap(clip=next_clip_rect, matrix=mat)
-                page_images.append(next_pix)
-                log.info("Added page %d to multi-page extraction", next_page_num + 1)

-
-
-            # Single page extraction
-            combined_pix = page_images[0]
-        else:
-            # Multi-page extraction - combine vertically
-            total_width = max(pix.width for pix in page_images)
-            total_height = sum(pix.height for pix in page_images)
-
-            # Create a new pixmap to hold the combined image
-            combined_pix = fitz.Pixmap(fitz.csRGB, fitz.IRect(0, 0, total_width, total_height))
-            combined_pix.clear_with(255)  # White background
-
-            current_y = 0
-            for pix in page_images:
-                # Copy each page image to the combined image
-                combined_pix.copy(pix, fitz.IRect(0, current_y, pix.width, current_y + pix.height))
-                current_y += pix.height
+        log.info("Extracted entire page: %.0fx%.0f pixels from page %d",
+                 pix.width, pix.height, page_num + 1)

         # Convert to PNG
-        img_bytes =
-        log.info("
-
+        img_bytes = pix.tobytes("png")
+        log.info("Converted to PNG: %dx%d pixels from page %d",
+                 pix.width, pix.height, page_num + 1)

         return b64encode(img_bytes).decode()

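The deleted branch cropped margins and stitched multi-page regions into one pixmap; the replacement simply renders the full page that holds the caption. A self-contained sketch of the new behavior with PyMuPDF (function name and arguments are illustrative, not the package's API):

    import base64

    import fitz  # PyMuPDF

    def render_page_as_png_b64(pdf_path: str, page_num: int) -> str:
        """Render one whole page at 2x zoom and return it as base64-encoded PNG."""
        doc = fitz.open(pdf_path)
        page = doc.load_page(page_num)
        clip = fitz.Rect(0, 0, page.rect.width, page.rect.height)  # entire page
        pix = page.get_pixmap(clip=clip, matrix=fitz.Matrix(2, 2))  # 2x zoom for quality
        return base64.b64encode(pix.tobytes("png")).decode()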
@@ -1014,25 +974,73 @@ Return as JSON:

 # ---- 6.2 Helper functions -------------------------------------------------

-
+
+
+def identify_scope_locations_for_campaign(
     text: str,
     model,
+    campaign_id: str,
+    enzyme_ids: List[str],
     *,
     max_results: int = 5,
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
-    """Ask Gemini where substrate scope data is located."""
-
+    """Ask Gemini where substrate scope data is located for a specific campaign."""
+
+    # Simple model reaction context
+    model_reactions_context = """
+IMPORTANT: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
+Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
+"""
+
+    # Create campaign-specific prompt
+    campaign_prompt = f"""
+You are an expert reader of biocatalysis manuscripts.
+Analyze this paper and identify all locations containing substrate scope data for the specific campaign: "{campaign_id}".
+
+CAMPAIGN CONTEXT:
+- Campaign ID: {campaign_id}
+- Target enzymes: {', '.join(enzyme_ids)}
+
+{model_reactions_context}
+
+Your task is to:
+1. Identify locations (tables, figures, text) containing substrate scope reaction data specifically for this campaign
+2. Focus only on substrate scope studies involving the enzymes: {', '.join(enzyme_ids)}
+3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
+   - Model reactions are those used to evolve/optimize the enzymes
+   - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
+4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
+5. Determine which enzyme variants from this campaign were tested in substrate scope studies
+
+Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
+[
+  {{
+    "location": "Description of where the data is found",
+    "type": "table|figure|text",
+    "confidence": 0.0-1.0,
+    "enzyme_variants": ["list of enzyme IDs found"],
+    "substrates_tested": ["list of substrates if identifiable"],
+    "campaign_match": true/false,
+    "is_substrate_scope": true/false,
+    "model_reaction_excluded": "reason why this is not a model reaction"
+  }}
+]
+
+Important: Only return locations that contain TRUE substrate scope data (not model reactions) for the specified campaign and enzymes. If no substrate scope data exists for this campaign, return an empty array.
+"""
+
+    prompt = campaign_prompt + "\n\nTEXT:\n" + text[:15_000]
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
             model,
             prompt,
             debug_dir=debug_dir,
-            tag="
+            tag=f"scope_locate_{campaign_id}",
         )
     except Exception as exc:  # pragma: no cover
-        log.warning("
+        log.warning("identify_scope_locations_for_campaign(%s): %s", campaign_id, exc)
     return locs if isinstance(locs, list) else []

 def identify_iupac_sections(
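Both the old and new code lean on generate_json_with_retry, which is defined elsewhere in the module and not shown in this diff. A rough sketch of what such a helper plausibly does — the name comes from the diff, but the body, signature defaults, and retry policy here are assumptions, not the package's actual implementation:

    import json
    import logging
    import time

    log = logging.getLogger(__name__)

    def generate_json_with_retry(model, prompt, *, debug_dir=None, tag="", retries=2):
        """Call the model, strip Markdown code fences, parse JSON; back off and retry on failure."""
        # debug_dir would presumably mirror prompts/responses to disk under the given tag.
        for attempt in range(retries + 1):
            try:
                raw = model.generate_content(prompt).text.strip()
                raw = raw.removeprefix("```json").removesuffix("```").strip()
                return json.loads(raw)
            except Exception as exc:
                log.warning("%s: attempt %d failed: %s", tag, attempt + 1, exc)
                time.sleep(2 ** attempt)  # simple exponential backoff
        return []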
@@ -1719,16 +1727,18 @@ def extract_compound_mappings(
     log.info("Total compound mappings extracted: %d", len(mappings))
     return mappings

-def extract_all_substrate_scope_data(
+def extract_substrate_scope_entries_for_campaign(
     text: str,
     model,
     locations: List[dict],
+    campaign_id: str,
+    enzyme_ids: List[str],
     *,
     pdf_paths: List[Path] = None,
-    figure_images: Dict[str, str] = None,
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
-    """Extract
+    """Extract substrate scope data specifically for a campaign."""
+
     extraction_hints = ""
     all_refs = []

@@ -1740,124 +1750,189 @@ def extract_all_substrate_scope_data(
             location_strs.append(loc_str)
             all_refs.append(loc_str)

-        extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
-
-    # Collect all enzyme variants
-    all_variants = []
-    for loc in locations:
-        variants = loc.get('enzyme_variants_tested', [])
-        all_variants.extend(variants)
+        extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"

-
-
-    extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
+    # Focus on campaign-specific enzyme variants
+    extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"

-    # Extract text from ALL identified locations
+    # Extract text from ALL identified locations (like the original function did)
     extraction_texts = []
+    figure_images = {}

     for ref in all_refs:
         if ref and pdf_paths:
             ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
             if ref_text:
-                # Add figure image notation if available
-                if figure_images and ref in figure_images:
-                    ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
                 extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
+
+            # Extract figure images for this reference (crop page around figure)
+            try:
+                fig_base64 = extract_figure_image(pdf_paths, ref)
+                if fig_base64:
+                    figure_images[ref] = fig_base64
+                    log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
+
+                    # Save the figure image to debug folder
+                    if debug_dir:
+                        debug_path = Path(debug_dir)
+                        debug_path.mkdir(parents=True, exist_ok=True)
+                        # Clean ref for filename
+                        safe_ref = re.sub(r'[^\w\s-]', '', ref).strip().replace(' ', '_')
+                        image_file = debug_path / f"figure_{safe_ref}_{campaign_id}.png"
+
+                        # Decode and save the image
+                        import base64
+                        with open(image_file, 'wb') as f:
+                            f.write(base64.b64decode(fig_base64))
+                        log.info("Campaign %s - saved figure image to %s", campaign_id, image_file)
+            except Exception as e:
+                log.warning("Campaign %s - failed to extract figure for %s: %s", campaign_id, ref, e)

     if not extraction_texts:
         extraction_texts = [text[:50_000]]

     extraction_text = "\n\n".join(extraction_texts)

-
-
-
-
-
+    # Simple model reaction context
+    model_reactions_context = """
+CRITICAL: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
+Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
+"""

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Create campaign-specific prompt
+    campaign_prompt = f"""
+You are an expert reader of biocatalysis manuscripts.
+Extract ALL substrate scope reaction data specifically for campaign: "{campaign_id}".
+
+CAMPAIGN CONTEXT:
+- Campaign ID: {campaign_id}
+- Target enzymes: {', '.join(enzyme_ids)}
+
+{model_reactions_context}
+
+IMPORTANT INSTRUCTIONS:
+1. Focus ONLY on substrate scope data for the specified campaign and enzymes
+2. Extract reactions involving enzymes: {', '.join(enzyme_ids)}
+3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
+   - Model reactions are those used to evolve/optimize the enzymes (listed above)
+   - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
+   - DO NOT include model reactions in substrate scope data
+4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
+5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
+
+{extraction_hints}
+
+Return your analysis as JSON in this format:
+{{
+  "substrate_scope_data": [
+    {{
+      "enzyme_id": "enzyme identifier",
+      "substrate_ids": ["substrate identifiers"],
+      "product_ids": ["product identifiers"],
+      "substrate_names": ["substrate names"],
+      "product_names": ["product names"],
+      "yield_percent": number or null,
+      "ee": number or null,
+      "ttn": number or null,
+      "temperature": "temperature" or null,
+      "ph": "pH" or null,
+      "buffer": "buffer" or null,
+      "substrate_concentration": "concentration" or null,
+      "data_location": "where this data was found",
+      "campaign_id": "{campaign_id}",
+      "is_substrate_scope": true,
+      "model_reaction_excluded": "reason why this is not a model reaction"
+    }}
+  ]
+}}
+
+Important: Only return TRUE substrate scope data (not model reactions) for the specified campaign. If no substrate scope data exists for this campaign, return {{"substrate_scope_data": []}}.
+"""

     try:
-        # Use multimodal
-        if
-
-
-
-
-
+        # Use multimodal extraction if we have figure images
+        if figure_images:
+            log.info("Campaign %s - using multimodal extraction with %d figure images", campaign_id, len(figure_images))
+
+            # Prepare multimodal content
+            import PIL.Image
+            import io
+            import base64
+
+            content_parts = [campaign_prompt + "\n\nTEXT:\n" + extraction_text]
+
+            for fig_ref, fig_base64 in figure_images.items():
+                try:
+                    # Convert base64 to PIL Image
+                    img_bytes = base64.b64decode(fig_base64)
+                    image = PIL.Image.open(io.BytesIO(img_bytes))
+                    content_parts.append(f"\n[Figure: {fig_ref}]")
+                    content_parts.append(image)
+                    log.info("Campaign %s - added figure %s to multimodal prompt", campaign_id, fig_ref)
+                except Exception as e:
+                    log.warning("Campaign %s - failed to add figure %s: %s", campaign_id, fig_ref, e)

-        # Save
+            # Save debug info
             if debug_dir:
                 debug_path = Path(debug_dir)
                 debug_path.mkdir(parents=True, exist_ok=True)
-            prompt_file = debug_path / f"
+                prompt_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_prompt.txt"

-
-            prompt_info = f"=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
+                prompt_info = f"=== CAMPAIGN {campaign_id} MULTIMODAL PROMPT ===\n"
                 prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
-            prompt_info += f"Text length: {len(
-            prompt_info += f"Images included: {len(
+                prompt_info += f"Text length: {len(extraction_text)} characters\n"
+                prompt_info += f"Images included: {len(figure_images)}\n"
                 for fig_ref in figure_images.keys():
                     prompt_info += f"  - {fig_ref}\n"
                 prompt_info += "="*80 + "\n\n"
-            prompt_info +=
+                prompt_info += campaign_prompt + "\n\nTEXT:\n" + extraction_text

-
-
+                with open(prompt_file, 'w') as f:
+                    f.write(prompt_info)
+                log.info("Campaign %s - prompt saved to: %s", campaign_id, prompt_file)

-
+            # Call multimodal API
             response = model.generate_content(content_parts)
-        raw_text =
-
-        # Log and save response
-        log.info("Gemini multimodal response length: %d characters", len(raw_text))
-        log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
+            raw_text = response.text.strip()

+            # Save response
             if debug_dir:
-
-            response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
+                response_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_response.txt"
                 with open(response_file, 'w') as f:
-                f.write(f"===
+                    f.write(f"=== CAMPAIGN {campaign_id} MULTIMODAL RESPONSE ===\n")
                     f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                     f.write(f"Length: {len(raw_text)} characters\n")
                     f.write("="*80 + "\n\n")
                     f.write(raw_text)
-        log.info("
+                log.info("Campaign %s - response saved to: %s", campaign_id, response_file)

-        # Parse JSON
+            # Parse JSON
             import json
             data = json.loads(raw_text.strip('```json').strip('```').strip())
         else:
+            log.info("Campaign %s - using text-only extraction", campaign_id)
             data = generate_json_with_retry(
                 model,
-
+                campaign_prompt + "\n\nTEXT:\n" + extraction_text,
                 debug_dir=debug_dir,
-                tag="
+                tag=f"substrate_scope_{campaign_id}",
             )

         scope_data = data.get("substrate_scope_data", [])
-
+
+        # Add campaign_id to each entry if not present
+        for entry in scope_data:
+            if "campaign_id" not in entry:
+                entry["campaign_id"] = campaign_id
+
+        log.info("Campaign %s - extracted %d substrate scope entries", campaign_id, len(scope_data))
         return scope_data

     except Exception as exc:
-        log.error("Failed to extract substrate scope data: %s", exc)
+        log.error("Failed to extract substrate scope data for campaign %s: %s", campaign_id, exc)
         return []

+
 def _extract_single_reaction(
     text: str,
     model,
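The multimodal branch above interleaves the prompt text with decoded PIL images, which is the list form the google-generativeai SDK accepts for generate_content. Condensed into a standalone helper (hypothetical name; assumes base64-encoded PNGs keyed by figure reference, as in the diff):

    import base64
    import io

    import PIL.Image

    def build_content_parts(prompt: str, figure_images: dict) -> list:
        """Interleave the text prompt with [Figure: ...] markers and PIL images."""
        parts = [prompt]
        for fig_ref, fig_b64 in figure_images.items():
            parts.append(f"\n[Figure: {fig_ref}]")
            parts.append(PIL.Image.open(io.BytesIO(base64.b64decode(fig_b64))))
        return parts

    # response = model.generate_content(build_content_parts(campaign_prompt, figure_images))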
@@ -1911,7 +1986,7 @@ def _extract_single_reaction(
         log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
         return None

-def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
+def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping], campaign_id: Optional[str] = None) -> List[ScopeEntry]:
     """Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
     entries: List[ScopeEntry] = []

@@ -2020,6 +2095,7 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
                 conditions=conditions,
                 data_location=item.get("data_location", ""),
                 data_source_type={"all": "text/figure"},
+                campaign_id=campaign_id or item.get("campaign_id", ""),
                 notes=item.get("notes", "")
             )

@@ -2050,7 +2126,10 @@ def get_substrate_scope(
     5. Extract individual reactions with context
     """
     # Step 1: Find locations using captions
-
+    # For backward compatibility, use campaign-specific function with generic parameters
+    locations = identify_scope_locations_for_campaign(
+        caption_text, model, "general", ["all"], debug_dir=debug_dir
+    )
     if locations:
         location_summary = []
         for loc in locations[:3]:
@@ -2111,10 +2190,13 @@ def get_substrate_scope(
                 log.warning("Failed to extract %s image for %s", location_type, figure_ref)

     # Extract all substrate scope data in one call
-
+    # Note: This function is now deprecated in favor of campaign-specific extraction
+    # For backward compatibility, we'll use a generic campaign approach
+    raw_entries = extract_substrate_scope_entries_for_campaign(
         full_text, model, locations,
+        campaign_id="general",
+        enzyme_ids=["all"],
         pdf_paths=pdf_paths,
-        figure_images=figure_images,
         debug_dir=debug_dir
     )

@@ -2158,6 +2240,96 @@ def get_substrate_scope(

     return entries

+
+def get_substrate_scope_for_campaign(
+    caption_text: str,
+    full_text: str,
+    model,
+    *,
+    campaign_id: str,
+    enzyme_ids: List[str],
+    pdf_paths: Optional[List[Path]] = None,
+    debug_dir: str | Path | None = None,
+) -> List[ScopeEntry]:
+    """
+    Campaign-specific substrate scope extraction.
+
+    Like get_substrate_scope but focuses on a specific campaign and its enzymes.
+    Tells Gemini about the specific campaign and that it's okay to return null if
+    no substrate scope data exists for this campaign.
+    """
+    log.info("Starting campaign-specific substrate scope extraction for: %s", campaign_id)
+    log.info("Target enzymes: %s", enzyme_ids)
+
+    # Step 1: Find locations using captions with campaign context
+    locations = identify_scope_locations_for_campaign(
+        caption_text, model, campaign_id, enzyme_ids, debug_dir=debug_dir
+    )
+
+    if not locations:
+        log.info("No substrate scope locations identified for campaign %s", campaign_id)
+        return []
+
+    location_summary = []
+    for loc in locations[:3]:
+        location_summary.append(
+            f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
+            f"confidence: {loc.get('confidence', 0)})"
+        )
+    log.info("Campaign %s - identified %d substrate scope locations: %s",
+             campaign_id, len(locations), ", ".join(location_summary))
+
+    # Step 2: Identify IUPAC sections from SI TOC (reuse existing logic)
+    iupac_sections = identify_iupac_sections(caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    log.info("Campaign %s - identified %d IUPAC sections", campaign_id, len(iupac_sections))
+
+    # Step 3: Extract raw entries with campaign context
+    raw_entries = extract_substrate_scope_entries_for_campaign(
+        full_text, model, locations, campaign_id, enzyme_ids,
+        pdf_paths=pdf_paths, debug_dir=debug_dir
+    )
+
+    if not raw_entries:
+        log.info("No substrate scope entries extracted for campaign %s", campaign_id)
+        return []
+
+    log.info("Campaign %s - extracted %d raw substrate scope entries", campaign_id, len(raw_entries))
+
+    # Step 4: Extract compound mappings (reuse existing logic)
+    figure_images = []
+    if pdf_paths:
+        for pdf_path in pdf_paths:
+            try:
+                figure_images.extend(extract_figure_images(pdf_path))
+            except Exception as e:
+                log.warning("Failed to extract figure images from %s: %s", pdf_path, e)
+
+    # Collect all compound IDs from raw entries
+    all_compound_ids = set()
+    for entry in raw_entries:
+        substrate_ids = entry.get("substrate_ids", [])
+        product_ids = entry.get("product_ids", [])
+        for sid in substrate_ids:
+            all_compound_ids.add(str(sid))
+        for pid in product_ids:
+            all_compound_ids.add(str(pid))
+
+    log.info("Campaign %s - found %d unique compound IDs to map", campaign_id, len(all_compound_ids))
+
+    # Extract compound mappings (reuse existing function)
+    compound_mappings = extract_compound_mappings(full_text, model,
+                                                  pdf_paths=pdf_paths,
+                                                  iupac_sections=iupac_sections,
+                                                  compound_ids=list(all_compound_ids),
+                                                  primary_locations=locations,
+                                                  debug_dir=debug_dir)
+
+    # Step 5: Parse all entries with compound mappings
+    entries = _parse_scope_entries(raw_entries, compound_mappings, campaign_id)
+    log.info("Campaign %s - successfully parsed %d substrate scope entries", campaign_id, len(entries))
+
+    return entries
+
 # === 7. VALIDATION & MERGE ===
 """Validation, duplicate detection, and merging with lineage data."""

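A usage sketch for the new entry point; the campaign ID, enzyme names, and file paths below are placeholders, and get_model(), caption_text, and full_text are assumed to come from the surrounding pipeline:

    from pathlib import Path

    model = get_model()  # provided elsewhere in the module
    entries = get_substrate_scope_for_campaign(
        caption_text, full_text, model,
        campaign_id="campaign_1",           # placeholder
        enzyme_ids=["Parent", "VariantA"],  # placeholder
        pdf_paths=[Path("paper.pdf"), Path("si.pdf")],
        debug_dir="debug/campaign_1",
    )
    # Returns [] when the campaign has no substrate scope data, by design.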
@@ -2353,6 +2525,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'parent_enzyme_id': entry.parent_id or '',
             'mutations': entry.mutations or '',
             'generation': entry.generation if entry.generation is not None else '',
+            'campaign_id': entry.campaign_id or '',
             'protein_sequence': entry.aa_seq or '',
             'nucleotide_sequence': entry.dna_seq or '',
             'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
@@ -2385,7 +2558,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:

     # Define column order
     column_order = [
-        'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
+        'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation', 'campaign_id',
         'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
         'substrate_list', 'substrate_iupac_list',
         'product_list', 'product_iupac_list',
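With campaign_id in the column order, the flattened CSV can be split back into campaigns downstream; a small pandas sketch (the file name is hypothetical):

    import pandas as pd

    df = pd.read_csv("substrate_scope.csv")  # hypothetical run_pipeline output
    for campaign_id, group in df.groupby("campaign_id"):
        print(f"{campaign_id}: {len(group)} scope entries")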
@@ -2447,23 +2620,83 @@ def run_pipeline(
     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()

-    # 3.
-
+    # 3. Check for campaign-based extraction -----------------------------------
+    all_entries = []
+
+    if lineage_csv:
+        import pandas as pd
+        lineage_df = pd.read_csv(lineage_csv)
+
+        # Check if we have campaign_id column - if so, process each campaign separately
+        if 'campaign_id' in lineage_df.columns:
+            campaigns = lineage_df['campaign_id'].unique()
+            log.info("Detected %d campaigns in lineage data - processing each separately", len(campaigns))
+            log.info("Campaigns: %s", campaigns.tolist())
+
+            # Simple campaign context for model reaction awareness
+            campaigns_context_text = f"All campaigns: {campaigns.tolist()}"
+            identify_scope_locations_for_campaign._all_campaigns_context = campaigns_context_text
+            extract_substrate_scope_entries_for_campaign._all_campaigns_context = campaigns_context_text
+
+            for campaign_id in campaigns:
+                log.info("\n" + "="*60)
+                log.info("Processing campaign: %s", campaign_id)
+                log.info("="*60)
+
+                # Get enzymes for this campaign
+                campaign_enzymes = lineage_df[lineage_df['campaign_id'] == campaign_id]
+                if 'enzyme_id' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['enzyme_id'].tolist()
+                elif 'enzyme' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['enzyme'].tolist()
+                elif 'variant_id' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['variant_id'].tolist()
+                else:
+                    raise ValueError("No enzyme ID column found in lineage data")
+
+                log.info("Campaign %s has %d enzymes: %s", campaign_id, len(enzyme_ids), enzyme_ids)
+
+                # Create campaign-specific debug dir
+                campaign_debug_dir = Path(debug_dir) / campaign_id if debug_dir else None
+
+                # Extract substrate scope for this campaign
+                campaign_entries = get_substrate_scope_for_campaign(
+                    caption_text, full_text, model,
+                    campaign_id=campaign_id,
+                    enzyme_ids=enzyme_ids,
+                    pdf_paths=pdf_paths,
+                    debug_dir=campaign_debug_dir
+                )
+
+                if campaign_entries:
+                    log.info("Extracted %d substrate scope entries for campaign %s", len(campaign_entries), campaign_id)
+                    all_entries.extend(campaign_entries)
+                else:
+                    log.info("No substrate scope data found for campaign %s", campaign_id)
+        else:
+            # Original single extraction
+            entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+            all_entries = entries
+    else:
+        # No lineage data - single extraction
+        entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+        all_entries = entries

-    if not
-
+    if not all_entries:
+        log.warning("No substrate scope data extracted from any campaign")
+        all_entries = []  # Allow empty results

     # 4. Merge with lineage if available ---------------------------------------
-    if lineage_csv:
-
+    if lineage_csv and all_entries:
+        all_entries = merge_with_lineage(all_entries, Path(lineage_csv), model)

     # 5. Validate entries ------------------------------------------------------
-    warnings = validate_scope_entries(
+    warnings = validate_scope_entries(all_entries)
     if warnings:
         log.warning("Found %d validation warnings", len(warnings))

     # 6. Convert to DataFrame --------------------------------------------------
-    df_final = _entries_to_dataframe(
+    df_final = _entries_to_dataframe(all_entries)

     # 7. Write CSV if requested ------------------------------------------------
     if output_csv: