debase 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +14 -8
- debase/lineage_format.py +335 -56
- debase/reaction_info_extractor.py +60 -32
- debase/substrate_scope_extractor.py +366 -93
- debase/wrapper.py +37 -11
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/METADATA +1 -1
- debase-0.4.2.dist-info/RECORD +16 -0
- debase-0.4.1.dist-info/RECORD +0 -16
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/WHEEL +0 -0
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/entry_points.txt +0 -0
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/top_level.txt +0 -0
@@ -83,6 +83,7 @@ class ScopeEntry:
     # Metadata
     data_location: Optional[str] = None
     data_source_type: Dict[str, str] = field(default_factory=dict)
+    campaign_id: Optional[str] = None

     # Lineage information (populated during merge)
     parent_id: Optional[str] = None
@@ -312,24 +313,23 @@ def extract_figure_image(pdf_paths: List[Path], figure_ref: str) -> Optional[str
     # Extract just the figure with its caption, avoiding excessive white space
     page_rect = page.rect

-    #
-    # Extract from top of page to just below the caption
+    # Extract the entire page containing the identified location
     fig_top = 0 # Start from top of page
-    fig_bottom =
+    fig_bottom = page_rect.height # Full page height
     fig_left = 0 # Full width
     fig_right = page_rect.width

-    # Extract
+    # Extract the entire page
     clip_rect = fitz.Rect(fig_left, fig_top, fig_right, fig_bottom)
     mat = fitz.Matrix(2, 2) # 2x zoom for better quality
     pix = page.get_pixmap(clip=clip_rect, matrix=mat)

-    log.info("Extracted
+    log.info("Extracted entire page: %.0fx%.0f pixels from page %d",
              pix.width, pix.height, page_num + 1)

     # Convert to PNG
     img_bytes = pix.tobytes("png")
-    log.info("
+    log.info("Converted to PNG: %dx%d pixels from page %d",
              pix.width, pix.height, page_num + 1)

     return b64encode(img_bytes).decode()
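For context on the PyMuPDF calls in this hunk: the new code renders the whole page containing the referenced figure instead of a tightly cropped region. A minimal standalone sketch of the same pattern (full-page clip, 2x zoom, PNG, base64); the helper name and file path below are illustrative, not part of debase:

import fitz  # PyMuPDF
from base64 import b64encode

def render_full_page_b64(pdf_path: str, page_num: int) -> str:
    """Render one whole page to a base64-encoded PNG at 2x zoom (illustrative helper)."""
    doc = fitz.open(pdf_path)
    page = doc[page_num]
    rect = page.rect
    clip = fitz.Rect(0, 0, rect.width, rect.height)            # full page
    pix = page.get_pixmap(clip=clip, matrix=fitz.Matrix(2, 2))  # 2x zoom
    data = pix.tobytes("png")
    doc.close()
    return b64encode(data).decode()

# e.g. render_full_page_b64("paper.pdf", 3) for the page that holds the figure of interest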
@@ -974,25 +974,73 @@ Return as JSON:

 # ---- 6.2 Helper functions -------------------------------------------------

-
+
+
+def identify_scope_locations_for_campaign(
     text: str,
     model,
+    campaign_id: str,
+    enzyme_ids: List[str],
     *,
     max_results: int = 5,
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
-    """Ask Gemini where substrate scope data is located."""
-
+    """Ask Gemini where substrate scope data is located for a specific campaign."""
+
+    # Simple model reaction context
+    model_reactions_context = """
+IMPORTANT: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
+Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
+"""
+
+    # Create campaign-specific prompt
+    campaign_prompt = f"""
+You are an expert reader of biocatalysis manuscripts.
+Analyze this paper and identify all locations containing substrate scope data for the specific campaign: "{campaign_id}".
+
+CAMPAIGN CONTEXT:
+- Campaign ID: {campaign_id}
+- Target enzymes: {', '.join(enzyme_ids)}
+
+{model_reactions_context}
+
+Your task is to:
+1. Identify locations (tables, figures, text) containing substrate scope reaction data specifically for this campaign
+2. Focus only on substrate scope studies involving the enzymes: {', '.join(enzyme_ids)}
+3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
+   - Model reactions are those used to evolve/optimize the enzymes
+   - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
+4. Note that not all campaigns have substrate scope data - it's okay to return empty results if no substrate scope data exists for this campaign
+5. Determine which enzyme variants from this campaign were tested in substrate scope studies
+
+Return your analysis as JSON array (max {max_results} locations, or empty array if no substrate scope data for this campaign):
+[
+  {{
+    "location": "Description of where the data is found",
+    "type": "table|figure|text",
+    "confidence": 0.0-1.0,
+    "enzyme_variants": ["list of enzyme IDs found"],
+    "substrates_tested": ["list of substrates if identifiable"],
+    "campaign_match": true/false,
+    "is_substrate_scope": true/false,
+    "model_reaction_excluded": "reason why this is not a model reaction"
+  }}
+]
+
+Important: Only return locations that contain TRUE substrate scope data (not model reactions) for the specified campaign and enzymes. If no substrate scope data exists for this campaign, return an empty array.
+"""
+
+    prompt = campaign_prompt + "\n\nTEXT:\n" + text[:15_000]
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
             model,
             prompt,
             debug_dir=debug_dir,
-            tag="
+            tag=f"scope_locate_{campaign_id}",
         )
     except Exception as exc: # pragma: no cover
-        log.warning("
+        log.warning("identify_scope_locations_for_campaign(%s): %s", campaign_id, exc)
     return locs if isinstance(locs, list) else []

 def identify_iupac_sections(
@@ -1679,16 +1727,18 @@ def extract_compound_mappings(
     log.info("Total compound mappings extracted: %d", len(mappings))
     return mappings

-def
+def extract_substrate_scope_entries_for_campaign(
     text: str,
     model,
     locations: List[dict],
+    campaign_id: str,
+    enzyme_ids: List[str],
     *,
     pdf_paths: List[Path] = None,
-    figure_images: Dict[str, str] = None,
     debug_dir: str | Path | None = None,
 ) -> List[dict]:
-    """Extract
+    """Extract substrate scope data specifically for a campaign."""
+
     extraction_hints = ""
     all_refs = []

@@ -1700,124 +1750,189 @@ def extract_all_substrate_scope_data(
             location_strs.append(loc_str)
             all_refs.append(loc_str)

-        extraction_hints = f"\nSubstrate scope locations: {', '.join(location_strs)}"
-
-    # Collect all enzyme variants
-    all_variants = []
-    for loc in locations:
-        variants = loc.get('enzyme_variants_tested', [])
-        all_variants.extend(variants)
+        extraction_hints = f"\nSubstrate scope locations for campaign {campaign_id}: {', '.join(location_strs)}"

-
-
-        extraction_hints += f"\nEnzyme variants: {', '.join(unique_variants)}"
+    # Focus on campaign-specific enzyme variants
+    extraction_hints += f"\nTarget enzymes for this campaign: {', '.join(enzyme_ids)}"

-    # Extract text from ALL identified locations
+    # Extract text from ALL identified locations (like the original function did)
     extraction_texts = []
+    figure_images = {}

     for ref in all_refs:
         if ref and pdf_paths:
             ref_text = _extract_text_around_reference(pdf_paths, ref, context_chars=5000)
             if ref_text:
-                # Add figure image notation if available
-                if figure_images and ref in figure_images:
-                    ref_text = f"[FIGURE IMAGE EXTRACTED: {ref}]\n\n{ref_text}"
                 extraction_texts.append(f"\n=== Data from {ref} ===\n{ref_text}")
+
+            # Extract figure images for this reference (crop page around figure)
+            try:
+                fig_base64 = extract_figure_image(pdf_paths, ref)
+                if fig_base64:
+                    figure_images[ref] = fig_base64
+                    log.info("Campaign %s - extracted cropped figure image for %s", campaign_id, ref)
+
+                    # Save the figure image to debug folder
+                    if debug_dir:
+                        debug_path = Path(debug_dir)
+                        debug_path.mkdir(parents=True, exist_ok=True)
+                        # Clean ref for filename
+                        safe_ref = re.sub(r'[^\w\s-]', '', ref).strip().replace(' ', '_')
+                        image_file = debug_path / f"figure_{safe_ref}_{campaign_id}.png"
+
+                        # Decode and save the image
+                        import base64
+                        with open(image_file, 'wb') as f:
+                            f.write(base64.b64decode(fig_base64))
+                        log.info("Campaign %s - saved figure image to %s", campaign_id, image_file)
+            except Exception as e:
+                log.warning("Campaign %s - failed to extract figure for %s: %s", campaign_id, ref, e)

     if not extraction_texts:
         extraction_texts = [text[:50_000]]

     extraction_text = "\n\n".join(extraction_texts)

-
-
-
-
-
+    # Simple model reaction context
+    model_reactions_context = """
+CRITICAL: Substrate scope reactions are those that test DIFFERENT substrates than the model reactions used for evolution.
+Model reactions are used to evolve/optimize enzymes. Substrate scope reactions test evolved enzymes on different substrates.
+"""

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Create campaign-specific prompt
+    campaign_prompt = f"""
+You are an expert reader of biocatalysis manuscripts.
+Extract ALL substrate scope reaction data specifically for campaign: "{campaign_id}".
+
+CAMPAIGN CONTEXT:
+- Campaign ID: {campaign_id}
+- Target enzymes: {', '.join(enzyme_ids)}
+
+{model_reactions_context}
+
+IMPORTANT INSTRUCTIONS:
+1. Focus ONLY on substrate scope data for the specified campaign and enzymes
+2. Extract reactions involving enzymes: {', '.join(enzyme_ids)}
+3. CRITICAL: Distinguish substrate scope studies from model reactions used for evolution
+   - Model reactions are those used to evolve/optimize the enzymes (listed above)
+   - Substrate scope reactions test the evolved enzymes on DIFFERENT substrates
+   - DO NOT include model reactions in substrate scope data
+4. Not all campaigns have substrate scope data - if no substrate scope data exists for this campaign, return an empty array
+5. Include all relevant reaction performance data (yield, ee, ttn, etc.)
+
+{extraction_hints}
+
+Return your analysis as JSON in this format:
+{{
+  "substrate_scope_data": [
+    {{
+      "enzyme_id": "enzyme identifier",
+      "substrate_ids": ["substrate identifiers"],
+      "product_ids": ["product identifiers"],
+      "substrate_names": ["substrate names"],
+      "product_names": ["product names"],
+      "yield_percent": number or null,
+      "ee": number or null,
+      "ttn": number or null,
+      "temperature": "temperature" or null,
+      "ph": "pH" or null,
+      "buffer": "buffer" or null,
+      "substrate_concentration": "concentration" or null,
+      "data_location": "where this data was found",
+      "campaign_id": "{campaign_id}",
+      "is_substrate_scope": true,
+      "model_reaction_excluded": "reason why this is not a model reaction"
+    }}
+  ]
+}}
+
+Important: Only return TRUE substrate scope data (not model reactions) for the specified campaign. If no substrate scope data exists for this campaign, return {{"substrate_scope_data": []}}.
+"""

     try:
-        # Use multimodal
-        if
-
-            log.info("=== GEMINI MULTIMODAL API CALL: SUBSTRATE_SCOPE_WITH_FIGURES ===")
-            log.info("Text prompt length: %d characters", len(prompt))
-            log.info("Number of images: %d", len(content_parts) - 1)
-            log.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
+        # Use multimodal extraction if we have figure images
+        if figure_images:
+            log.info("Campaign %s - using multimodal extraction with %d figure images", campaign_id, len(figure_images))

-            #
+            # Prepare multimodal content
+            import PIL.Image
+            import io
+            import base64
+
+            content_parts = [campaign_prompt + "\n\nTEXT:\n" + extraction_text]
+
+            for fig_ref, fig_base64 in figure_images.items():
+                try:
+                    # Convert base64 to PIL Image
+                    img_bytes = base64.b64decode(fig_base64)
+                    image = PIL.Image.open(io.BytesIO(img_bytes))
+                    content_parts.append(f"\n[Figure: {fig_ref}]")
+                    content_parts.append(image)
+                    log.info("Campaign %s - added figure %s to multimodal prompt", campaign_id, fig_ref)
+                except Exception as e:
+                    log.warning("Campaign %s - failed to add figure %s: %s", campaign_id, fig_ref, e)
+
+            # Save debug info
             if debug_dir:
                 debug_path = Path(debug_dir)
                 debug_path.mkdir(parents=True, exist_ok=True)
-                prompt_file = debug_path / f"
+                prompt_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_prompt.txt"

-
-                prompt_info = f"=== PROMPT FOR SUBSTRATE_SCOPE_MULTIMODAL ===\n"
+                prompt_info = f"=== CAMPAIGN {campaign_id} MULTIMODAL PROMPT ===\n"
                 prompt_info += f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n"
-                prompt_info += f"Text length: {len(
-                prompt_info += f"Images included: {len(
+                prompt_info += f"Text length: {len(extraction_text)} characters\n"
+                prompt_info += f"Images included: {len(figure_images)}\n"
                 for fig_ref in figure_images.keys():
                     prompt_info += f"  - {fig_ref}\n"
                 prompt_info += "="*80 + "\n\n"
-                prompt_info +=
+                prompt_info += campaign_prompt + "\n\nTEXT:\n" + extraction_text

-
-
+                with open(prompt_file, 'w') as f:
+                    f.write(prompt_info)
+                log.info("Campaign %s - prompt saved to: %s", campaign_id, prompt_file)

-
+            # Call multimodal API
             response = model.generate_content(content_parts)
-            raw_text =
-
-            # Log and save response
-            log.info("Gemini multimodal response length: %d characters", len(raw_text))
-            log.info("First 500 chars of response:\n%s\n...(truncated)", raw_text[:500])
+            raw_text = response.text.strip()

+            # Save response
             if debug_dir:
-
-                response_file = debug_path / f"substrate_scope_multimodal_response_{int(time.time())}.txt"
+                response_file = debug_path / f"substrate_scope_{campaign_id}_multimodal_response.txt"
                 with open(response_file, 'w') as f:
-                    f.write(f"===
+                    f.write(f"=== CAMPAIGN {campaign_id} MULTIMODAL RESPONSE ===\n")
                     f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                     f.write(f"Length: {len(raw_text)} characters\n")
                     f.write("="*80 + "\n\n")
                     f.write(raw_text)
-                log.info("
+                log.info("Campaign %s - response saved to: %s", campaign_id, response_file)

-            # Parse JSON
+            # Parse JSON
             import json
             data = json.loads(raw_text.strip('```json').strip('```').strip())
         else:
+            log.info("Campaign %s - using text-only extraction", campaign_id)
             data = generate_json_with_retry(
                 model,
-
+                campaign_prompt + "\n\nTEXT:\n" + extraction_text,
                 debug_dir=debug_dir,
-                tag="
+                tag=f"substrate_scope_{campaign_id}",
             )

         scope_data = data.get("substrate_scope_data", [])
-
+
+        # Add campaign_id to each entry if not present
+        for entry in scope_data:
+            if "campaign_id" not in entry:
+                entry["campaign_id"] = campaign_id
+
+        log.info("Campaign %s - extracted %d substrate scope entries", campaign_id, len(scope_data))
         return scope_data

     except Exception as exc:
-        log.error("Failed to extract substrate scope data: %s", exc)
+        log.error("Failed to extract substrate scope data for campaign %s: %s", campaign_id, exc)
         return []

+
 def _extract_single_reaction(
     text: str,
     model,
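The multimodal branch above builds content_parts as a plain list of strings and PIL images and hands it to model.generate_content. A stripped-down sketch of that call pattern, assuming the google-generativeai SDK and placeholder API key, model name, and image file:

import google.generativeai as genai
import PIL.Image

genai.configure(api_key="YOUR_API_KEY")            # placeholder key
model = genai.GenerativeModel("gemini-1.5-flash")  # assumed model name

image = PIL.Image.open("figure_2a.png")            # hypothetical cropped figure image

content_parts = [
    "Extract the substrate scope data shown in this figure as JSON.",
    image,
]
response = model.generate_content(content_parts)   # text and image parts in one request
print(response.text)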
@@ -1871,7 +1986,7 @@ def _extract_single_reaction(
         log.error("Failed to extract reaction %s-%s: %s", enzyme_id, substrate_name, exc)
         return None

-def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping]) -> List[ScopeEntry]:
+def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, CompoundMapping], campaign_id: Optional[str] = None) -> List[ScopeEntry]:
     """Convert raw JSON to ScopeEntry objects with IUPAC enhancement."""
     entries: List[ScopeEntry] = []

@@ -1980,6 +2095,7 @@ def _parse_scope_entries(data: List[dict], compound_mappings: Dict[str, Compound
             conditions=conditions,
             data_location=item.get("data_location", ""),
             data_source_type={"all": "text/figure"},
+            campaign_id=campaign_id or item.get("campaign_id", ""),
             notes=item.get("notes", "")
         )

@@ -2010,7 +2126,10 @@ def get_substrate_scope(
     5. Extract individual reactions with context
     """
     # Step 1: Find locations using captions
-
+    # For backward compatibility, use campaign-specific function with generic parameters
+    locations = identify_scope_locations_for_campaign(
+        caption_text, model, "general", ["all"], debug_dir=debug_dir
+    )
     if locations:
         location_summary = []
         for loc in locations[:3]:
@@ -2071,10 +2190,13 @@ def get_substrate_scope(
                 log.warning("Failed to extract %s image for %s", location_type, figure_ref)

     # Extract all substrate scope data in one call
-
+    # Note: This function is now deprecated in favor of campaign-specific extraction
+    # For backward compatibility, we'll use a generic campaign approach
+    raw_entries = extract_substrate_scope_entries_for_campaign(
         full_text, model, locations,
+        campaign_id="general",
+        enzyme_ids=["all"],
         pdf_paths=pdf_paths,
-        figure_images=figure_images,
         debug_dir=debug_dir
     )

@@ -2118,6 +2240,96 @@ def get_substrate_scope(

     return entries

+
+def get_substrate_scope_for_campaign(
+    caption_text: str,
+    full_text: str,
+    model,
+    *,
+    campaign_id: str,
+    enzyme_ids: List[str],
+    pdf_paths: Optional[List[Path]] = None,
+    debug_dir: str | Path | None = None,
+) -> List[ScopeEntry]:
+    """
+    Campaign-specific substrate scope extraction.
+
+    Like get_substrate_scope but focuses on a specific campaign and its enzymes.
+    Tells Gemini about the specific campaign and that it's okay to return null if
+    no substrate scope data exists for this campaign.
+    """
+    log.info("Starting campaign-specific substrate scope extraction for: %s", campaign_id)
+    log.info("Target enzymes: %s", enzyme_ids)
+
+    # Step 1: Find locations using captions with campaign context
+    locations = identify_scope_locations_for_campaign(
+        caption_text, model, campaign_id, enzyme_ids, debug_dir=debug_dir
+    )
+
+    if not locations:
+        log.info("No substrate scope locations identified for campaign %s", campaign_id)
+        return []
+
+    location_summary = []
+    for loc in locations[:3]:
+        location_summary.append(
+            f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, "
+            f"confidence: {loc.get('confidence', 0)})"
+        )
+    log.info("Campaign %s - identified %d substrate scope locations: %s",
+             campaign_id, len(locations), ", ".join(location_summary))
+
+    # Step 2: Identify IUPAC sections from SI TOC (reuse existing logic)
+    iupac_sections = identify_iupac_sections(caption_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    log.info("Campaign %s - identified %d IUPAC sections", campaign_id, len(iupac_sections))
+
+    # Step 3: Extract raw entries with campaign context
+    raw_entries = extract_substrate_scope_entries_for_campaign(
+        full_text, model, locations, campaign_id, enzyme_ids,
+        pdf_paths=pdf_paths, debug_dir=debug_dir
+    )
+
+    if not raw_entries:
+        log.info("No substrate scope entries extracted for campaign %s", campaign_id)
+        return []
+
+    log.info("Campaign %s - extracted %d raw substrate scope entries", campaign_id, len(raw_entries))
+
+    # Step 4: Extract compound mappings (reuse existing logic)
+    figure_images = []
+    if pdf_paths:
+        for pdf_path in pdf_paths:
+            try:
+                figure_images.extend(extract_figure_images(pdf_path))
+            except Exception as e:
+                log.warning("Failed to extract figure images from %s: %s", pdf_path, e)
+
+    # Collect all compound IDs from raw entries
+    all_compound_ids = set()
+    for entry in raw_entries:
+        substrate_ids = entry.get("substrate_ids", [])
+        product_ids = entry.get("product_ids", [])
+        for sid in substrate_ids:
+            all_compound_ids.add(str(sid))
+        for pid in product_ids:
+            all_compound_ids.add(str(pid))
+
+    log.info("Campaign %s - found %d unique compound IDs to map", campaign_id, len(all_compound_ids))
+
+    # Extract compound mappings (reuse existing function)
+    compound_mappings = extract_compound_mappings(full_text, model,
+                                                  pdf_paths=pdf_paths,
+                                                  iupac_sections=iupac_sections,
+                                                  compound_ids=list(all_compound_ids),
+                                                  primary_locations=locations,
+                                                  debug_dir=debug_dir)
+
+    # Step 5: Parse all entries with compound mappings
+    entries = _parse_scope_entries(raw_entries, compound_mappings, campaign_id)
+    log.info("Campaign %s - successfully parsed %d substrate scope entries", campaign_id, len(entries))
+
+    return entries
+
 # === 7. VALIDATION & MERGE ===
 """Validation, duplicate detection, and merging with lineage data."""

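A minimal sketch of how the new get_substrate_scope_for_campaign entry point might be called for one campaign. The manuscript text, campaign ID, enzyme IDs, and PDF paths are placeholders, and it assumes get_model and the function are importable from debase.substrate_scope_extractor (the module listed in the file summary above):

from pathlib import Path

from debase.substrate_scope_extractor import (
    get_model,
    get_substrate_scope_for_campaign,
)

model = get_model()  # Gemini client, as in run_pipeline
entries = get_substrate_scope_for_campaign(
    "...figure/table captions...",   # caption_text placeholder
    "...full manuscript text...",    # full_text placeholder
    model,
    campaign_id="campaign_1",                       # hypothetical campaign
    enzyme_ids=["ParentEnzyme", "VariantA1"],       # hypothetical enzyme IDs
    pdf_paths=[Path("paper.pdf"), Path("si.pdf")],  # hypothetical paths
    debug_dir="debug/campaign_1",
)
print(f"Extracted {len(entries)} substrate scope entries")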
@@ -2313,6 +2525,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:
             'parent_enzyme_id': entry.parent_id or '',
             'mutations': entry.mutations or '',
             'generation': entry.generation if entry.generation is not None else '',
+            'campaign_id': entry.campaign_id or '',
             'protein_sequence': entry.aa_seq or '',
             'nucleotide_sequence': entry.dna_seq or '',
             'sequence_confidence': str(entry.confidence) if entry.confidence is not None else '',
@@ -2345,7 +2558,7 @@ def _entries_to_dataframe(entries: List[ScopeEntry]) -> pd.DataFrame:

     # Define column order
     column_order = [
-        'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation',
+        'enzyme_id', 'parent_enzyme_id', 'mutations', 'generation', 'campaign_id',
         'protein_sequence', 'nucleotide_sequence', 'sequence_confidence', 'flag',
         'substrate_list', 'substrate_iupac_list',
         'product_list', 'product_iupac_list',
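The run_pipeline hunk that follows drives extraction per campaign by reading the lineage CSV, grouping on campaign_id, and falling back across several enzyme-ID column names. A small self-contained sketch of that grouping logic on a made-up lineage table (data and column values are illustrative only):

import pandas as pd

# Made-up lineage rows; the real input is the lineage CSV produced earlier in the pipeline.
lineage_df = pd.DataFrame({
    "campaign_id": ["c1", "c1", "c2"],
    "enzyme_id": ["ParentA", "VariantA1", "ParentB"],
})

for campaign_id in lineage_df["campaign_id"].unique():
    campaign_enzymes = lineage_df[lineage_df["campaign_id"] == campaign_id]
    # Same fallback order as run_pipeline: enzyme_id, then enzyme, then variant_id.
    for col in ("enzyme_id", "enzyme", "variant_id"):
        if col in campaign_enzymes.columns:
            enzyme_ids = campaign_enzymes[col].tolist()
            break
    else:
        raise ValueError("No enzyme ID column found in lineage data")
    print(campaign_id, enzyme_ids)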
@@ -2407,23 +2620,83 @@ def run_pipeline(
     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()

-    # 3.
-
+    # 3. Check for campaign-based extraction -----------------------------------
+    all_entries = []
+
+    if lineage_csv:
+        import pandas as pd
+        lineage_df = pd.read_csv(lineage_csv)
+
+        # Check if we have campaign_id column - if so, process each campaign separately
+        if 'campaign_id' in lineage_df.columns:
+            campaigns = lineage_df['campaign_id'].unique()
+            log.info("Detected %d campaigns in lineage data - processing each separately", len(campaigns))
+            log.info("Campaigns: %s", campaigns.tolist())
+
+            # Simple campaign context for model reaction awareness
+            campaigns_context_text = f"All campaigns: {campaigns.tolist()}"
+            identify_scope_locations_for_campaign._all_campaigns_context = campaigns_context_text
+            extract_substrate_scope_entries_for_campaign._all_campaigns_context = campaigns_context_text
+
+            for campaign_id in campaigns:
+                log.info("\n" + "="*60)
+                log.info("Processing campaign: %s", campaign_id)
+                log.info("="*60)
+
+                # Get enzymes for this campaign
+                campaign_enzymes = lineage_df[lineage_df['campaign_id'] == campaign_id]
+                if 'enzyme_id' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['enzyme_id'].tolist()
+                elif 'enzyme' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['enzyme'].tolist()
+                elif 'variant_id' in campaign_enzymes.columns:
+                    enzyme_ids = campaign_enzymes['variant_id'].tolist()
+                else:
+                    raise ValueError("No enzyme ID column found in lineage data")
+
+                log.info("Campaign %s has %d enzymes: %s", campaign_id, len(enzyme_ids), enzyme_ids)
+
+                # Create campaign-specific debug dir
+                campaign_debug_dir = Path(debug_dir) / campaign_id if debug_dir else None
+
+                # Extract substrate scope for this campaign
+                campaign_entries = get_substrate_scope_for_campaign(
+                    caption_text, full_text, model,
+                    campaign_id=campaign_id,
+                    enzyme_ids=enzyme_ids,
+                    pdf_paths=pdf_paths,
+                    debug_dir=campaign_debug_dir
+                )
+
+                if campaign_entries:
+                    log.info("Extracted %d substrate scope entries for campaign %s", len(campaign_entries), campaign_id)
+                    all_entries.extend(campaign_entries)
+                else:
+                    log.info("No substrate scope data found for campaign %s", campaign_id)
+        else:
+            # Original single extraction
+            entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+            all_entries = entries
+    else:
+        # No lineage data - single extraction
+        entries = get_substrate_scope(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+        all_entries = entries

-    if not
-
+    if not all_entries:
+        log.warning("No substrate scope data extracted from any campaign")
+        all_entries = [] # Allow empty results

     # 4. Merge with lineage if available ---------------------------------------
-    if lineage_csv:
-
+    if lineage_csv and all_entries:
+        all_entries = merge_with_lineage(all_entries, Path(lineage_csv), model)

     # 5. Validate entries ------------------------------------------------------
-    warnings = validate_scope_entries(
+    warnings = validate_scope_entries(all_entries)
     if warnings:
         log.warning("Found %d validation warnings", len(warnings))

     # 6. Convert to DataFrame --------------------------------------------------
-    df_final = _entries_to_dataframe(
+    df_final = _entries_to_dataframe(all_entries)

     # 7. Write CSV if requested ------------------------------------------------
     if output_csv: