debase 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {debase-0.6.0/src/debase.egg-info → debase-0.6.2}/PKG-INFO +1 -1
- debase-0.6.2/manuscript/DEBase_LLM_Validater.ipynb +197 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase/_version.py +1 -1
- debase-0.6.2/src/debase/campaign_utils.py +146 -0
- debase-0.6.2/src/debase/caption_pattern.py +44 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase/cleanup_sequence.py +34 -6
- {debase-0.6.0 → debase-0.6.2}/src/debase/enzyme_lineage_extractor.py +481 -106
- {debase-0.6.0 → debase-0.6.2}/src/debase/lineage_format.py +44 -1
- {debase-0.6.0 → debase-0.6.2}/src/debase/reaction_info_extractor.py +479 -135
- {debase-0.6.0 → debase-0.6.2}/src/debase/substrate_scope_extractor.py +207 -80
- {debase-0.6.0 → debase-0.6.2}/src/debase/wrapper.py +3 -3
- {debase-0.6.0 → debase-0.6.2/src/debase.egg-info}/PKG-INFO +1 -1
- {debase-0.6.0 → debase-0.6.2}/src/debase.egg-info/SOURCES.txt +3 -0
- {debase-0.6.0 → debase-0.6.2}/.gitignore +0 -0
- {debase-0.6.0 → debase-0.6.2}/LICENSE +0 -0
- {debase-0.6.0 → debase-0.6.2}/MANIFEST.in +0 -0
- {debase-0.6.0 → debase-0.6.2}/README.md +0 -0
- {debase-0.6.0 → debase-0.6.2}/environment.yml +0 -0
- {debase-0.6.0 → debase-0.6.2}/pyproject.toml +0 -0
- {debase-0.6.0 → debase-0.6.2}/setup.cfg +0 -0
- {debase-0.6.0 → debase-0.6.2}/setup.py +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/__init__.py +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase/__init__.py +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase/__main__.py +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase/build_db.py +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase.egg-info/dependency_links.txt +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase.egg-info/entry_points.txt +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase.egg-info/requires.txt +0 -0
- {debase-0.6.0 → debase-0.6.2}/src/debase.egg-info/top_level.txt +0 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
{
|
2
|
+
"cells": [
|
3
|
+
{
|
4
|
+
"cell_type": "markdown",
|
5
|
+
"id": "1714f461-f0c0-4a33-8b81-54c40ce47b80",
|
6
|
+
"metadata": {},
|
7
|
+
"source": [
|
8
|
+
"# Test all the downloaded papers against the LLM extractor\n",
|
9
|
+
"\n",
|
10
|
+
"Command: \n",
|
11
|
+
"```\n",
|
12
|
+
"debase --manuscript \"downloaded_papers/Stereospecific Enzymatic Conversion of Boronic Acids to Amines.pdf\" --si \"downloaded_papers/si-Stereospecific Enzymatic Conversion of Boronic Acids to Amines.pdf\" --output \"LLM/Stereospecific Enzymatic Conversion of Boronic Acids to Amines.csv\" --keep-intermediates\n",
|
13
|
+
"```"
|
14
|
+
]
|
15
|
+
},
|
16
|
+
{
|
17
|
+
"cell_type": "code",
|
18
|
+
"execution_count": null,
|
19
|
+
"id": "a77e19f4-ef52-469d-8a15-6816101fa58b",
|
20
|
+
"metadata": {},
|
21
|
+
"outputs": [
|
22
|
+
{
|
23
|
+
"name": "stderr",
|
24
|
+
"output_type": "stream",
|
25
|
+
"text": [
|
26
|
+
"2025-07-16 09:04:29 [WARNING] debase.lineage_format: Local PubChem DB not found at /Users/arianemora/miniconda3/envs/debase/lib/python3.11/data/iupac2smiles.db\n",
|
27
|
+
"usage: debase [-h] [--manuscript MANUSCRIPT] [--si SI] [--output OUTPUT]\n",
|
28
|
+
" [--keep-intermediates] [--debug-dir DEBUG_DIR]\n",
|
29
|
+
"debase: error: SI not found: downloaded_papers/si-Enzymatic Nitrogen Insertion into Unactivated C-H Bonds.pdf\n"
|
30
|
+
]
|
31
|
+
},
|
32
|
+
{
|
33
|
+
"name": "stdout",
|
34
|
+
"output_type": "stream",
|
35
|
+
"text": [
|
36
|
+
"TIME: 0.6438441276550293\n"
|
37
|
+
]
|
38
|
+
},
|
39
|
+
{
|
40
|
+
"name": "stderr",
|
41
|
+
"output_type": "stream",
|
42
|
+
"text": [
|
43
|
+
"2025-07-16 09:04:30 [WARNING] debase.lineage_format: Local PubChem DB not found at /Users/arianemora/miniconda3/envs/debase/lib/python3.11/data/iupac2smiles.db\n",
|
44
|
+
"2025-07-16 09:04:30,049 - INFO - ============================================================\n",
|
45
|
+
"2025-07-16 09:04:30,049 - INFO - Starting DEBase Enzyme Analysis Pipeline\n",
|
46
|
+
"2025-07-16 09:04:30,049 - INFO - Manuscript: downloaded_papers/Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.pdf\n",
|
47
|
+
"2025-07-16 09:04:30,049 - INFO - SI: downloaded_papers/si-Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.pdf\n",
|
48
|
+
"2025-07-16 09:04:30,049 - INFO - Output: LLM/manuscript_1/Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.csv\n",
|
49
|
+
"2025-07-16 09:04:30,049 - INFO - Log file: LLM/manuscript_1/debase_pipeline_20250716_090430.log\n",
|
50
|
+
"2025-07-16 09:04:30,049 - INFO - ============================================================\n",
|
51
|
+
"2025-07-16 09:04:30,049 - INFO - \n",
|
52
|
+
"[Step 1/5] Extracting enzyme lineage...\n",
|
53
|
+
"2025-07-16 09:04:30,049 - INFO - Extracting enzyme lineage from Nitrene Transfer Catalyzed by a Non-Heme Iron Enzyme and Enhanced by Non-Native Small-Molecule Ligan.pdf\n",
|
54
|
+
"2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: Loaded 11495 chars of captions for identification and 80929 chars of full text for extraction\n",
|
55
|
+
"2025-07-16 09:04:30,454 - INFO - Loaded 11495 chars of captions for identification and 80929 chars of full text for extraction\n",
|
56
|
+
"2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: === GEMINI API CALL: CAMPAIGNS ===\n",
|
57
|
+
"2025-07-16 09:04:30,454 - INFO - === GEMINI API CALL: CAMPAIGNS ===\n",
|
58
|
+
"2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: Prompt length: 82371 characters\n",
|
59
|
+
"2025-07-16 09:04:30,454 - INFO - Prompt length: 82371 characters\n",
|
60
|
+
"2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: First 500 chars of prompt:\n",
|
61
|
+
"You are an expert reader of protein engineering manuscripts.\n",
|
62
|
+
"Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.\n",
|
63
|
+
"\n",
|
64
|
+
"Each campaign represents a separate evolutionary lineage targeting different:\n",
|
65
|
+
"- Model reactions (e.g., different chemical transformations)\n",
|
66
|
+
"- Substrate scopes\n",
|
67
|
+
"- Activities (e.g., different enzymatic reactions)\n",
|
68
|
+
"\n",
|
69
|
+
"Look for:\n",
|
70
|
+
"1. Different model substrates/products mentioned (e.g., different substrate/product pairs)\n",
|
71
|
+
"2. Distinct enzyme lineage names (\n",
|
72
|
+
"...(truncated)\n",
|
73
|
+
"2025-07-16 09:04:30,454 - INFO - First 500 chars of prompt:\n",
|
74
|
+
"You are an expert reader of protein engineering manuscripts.\n",
|
75
|
+
"Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.\n",
|
76
|
+
"\n",
|
77
|
+
"Each campaign represents a separate evolutionary lineage targeting different:\n",
|
78
|
+
"- Model reactions (e.g., different chemical transformations)\n",
|
79
|
+
"- Substrate scopes\n",
|
80
|
+
"- Activities (e.g., different enzymatic reactions)\n",
|
81
|
+
"\n",
|
82
|
+
"Look for:\n",
|
83
|
+
"1. Different model substrates/products mentioned (e.g., different substrate/product pairs)\n",
|
84
|
+
"2. Distinct enzyme lineage names (\n",
|
85
|
+
"...(truncated)\n",
|
86
|
+
"2025-07-16 09:04:30 [INFO] debase.enzyme_lineage_extractor: Calling Gemini API (attempt 1/4)...\n",
|
87
|
+
"2025-07-16 09:04:30,454 - INFO - Calling Gemini API (attempt 1/4)...\n",
|
88
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Gemini response length: 2188 characters\n",
|
89
|
+
"2025-07-16 09:04:49,096 - INFO - Gemini response length: 2188 characters\n",
|
90
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: First 500 chars of response:\n",
|
91
|
+
"```json\n",
|
92
|
+
"[\n",
|
93
|
+
" {\n",
|
94
|
+
" \"campaign_id\": \"ps_efe_olefin_aziridination_evolution\",\n",
|
95
|
+
" \"campaign_name\": \"PsEFE Olefin Aziridination Evolution\",\n",
|
96
|
+
" \"description\": \"Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\",\n",
|
97
|
+
" \"model_substrate\": \"styrene and p-toluenesulfonyl azide\",\n",
|
98
|
+
" \"model_product\": \"2-phenyl-1-(p-toluenesulfonyl)aziridine\",\n",
|
99
|
+
" \"\n",
|
100
|
+
"...(truncated)\n",
|
101
|
+
"2025-07-16 09:04:49,096 - INFO - First 500 chars of response:\n",
|
102
|
+
"```json\n",
|
103
|
+
"[\n",
|
104
|
+
" {\n",
|
105
|
+
" \"campaign_id\": \"ps_efe_olefin_aziridination_evolution\",\n",
|
106
|
+
" \"campaign_name\": \"PsEFE Olefin Aziridination Evolution\",\n",
|
107
|
+
" \"description\": \"Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\",\n",
|
108
|
+
" \"model_substrate\": \"styrene and p-toluenesulfonyl azide\",\n",
|
109
|
+
" \"model_product\": \"2-phenyl-1-(p-toluenesulfonyl)aziridine\",\n",
|
110
|
+
" \"\n",
|
111
|
+
"...(truncated)\n",
|
112
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Successfully parsed JSON response\n",
|
113
|
+
"2025-07-16 09:04:49,096 - INFO - Successfully parsed JSON response\n",
|
114
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Identified campaign: PsEFE Olefin Aziridination Evolution (ps_efe_olefin_aziridination_evolution)\n",
|
115
|
+
"2025-07-16 09:04:49,096 - INFO - Identified campaign: PsEFE Olefin Aziridination Evolution (ps_efe_olefin_aziridination_evolution)\n",
|
116
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Identified campaign: PsEFE Nitrene C-H Insertion Screening and Optimization (ps_efe_nitrene_ch_insertion_screening)\n",
|
117
|
+
"2025-07-16 09:04:49,096 - INFO - Identified campaign: PsEFE Nitrene C-H Insertion Screening and Optimization (ps_efe_nitrene_ch_insertion_screening)\n",
|
118
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Identified 2 distinct campaigns\n",
|
119
|
+
"2025-07-16 09:04:49,096 - INFO - Identified 2 distinct campaigns\n",
|
120
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: - PsEFE Olefin Aziridination Evolution: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
|
121
|
+
"2025-07-16 09:04:49,096 - INFO - - PsEFE Olefin Aziridination Evolution: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
|
122
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: - PsEFE Nitrene C-H Insertion Screening and Optimization: Screening and characterization of PsEFE variants (derived from the aziridination evolution) for intramolecular nitrene C-H bond insertion, identifying variants with enhanced activity, chemoselectivity, and enantioselectivity, particularly with N-oxalylglycine as a ligand.\n",
|
123
|
+
"2025-07-16 09:04:49,096 - INFO - - PsEFE Nitrene C-H Insertion Screening and Optimization: Screening and characterization of PsEFE variants (derived from the aziridination evolution) for intramolecular nitrene C-H bond insertion, identifying variants with enhanced activity, chemoselectivity, and enantioselectivity, particularly with N-oxalylglycine as a ligand.\n",
|
124
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Using campaign-aware location identification\n",
|
125
|
+
"2025-07-16 09:04:49,096 - INFO - Using campaign-aware location identification\n",
|
126
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: \n",
|
127
|
+
"Processing campaign: ps_efe_olefin_aziridination_evolution - PsEFE Olefin Aziridination Evolution\n",
|
128
|
+
"2025-07-16 09:04:49,096 - INFO - \n",
|
129
|
+
"Processing campaign: ps_efe_olefin_aziridination_evolution - PsEFE Olefin Aziridination Evolution\n",
|
130
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: === GEMINI API CALL: LOCATE ===\n",
|
131
|
+
"2025-07-16 09:04:49,143 - INFO - === GEMINI API CALL: LOCATE ===\n",
|
132
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Prompt length: 18517 characters\n",
|
133
|
+
"2025-07-16 09:04:49,143 - INFO - Prompt length: 18517 characters\n",
|
134
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: First 500 chars of prompt:\n",
|
135
|
+
"You are an expert reader of protein engineering manuscripts.\n",
|
136
|
+
"\n",
|
137
|
+
"You are looking for lineage data for a SPECIFIC campaign:\n",
|
138
|
+
"- Campaign: PsEFE Olefin Aziridination Evolution\n",
|
139
|
+
"- Description: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
|
140
|
+
"- Key identifiers: Initial screening was performed with α-ketoglutarate, but subsequent evolution and sc\n",
|
141
|
+
"...(truncated)\n",
|
142
|
+
"2025-07-16 09:04:49,143 - INFO - First 500 chars of prompt:\n",
|
143
|
+
"You are an expert reader of protein engineering manuscripts.\n",
|
144
|
+
"\n",
|
145
|
+
"You are looking for lineage data for a SPECIFIC campaign:\n",
|
146
|
+
"- Campaign: PsEFE Olefin Aziridination Evolution\n",
|
147
|
+
"- Description: Directed evolution of Pseudomonas savastanoi ethylene-forming enzyme (PsEFE) to improve its catalytic activity and enantioselectivity for the intermolecular aziridination of styrene with p-toluenesulfonyl azide.\n",
|
148
|
+
"- Key identifiers: Initial screening was performed with α-ketoglutarate, but subsequent evolution and sc\n",
|
149
|
+
"...(truncated)\n",
|
150
|
+
"2025-07-16 09:04:49 [INFO] debase.enzyme_lineage_extractor: Calling Gemini API (attempt 1/4)...\n",
|
151
|
+
"2025-07-16 09:04:49,143 - INFO - Calling Gemini API (attempt 1/4)...\n"
|
152
|
+
]
|
153
|
+
}
|
154
|
+
],
|
155
|
+
"source": [
|
156
|
+
"import time\n",
|
157
|
+
"import os \n",
|
158
|
+
"\n",
|
159
|
+
"data_dir = 'downloaded_papers/'\n",
|
160
|
+
"files = [f for f in os.listdir(data_dir) if f[:3] != 'si-']\n",
|
161
|
+
"\n",
|
162
|
+
"with open('time_log.txt', 'w+') as fout:\n",
|
163
|
+
" for i, f in enumerate(files):\n",
|
164
|
+
" start = time.time() \n",
|
165
|
+
" output = f.replace('.pdf', '.csv')\n",
|
166
|
+
" os.system(f'mkdir LLM/manuscript_{i}/')\n",
|
167
|
+
" cmd = f'debase --manuscript \"downloaded_papers/{f}\" --si \"downloaded_papers/si-{f}\" --output \"LLM/manuscript_{i}/{output}\" --keep-intermediates'\n",
|
168
|
+
" os.system(cmd)\n",
|
169
|
+
" end = time.time()\n",
|
170
|
+
" \n",
|
171
|
+
" print('TIME:', end - start)\n",
|
172
|
+
" fout.write(f'{f}\\tmanuscript_{i}\\t{end-start}\\t{cmd}\\n')"
|
173
|
+
]
|
174
|
+
}
|
175
|
+
],
|
176
|
+
"metadata": {
|
177
|
+
"kernelspec": {
|
178
|
+
"display_name": "Python 3 (ipykernel)",
|
179
|
+
"language": "python",
|
180
|
+
"name": "python3"
|
181
|
+
},
|
182
|
+
"language_info": {
|
183
|
+
"codemirror_mode": {
|
184
|
+
"name": "ipython",
|
185
|
+
"version": 3
|
186
|
+
},
|
187
|
+
"file_extension": ".py",
|
188
|
+
"mimetype": "text/x-python",
|
189
|
+
"name": "python",
|
190
|
+
"nbconvert_exporter": "python",
|
191
|
+
"pygments_lexer": "ipython3",
|
192
|
+
"version": "3.11.13"
|
193
|
+
}
|
194
|
+
},
|
195
|
+
"nbformat": 4,
|
196
|
+
"nbformat_minor": 5
|
197
|
+
}
|
@@ -0,0 +1,146 @@
|
|
1
|
+
"""Utilities for handling campaign information across extractors.
|
2
|
+
|
3
|
+
This module provides functions to load and use campaign information
|
4
|
+
to improve extraction accuracy by providing context about model substrates,
|
5
|
+
products, and data locations.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import json
|
9
|
+
import logging
|
10
|
+
from pathlib import Path
|
11
|
+
from typing import List, Dict, Optional, Any
|
12
|
+
|
13
|
+
logger = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
def load_campaigns_from_file(campaign_file: Path) -> List[Dict[str, Any]]:
    """Load campaign information from a JSON file.

    This is a best-effort loader: every failure mode is logged and reported
    to the caller as an empty list rather than an exception.

    Args:
        campaign_file: Path to campaigns.json file. A plain string path is
            accepted as well and converted to a ``Path``.

    Returns:
        List of campaign dictionaries. An empty list is returned when the
        file does not exist, cannot be read or decoded, is not valid JSON,
        or when its top-level JSON value is not a list.
    """
    path = Path(campaign_file)
    if not path.exists():
        logger.warning(f"Campaign file not found: {campaign_file}")
        return []

    try:
        # Explicit UTF-8 so non-ASCII text in the JSON decodes the same way
        # regardless of the platform's default locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            campaigns = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: unreadable file / directory / permissions.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error(f"Failed to load campaigns from {campaign_file}: {e}")
        return []

    if not isinstance(campaigns, list):
        # Honour the declared return type: a dict/str/number payload would
        # otherwise be handed to callers that iterate over campaign dicts.
        logger.error(
            f"Failed to load campaigns from {campaign_file}: "
            f"expected a JSON list, got {type(campaigns).__name__}"
        )
        return []

    logger.info(f"Loaded {len(campaigns)} campaigns from {campaign_file}")
    return campaigns
|
37
|
+
|
38
|
+
|
39
|
+
def find_campaign_by_id(campaigns: List[Dict[str, Any]], campaign_id: str) -> Optional[Dict[str, Any]]:
    """Look up a campaign by its ``campaign_id`` value.

    Args:
        campaigns: List of campaign dictionaries
        campaign_id: Campaign ID to search for

    Returns:
        The first dictionary in ``campaigns`` whose ``'campaign_id'`` entry
        equals ``campaign_id``, or None when no entry matches.
    """
    # Lazy scan: stops at the first match, preserving list order.
    matching = (
        entry for entry in campaigns
        if entry.get('campaign_id') == campaign_id
    )
    return next(matching, None)
|
53
|
+
|
54
|
+
|
55
|
+
def get_campaign_context(campaign: Dict[str, Any]) -> str:
    """Generate context string for prompts from campaign information.

    The result always starts with a ``Campaign:`` line and a
    ``Description:`` line. Further lines (model substrate, model product,
    key data locations, evolution pathway, notes) are appended only when the
    corresponding campaign entry is present and truthy.

    Args:
        campaign: Campaign dictionary. Keys read: ``campaign_name``,
            ``description``, ``model_substrate``, ``substrate_id``,
            ``model_product``, ``product_id``, ``data_locations``,
            ``lineage_hint`` and ``notes``. ``data_locations`` may be a list
            of locations or a single location string.

    Returns:
        Formatted context string for inclusion in prompts (newline-joined).
    """
    def field(key: str, default: str) -> Any:
        # A key that is present but null (None) is treated like a missing
        # key, so the prompt never contains a literal "None".
        value = campaign.get(key)
        return default if value is None else value

    # Basic campaign info
    context_parts = [
        f"Campaign: {field('campaign_name', 'Unknown')}",
        f"Description: {field('description', '')}",
    ]

    # Model reaction info
    if campaign.get('model_substrate'):
        context_parts.append(
            f"Model Substrate: {campaign['model_substrate']} "
            f"(ID: {field('substrate_id', 'unknown')})"
        )
    if campaign.get('model_product'):
        context_parts.append(
            f"Model Product: {campaign['model_product']} "
            f"(ID: {field('product_id', 'unknown')})"
        )

    # Data locations
    data_locations = campaign.get('data_locations')
    if data_locations:
        if isinstance(data_locations, str):
            # A bare string would otherwise be joined character by character.
            data_locations = [data_locations]
        # str() guards against non-string entries, which str.join rejects.
        locations = ', '.join(str(location) for location in data_locations)
        context_parts.append(f"Key Data Locations: {locations}")

    # Lineage hint if available
    if campaign.get('lineage_hint'):
        context_parts.append(f"Evolution Pathway: {campaign['lineage_hint']}")

    # Additional notes
    if campaign.get('notes'):
        context_parts.append(f"Notes: {campaign['notes']}")

    return '\n'.join(context_parts)
|
90
|
+
|
91
|
+
|
92
|
+
def get_location_hints_for_campaign(campaign: Dict[str, Any]) -> List[str]:
    """Extract specific location hints from campaign data.

    Reads the campaign's ``data_locations`` entry and normalises it so the
    caller always receives a list of strings.

    Args:
        campaign: Campaign dictionary

    Returns:
        List of location strings (e.g., ["Figure 2a", "Table S4"]). An empty
        list is returned when ``data_locations`` is missing, null or empty;
        a single string value is wrapped in a one-element list.
    """
    locations = campaign.get('data_locations')
    if not locations:
        # Covers a missing key, an explicit null and an empty list/string.
        return []
    if isinstance(locations, str):
        # A lone string must not be treated as an iterable of characters.
        return [locations]
    # Coerce entries to str so callers can safely ', '.join(...) the result.
    return [str(location) for location in locations]
|
102
|
+
|
103
|
+
|
104
|
+
def enhance_prompt_with_campaign(prompt: str, campaign: Optional[Dict[str, Any]],
                                 section_name: str = "CAMPAIGN CONTEXT") -> str:
    """Splice a campaign context section into a prompt.

    The section consists of a ``section_name`` header, a 50-dash rule, the
    text from ``get_campaign_context``, an optional "IMPORTANT" line listing
    the hints from ``get_location_hints_for_campaign``, and a closing rule.

    Args:
        prompt: Original prompt
        campaign: Campaign dictionary (optional)
        section_name: Section header for the campaign context

    Returns:
        ``prompt`` unchanged when ``campaign`` is falsy. Otherwise the prompt
        with the section inserted before the first line after the sixth
        (index > 5) that is blank or starts with 'Given' or 'You'; when no
        such line exists the section is prepended to the prompt.
    """
    if not campaign:
        return prompt

    rule = '-' * 50
    context_text = get_campaign_context(campaign)
    hints = get_location_hints_for_campaign(campaign)

    section_pieces = [f"\n\n{section_name}:\n{rule}\n{context_text}"]
    if hints:
        section_pieces.append(
            f"\n\nIMPORTANT: Focus particularly on these locations: {', '.join(hints)}"
        )
    section_pieces.append(f"\n{rule}\n")
    campaign_block = ''.join(section_pieces)

    # Search past the opening instructions (first six lines) for a natural
    # break: a blank line or the start of a new 'Given'/'You' instruction.
    prompt_lines = prompt.split('\n')
    anchor = next(
        (
            position
            for position, text in enumerate(prompt_lines)
            if position > 5
            and (not text.strip() or text.startswith(('Given', 'You')))
        ),
        None,
    )

    if anchor is None:
        # No suitable break found: put the section in front of the prompt.
        return campaign_block + prompt

    prompt_lines.insert(anchor, campaign_block)
    return '\n'.join(prompt_lines)
|
@@ -0,0 +1,44 @@
|
|
1
|
+
"""Universal caption pattern for all DEBase extractors.
|
2
|
+
|
3
|
+
This module provides a consistent caption pattern that handles various
|
4
|
+
formats found in scientific papers, including:
|
5
|
+
- Standard formats: Figure 1, Fig. 1, Table 1
|
6
|
+
- Supplementary formats: Supplementary Figure 1, Supp. Table 1
|
7
|
+
- Extended data: Extended Data Figure 1, ED Fig. 1
|
8
|
+
- Other types: Scheme 1, Chart 1
|
9
|
+
- Page headers: S14 Table 5
|
10
|
+
- Various punctuation: Figure 1. Figure 1: Figure 1 |
|
11
|
+
- Inline captions: ...text Table 1. Caption text...
|
12
|
+
"""
|
13
|
+
|
14
|
+
import re
|
15
|
+
|
16
|
+
# Shared caption-detection regex used to spot figure/table/scheme/chart
# labels, either near the start of a line (at most 20 leading characters)
# or inline after an alphanumeric or whitespace character.
# Compiled case-insensitively, in verbose mode, with ^ matching at every line.
# Note: group 1 holds the caption type plus any prefix; because the optional
# whitespace after the supplementary prefix sits inside that group, group 1
# may begin with whitespace.
UNIVERSAL_CAPTION_PATTERN = re.compile(
    r"""
    (?:                                    # Non-capturing group for position
        ^[^\n]{0,20}?                      # Start of line with up to 20 chars before
        |                                  # OR
        (?<=[a-zA-Z0-9\s])                 # Look-behind for alphanumeric or space (for inline)
    )
    (                                      # Start capture group
        (?:Extended\s+Data\s+)?            # Optional "Extended Data" prefix
        (?:ED\s+)?                         # Optional "ED" prefix
        (?:Supplementary|Supp\.?|Suppl\.?)?\s*  # Optional supplementary prefixes
        (?:Table|Fig(?:ure)?|Scheme|Chart) # Main caption types
    )                                      # End capture group
    (?:                                    # Non-capturing group for what follows
        \s*                                # Optional whitespace
        (?:S?\d+[A-Za-z]?|[IVX]+)          # Number (with optional S prefix or roman)
        (?:[.:|]|\s+\|)?                   # Optional punctuation (. : or |)
        |                                  # OR
        \.                                 # Just a period (for "Fig." without number)
    )
    """,
    flags=re.IGNORECASE | re.VERBOSE | re.MULTILINE,
)
|
41
|
+
|
42
|
+
def get_universal_caption_pattern() -> "re.Pattern[str]":
    """Get the universal caption pattern for use in extractors.

    Returns:
        The module-level ``UNIVERSAL_CAPTION_PATTERN`` compiled regex. The
        same object is returned on every call; nothing is recompiled or
        copied.
    """
    return UNIVERSAL_CAPTION_PATTERN
|
@@ -1016,13 +1016,38 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1016
1016
|
|
1017
1017
|
if source_enzyme_id:
|
1018
1018
|
# Find the source enzyme's sequence in the dataframe
|
1019
|
+
# Prefer sequences from OTHER campaigns (not the current empty campaign)
|
1019
1020
|
source_rows = df[df['enzyme_id'] == source_enzyme_id]
|
1020
1021
|
if source_rows.empty:
|
1021
1022
|
log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
|
1022
1023
|
else:
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1024
|
+
# Look for a row with a sequence, preferring other campaigns
|
1025
|
+
source_sequence = None
|
1026
|
+
source_row_idx = None
|
1027
|
+
|
1028
|
+
# First, try to find a row with sequence from a different campaign
|
1029
|
+
for idx, row in source_rows.iterrows():
|
1030
|
+
seq = str(row['protein_sequence']).strip()
|
1031
|
+
if seq and seq.lower() not in ["nan", "none", ""]:
|
1032
|
+
# Check if this is from a different campaign
|
1033
|
+
if row['campaign_id'] != campaign_id:
|
1034
|
+
source_sequence = seq
|
1035
|
+
source_row_idx = idx
|
1036
|
+
log.info(f"Found source sequence for {source_enzyme_id} from campaign {row['campaign_id']}")
|
1037
|
+
break
|
1038
|
+
|
1039
|
+
# If not found in other campaigns, try any row with sequence
|
1040
|
+
if not source_sequence:
|
1041
|
+
for idx, row in source_rows.iterrows():
|
1042
|
+
seq = str(row['protein_sequence']).strip()
|
1043
|
+
if seq and seq.lower() not in ["nan", "none", ""]:
|
1044
|
+
source_sequence = seq
|
1045
|
+
source_row_idx = idx
|
1046
|
+
log.info(f"Found source sequence for {source_enzyme_id} from same campaign {row['campaign_id']}")
|
1047
|
+
break
|
1048
|
+
|
1049
|
+
if not source_sequence:
|
1050
|
+
log.warning(f"Source enzyme {source_enzyme_id} has no sequence in any campaign")
|
1026
1051
|
else:
|
1027
1052
|
# Find the target enzyme in our empty list
|
1028
1053
|
seed_found = False
|
@@ -1031,7 +1056,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1031
1056
|
if relationship_type == "EXACT_MATCH":
|
1032
1057
|
# Exact match - copy sequence directly
|
1033
1058
|
df.at[entry['idx'], 'protein_sequence'] = source_sequence
|
1034
|
-
df.at[entry['idx'], 'flag']
|
1059
|
+
current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
|
1060
|
+
df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_exact"
|
1035
1061
|
log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
|
1036
1062
|
seed_found = True
|
1037
1063
|
|
@@ -1045,7 +1071,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1045
1071
|
|
1046
1072
|
if success:
|
1047
1073
|
df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
|
1048
|
-
df.at[entry['idx'], 'flag']
|
1074
|
+
current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
|
1075
|
+
df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent"
|
1049
1076
|
log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
|
1050
1077
|
seed_found = True
|
1051
1078
|
else:
|
@@ -1053,7 +1080,8 @@ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
|
1053
1080
|
else:
|
1054
1081
|
# No mutations - use parent sequence directly
|
1055
1082
|
df.at[entry['idx'], 'protein_sequence'] = source_sequence
|
1056
|
-
df.at[entry['idx'], 'flag']
|
1083
|
+
current_flag = str(df.at[entry['idx'], 'flag']) if pd.notna(df.at[entry['idx'], 'flag']) else ""
|
1084
|
+
df.at[entry['idx'], 'flag'] = current_flag + " gemini_cross_campaign_seed_parent_no_mutations"
|
1057
1085
|
log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
|
1058
1086
|
seed_found = True
|
1059
1087
|
break
|