debase 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/PIPELINE_FLOW.md +100 -0
- debase/__init__.py +18 -0
- debase/__main__.py +9 -0
- debase/_version.py +3 -0
- debase/build_db.py +190 -0
- debase/cleanup_sequence.py +905 -0
- debase/enzyme_lineage_extractor.py +2169 -0
- debase/lineage_format.py +808 -0
- debase/reaction_info_extractor.py +2331 -0
- debase/substrate_scope_extractor.py +2039 -0
- debase/wrapper.py +303 -0
- debase-0.1.0.dist-info/METADATA +299 -0
- debase-0.1.0.dist-info/RECORD +17 -0
- debase-0.1.0.dist-info/WHEEL +5 -0
- debase-0.1.0.dist-info/entry_points.txt +2 -0
- debase-0.1.0.dist-info/licenses/LICENSE +21 -0
- debase-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2331 @@
"""reaction_info_extractor_clean.py
|
2
|
+
|
3
|
+
Single-file, maintainable CLI tool that pulls **enzyme-reaction performance data**
|
4
|
+
from chemistry PDFs using Google Gemini (text-only *and* vision) - now with
|
5
|
+
**true figure-image extraction** mirroring the enzyme-lineage workflow.
|
6
|
+
|
7
|
+
Key June 2025 additions
|
8
|
+
=======================
|
9
|
+
1. **Figure image helper** - locates the figure caption, then exports the first
|
10
|
+
image **above** that caption using PyMuPDF (fitz). This PNG is sent to
|
11
|
+
Gemini Vision for metric extraction.
|
12
|
+
2. **GeminiClient.generate()** now accepts an optional `image_b64` arg and
|
13
|
+
automatically switches to a *vision* invocation when provided.
|
14
|
+
3. **extract_metrics_for_enzyme()** chooses between three tiers:
|
15
|
+
|
16
|
+
* *Table* -> caption + following rows (text-only)
|
17
|
+
* *Figure* -> image bytes (vision) *or* caption fallback
|
18
|
+
* *Other* -> page-level text
|
19
|
+
|
20
|
+
If the vision route fails (no JSON), it gracefully falls back to caption
|
21
|
+
text so the pipeline never crashes.
|
22
|
+
"""
|
23
|
+
|
24
|
+
from __future__ import annotations

import argparse
import json
import logging
import os
import re
import sys
import time
from base64 import b64encode, b64decode
from dataclasses import dataclass, field
from pathlib import Path
from textwrap import dedent
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF - for image extraction
import google.generativeai as genai  # type: ignore
import pandas as pd
from PyPDF2 import PdfReader
import PIL.Image
import io

###############################################################################
# 1 - CONFIG & CONSTANTS
###############################################################################

@dataclass
class Config:
    """Centralised tunables so tests can override them easily."""

    model_name: str = "gemini-1.5-pro-latest"
    location_temperature: float = 0.2
    extract_temperature: float = 0.0
    model_reaction_temperature: float = 0.0
    top_p: float = 1.0
    max_tokens: int = 4096
    pdf_cache_size: int = 8
    retries: int = 2

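# Illustrative usage sketch (not part of the published file): with
# GEMINI_API_KEY set, the extraction engine defined below can be driven as
#
#     from pathlib import Path
#     cfg = Config()
#     extractor = ReactionExtractor(Path("manuscript.pdf"), Path("si.pdf"), cfg)
#     locations = extractor.find_reaction_locations()
#     metrics = extractor.extract_metrics_batch(["ApePgb LVQ"], locations[0]["location"])
#
# File names and the enzyme label here are placeholders for illustration.
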
@dataclass
class CompoundMapping:
    """Mapping between compound identifiers and IUPAC names."""
    identifiers: List[str]
    iupac_name: str
    common_names: List[str] = field(default_factory=list)
    compound_type: str = "unknown"
    source_location: Optional[str] = None

###############################################################################
# 2 - LOGGING
###############################################################################

LOGGER = logging.getLogger("reaction_info_extractor")
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter("%(levelname)s [%(name)s] %(message)s"))
LOGGER.addHandler(handler)
LOGGER.setLevel(logging.INFO)

# --- Debug dump helper ----------------------------------------------------
def _dump(text: str | bytes, path: Path | str) -> None:
    """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    mode = "wb" if isinstance(text, (bytes, bytearray)) else "w"
    with p.open(mode) as fh:
        fh.write(text)

###############################################################################
# 3 - PDF UTILITIES
###############################################################################

def extract_text_by_page(path: Optional[Path]) -> List[str]:
    if path is None:
        return []
    reader = PdfReader(str(path))
    pages: List[str] = []
    for page in reader.pages:
        try:
            pages.append(page.extract_text() or "")
        except Exception as exc:  # pragma: no cover
            LOGGER.warning("PyPDF2 failed on a page: %s", exc)
            pages.append("")
    return pages

###############################################################################
# 4 - GEMINI WRAPPER (text & vision)
###############################################################################

def get_model(cfg: Config):
    """Configure API key and return a `GenerativeModel` instance."""
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(cfg.model_name)

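# Minimal setup sketch (illustrative; assumes a valid key is available):
#
#     os.environ["GEMINI_API_KEY"] = "<your key>"
#     model = get_model(Config())  # Config() defaults to "gemini-1.5-pro-latest"
#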
def generate_json_with_retry(
    model,
    prompt: str,
    schema_hint: str | None = None,
    *,
    max_retries: int = 2,
    temperature: float = 0.0,
    debug_dir: str | Path | None = None,
    tag: str = 'gemini',
    image_b64: Optional[str] = None,
):
    """Call Gemini with retries & exponential back-off, returning parsed JSON."""
    # Log prompt details
    LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
    LOGGER.info("Prompt length: %d characters", len(prompt))
    LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])

    # Save full prompt to debug directory if provided
    if debug_dir:
        debug_path = Path(debug_dir)
        debug_path.mkdir(parents=True, exist_ok=True)
        prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\n{'='*80}\n\n{prompt}",
              prompt_file)
        LOGGER.info("Full prompt saved to: %s", prompt_file)

    fence_re = re.compile(r"```json|```", re.I)
    for attempt in range(1, max_retries + 1):
        try:
            LOGGER.info("Calling Gemini API (attempt %d/%d)...", attempt, max_retries)

            # Handle image if provided
            if image_b64:
                parts = [prompt, {"mime_type": "image/png", "data": image_b64}]
            else:
                parts = [prompt]

            resp = model.generate_content(
                parts,
                generation_config={
                    "temperature": temperature,
                    "max_output_tokens": 4096,
                }
            )
            raw = resp.text.strip()

            # Log response
            LOGGER.info("Gemini response length: %d characters", len(raw))
            LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])

            # Save full response to debug directory
            if debug_dir:
                response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
                _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}",
                      response_file)
                LOGGER.info("Full response saved to: %s", response_file)

            # Remove common Markdown fences
            if raw.startswith("```"):
                raw = fence_re.sub("", raw).strip()

            # Try to find JSON in the response.
            # First, try to parse as-is
            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                # If that fails, look for a JSON array or object:
                # find the first '[' or '{' and its matching closing bracket
                json_start = -1
                json_end = -1
                bracket_stack = []
                in_string = False
                escape_next = False

                for i, char in enumerate(raw):
                    if escape_next:
                        escape_next = False
                        continue

                    if char == '\\':
                        escape_next = True
                        continue

                    if char == '"':
                        in_string = not in_string
                        continue

                    if in_string:
                        continue

                    if char in '[{':
                        if json_start == -1:
                            json_start = i
                        bracket_stack.append(char)
                    elif char in ']}':
                        if bracket_stack:
                            opening = bracket_stack.pop()
                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
                                if not bracket_stack:  # Found complete JSON
                                    json_end = i + 1
                                    break

                if json_start >= 0 and json_end > json_start:
                    # Extract the JSON portion
                    json_str = raw[json_start:json_end]
                    parsed = json.loads(json_str)
                else:
                    # Look for a bare [] in the response
                    if '[]' in raw:
                        parsed = []
                    else:
                        # No JSON structure found, re-raise the original error
                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
            LOGGER.info("Successfully parsed JSON response")
            return parsed
        except Exception as exc:
            LOGGER.warning(
                "Gemini call failed (attempt %d/%d): %s",
                attempt, max_retries, exc,
            )
            if attempt == max_retries:
                raise
            time.sleep(2 ** attempt)

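# Note on the recovery path above (illustrative example): a chatty reply such as
#
#     Sure, here is the data!
#     ```json
#     [{"location": "Table S1", "type": "table"}]
#     ```
#
# fails the initial json.loads(), so the bracket scanner locates the first
# balanced [...] / {...} span outside string literals and parses only that
# substring; a bare "[]" anywhere in the reply is accepted as an empty result.
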
###############################################################################
# 5 - PROMPTS (unchanged except for brevity)
###############################################################################

PROMPT_FIND_LOCATIONS = dedent("""
You are an expert reader of protein engineering manuscripts.
Given the following article captions and section titles, identify ALL locations
(tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
activity, etc.) for enzyme variants.

IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
performance data locations. Pay careful attention to:
- The caption text to identify which campaign/lineage the data is for
- Enzyme name prefixes (e.g., PYS vs INS) that indicate different campaigns
- Different substrate/product types mentioned in captions

Respond with a JSON array where each element contains:
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
- "type": one of "table", "figure"
- "confidence": your confidence score (0-100)
- "caption": the exact caption text for this location
- "reason": brief explanation (including if this is for a specific lineage/campaign)
- "lineage_hint": any indication of which enzyme group this data is for (or null)
- "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)

Tables are preferred over figures when both contain the same data.

Respond ONLY with **minified JSON**. NO markdown fences.

Example:
[{"location": "Table S1", "type": "table", "confidence": 95, "caption": "Table S1. Detailed information...", "reason": "Complete performance metrics", "lineage_hint": "first enzyme family", "campaign_clues": "PYS lineage, pyrrolidine synthesis"}]
""")

PROMPT_EXTRACT_METRICS = dedent("""
You are given either (a) the PNG image of a figure panel, or (b) the caption /
text excerpt that contains numeric reaction performance data for an enzyme.

Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
Return a JSON object with the following keys (use **null** if not found):
* "yield" - yield as percentage with ONE decimal place precision
* "ttn" - turnover number (total turnovers)
* "ton" - turnover number if TTN not available
* "selectivity" - ee or er value with unit (e.g., "98% ee", ">99:1 er")
* "conversion" - conversion percentage if different from yield
* "tof" - turnover frequency (turnovers per time unit) if provided
* "activity" - specific activity if provided (with unit)
* "other_metrics" - dictionary of any other performance metrics with their units
* "notes" - any performance-related notes

IMPORTANT:
- Extract ALL performance metrics provided, even if they use different units.
- Do NOT extract substrate/product names - these will come from SI
- Do NOT extract reaction conditions (temperature, pH, time, solvent)
- If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
You are analyzing a figure showing enzyme reaction performance data for multiple variants.

Extract performance metrics for ALL the following enzyme variants:
{enzyme_names}

Steps:
1. CHECK THE Y-AXIS SCALE: What is the maximum value? (e.g., 10%, 30%, 50%, 100%)
2. For each enzyme variant listed above:
   - Find its position on the X-axis
   - Read the bar height or data point value
   - Calculate the actual value based on the Y-axis scale
3. Compare all bars to understand relative performance

Return a JSON object with enzyme names as keys, each containing:
* "yield" - yield with ONE decimal place precision
* "ttn" - turnover number if shown
* "ton" - turnover number if TTN not available
* "selectivity" - ee or er value with unit
* "conversion" - conversion percentage if different from yield
* "tof" - turnover frequency if provided
* "activity" - specific activity if provided
* "other_metrics" - dictionary of any other metrics
* "notes" - any relevant notes (including reaction type if different reactions are shown)

CRITICAL:
- Read ALL pages provided in the image
- If different enzymes are tested for different reactions (e.g., pyrrolidine vs indoline synthesis), note this in "notes"
- For tables, check if data continues beyond what's shown
- Read the Y-axis scale carefully for figures

Example format:
{{"ApePgb LVQ": {{"yield": 0.0, "ttn": null, "notes": "pyrrolidine synthesis", ...}}, ...}}

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

# Removed substrate scope IUPAC extraction - now handled in model reaction only

PROMPT_FIND_MODEL_REACTION_LOCATION = dedent("""
You are an expert reader of chemistry manuscripts.
Given the following text sections, identify where the MODEL REACTION information is located.

The model reaction is the STANDARD reaction used to evaluate all enzyme variants
(not the substrate scope). Look for:

- Sections titled "Model Reaction", "Standard Reaction", "General Procedure"
- Text describing the reaction conditions used for enzyme evolution/screening
- Sections describing which substrates were used as the benchmark
- Compound numbers (e.g., "6a", "7a") used in the model reaction

Also identify where the IUPAC names for these specific compounds are listed.

Respond with a JSON object containing:
{
  "model_reaction_location": {
    "location": "section name or description",
    "confidence": 0-100,
    "reason": "why this contains the model reaction",
    "compound_ids": ["list", "of", "compound", "IDs", "if", "found"]
  },
  "conditions_location": {
    "location": "where reaction conditions are described",
    "confidence": 0-100
  },
  "iupac_location": {
    "location": "where IUPAC names are listed (usually SI compound characterization)",
    "confidence": 0-100,
    "compound_section_hint": "specific section to look for compound IDs"
  }
}

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

PROMPT_MODEL_REACTION = dedent("""
Extract the model/standard reaction used to evaluate enzyme variants in this paper.

This is the reaction used for directed evolution screening, NOT the substrate scope.
Look for terms like "model reaction", "standard substrate", "benchmark reaction",
or the specific reaction mentioned in enzyme screening/evolution sections.

CRITICAL STEPS FOR IUPAC NAMES:
1. First identify the compound IDs used in the model reaction (e.g., "6a", "7a")
2. Then search the provided context for these compound IDs to find their IUPAC names
3. Look for sections with "Compound 6a", "Product 7a", or similar patterns
4. The IUPAC names are usually given after the compound ID in parentheses or after a colon

CRITICAL FOR SUBSTRATE CONCENTRATION:
- Look carefully in FIGURES and figure captions for substrate concentration information
- Figures often show detailed reaction conditions that may not be in the main text
- Identify the ACTUAL SUBSTRATES being transformed (not reducing agents or cofactors)
- Common pattern: "[X] mM [substrate name]" or "[substrate]: [X] mM"
- DO NOT confuse reducing agents (dithionite, NADH, etc.) with actual substrates
- The substrate is the molecule being chemically transformed by the enzyme

Return a JSON object with:
* "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["5", "6a"])
* "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents
* "product_list" - Array of product identifiers as used in the paper (e.g., ["7a"])
* "product_iupac_list" - Array of IUPAC names for ALL products formed
* "reaction_substrate_concentration" - Concentration of actual substrate(s) being transformed, NOT reducing agents like dithionite
* "cofactor" - Any cofactors used (e.g., "NADH", "NADPH", "FAD", "heme", etc.) or null if none
* "reaction_temperature" - reaction temperature (e.g., "25°C", "room temperature")
* "reaction_ph" - reaction pH
* "reaction_buffer" - buffer system (e.g., "50 mM potassium phosphate")
* "reaction_other_conditions" - other important conditions (enzyme loading, reducing agents like dithionite, time, anaerobic, etc.)

IMPORTANT:
- Extract the reaction used for ENZYME EVOLUTION/SCREENING (not substrate scope)
- Substrate concentration = concentration of chemicals being transformed, NOT reducing agents (dithionite, NADH, etc.)
- Maintain correspondence: substrate_list[i] should map to substrate_iupac_list[i], same for products
- If a compound ID has no IUPAC name found, still include it in the list with null in the IUPAC list
- For IUPAC names, look for the SYSTEMATIC chemical names, NOT common/trivial names
- Search the provided context for systematic names - they typically:
  * Use numerical locants (e.g., "prop-2-enoate" not "acrylate")
  * Follow IUPAC nomenclature rules
  * May be found in compound characterization sections
- If you find a common name in the reaction description, search the context for its systematic equivalent
- Look for the exact systematic names as written in the compound characterization
- Do NOT include stereochemistry prefixes like (1R,2S) unless they are part of the compound name in the SI

Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
""")

PROMPT_ANALYZE_LINEAGE_GROUPS = dedent("""
You are analyzing enzyme performance data from a protein engineering manuscript.
Based on the performance data locations and enzyme names, determine if there are
distinct enzyme lineage groups that were evolved for different purposes.

Look for patterns such as:
- Different tables/figures for different enzyme groups
- Enzyme naming patterns that suggest different lineages
- Different reaction types mentioned in notes or captions
- Clear separations in how variants are organized

Return a JSON object with:
{
  "has_multiple_lineages": true/false,
  "lineage_groups": [
    {
      "group_id": "unique identifier you assign",
      "data_location": "where this group's data is found",
      "enzyme_pattern": "naming pattern or list of enzymes",
      "reaction_type": "what reaction this group catalyzes",
      "evidence": "why you grouped these together"
    }
  ],
  "confidence": 0-100
}

If only one lineage exists, return has_multiple_lineages: false with a single group.

Respond ONLY with **minified JSON**.
""")

PROMPT_FIND_LINEAGE_MODEL_REACTION = dedent("""
For the enzyme group with performance data in {location}, identify the specific
model reaction used to screen/evaluate these variants.

Context about this group:
{group_context}

Look for:
- References to the specific substrate/product used for this enzyme group
- Text near the performance data location describing the reaction
- Connections between the enzyme names and specific substrates
- Any mention of "screened with", "tested against", "substrate X was used"

Return:
{{
  "substrate_ids": ["list of substrate IDs for this group"],
  "product_ids": ["list of product IDs for this group"],
  "confidence": 0-100,
  "evidence": "text supporting this substrate/product assignment"
}}

Respond ONLY with **minified JSON**.
""")

PROMPT_COMPOUND_MAPPING = dedent("""
Extract compound identifiers and their IUPAC names from the provided sections.

Look for ALL compounds mentioned, including:
1. Compounds with explicit IUPAC names in the text
2. Common reagents where you can provide standard IUPAC names
3. Products that may not be explicitly characterized

CRITICAL - NO HALLUCINATION:
- Extract IUPAC names EXACTLY as written in the source
- DO NOT modify, correct, or "improve" any chemical names
- If a name is written as "benzyl-2-phenylcyclopropane-1-carboxylate", keep it exactly
- Only provide standard IUPAC names for common reagents if not found in text
- If no IUPAC name is found for a compound, return null for iupac_name
- Include ALL compounds found or referenced

Return as JSON:
{
  "compound_mappings": [
    {
      "identifier": "compound identifier",
      "iupac_name": "complete IUPAC name",
      "common_names": ["any alternative names"],
      "compound_type": "substrate/product/reagent/other",
      "source_location": "where found or inferred"
    }
  ]
}
""")

###############################################################################
# 6 - EXTRACTION ENGINE
###############################################################################

class ReactionExtractor:
    _FIG_RE = re.compile(r"fig(?:ure)?\s+s?\d+[a-z]?", re.I)
    _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)

    def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
                 campaign_filter: Optional[str] = None):
        self.manuscript = manuscript
        self.si = si
        self.cfg = cfg
        self.model = get_model(cfg)
        self.debug_dir = debug_dir
        self.campaign_filter = campaign_filter  # Filter for specific campaign

        # Create debug directory if specified
        if self.debug_dir:
            self.debug_dir = Path(self.debug_dir)
            self.debug_dir.mkdir(parents=True, exist_ok=True)
            LOGGER.info("Debug output will be saved to: %s", self.debug_dir)

        if self.campaign_filter:
            LOGGER.info("Filtering extraction for campaign: %s", self.campaign_filter)

        # Preload text pages
        LOGGER.info("Reading PDFs…")
        self.ms_pages = extract_text_by_page(manuscript)
        self.si_pages = extract_text_by_page(si)
        self.all_pages = self.ms_pages + self.si_pages

        # Keep open fitz Docs for image extraction
        self.ms_doc = fitz.open(str(manuscript))
        self.si_doc = fitz.open(str(si)) if si else None

    # ------------------------------------------------------------------
    # 6.1 Find locations (unchanged)
    # ------------------------------------------------------------------

    def _collect_captions_and_titles(self) -> str:
        # Simpler pattern: match any line starting with Table or Figure.
        # This catches all variations like "Table S 2", "Table.", "Figure S1", etc.
        cap_pattern = re.compile(r"^(Table|Figure).*", re.I | re.M)
        captions: List[str] = []

        # Collect from all pages
        all_text = "\n".join(self.all_pages)

        # Find all figure/table captions
        for match in cap_pattern.finditer(all_text):
            caption_start = match.start()
            # Get up to 1200 chars or until double newline
            caption_end = all_text.find("\n\n", caption_start)
            if caption_end == -1 or caption_end - caption_start > 1200:
                caption_end = caption_start + 1200
            caption = all_text[caption_start:caption_end].strip()
            captions.append(caption)

        # Also look for SI section titles
        si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)

        result = "\n".join(captions + si_titles)
        LOGGER.debug("Collected %d captions/titles, total length: %d chars",
                     len(captions) + len(si_titles), len(result))

        # Log first few captions for debugging
        if captions:
            LOGGER.debug("First few captions: %s", captions[:3])

        return result

    def find_reaction_locations(self) -> List[Dict[str, Any]]:
        """Find all locations containing reaction performance data."""
        # Add campaign context if available
        campaign_context = ""
        if self.campaign_filter:
            campaign_context = f"""
IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
Only return locations that contain data for this specific campaign.
Ignore locations that contain data for other campaigns.

"""

        prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.location_temperature,
                debug_dir=self.debug_dir,
                tag="find_locations"
            )
            # Handle both single dict (backwards compatibility) and list
            if isinstance(data, dict):
                return [data]
            elif isinstance(data, list):
                return data
            else:
                LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
                return []
        except Exception as e:
            LOGGER.error("Failed to find reaction locations: %s", e)
            return []

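    # Illustrative return value (mirrors the example embedded in
    # PROMPT_FIND_LOCATIONS; actual contents depend on the paper):
    #   [{"location": "Table S1", "type": "table", "confidence": 95,
    #     "caption": "Table S1. Detailed information...",
    #     "reason": "Complete performance metrics",
    #     "lineage_hint": "first enzyme family",
    #     "campaign_clues": "PYS lineage, pyrrolidine synthesis"}]
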
    def _get_base_location(self, location: str) -> str:
        """Extract the base location identifier (e.g., 'Table S1' from 'Table S1' or 'S41-S47').

        This helps group related locations that likely share the same model reaction.
        """
        # Common patterns for locations
        patterns = [
            (r'Table\s+S\d+', 'table'),
            (r'Figure\s+S\d+', 'figure'),
            (r'Table\s+\d+', 'table'),
            (r'Figure\s+\d+', 'figure'),
            (r'S\d+(?:-S\d+)?', 'supp'),  # Supplementary pages like S41-S47
        ]

        for pattern, loc_type in patterns:
            match = re.search(pattern, location, re.I)
            if match:
                return match.group(0)

        # Default: use the location as-is
        return location

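    # Illustrative normalisations performed by _get_base_location
    # (example inputs are hypothetical):
    #   "Table S1. Detailed kinetics"  -> "Table S1"
    #   "Figure 3a"                    -> "Figure 3"
    #   "S41-S47"                      -> "S41-S47"   (supplementary page range)
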
    def analyze_lineage_groups(self, locations: List[Dict[str, Any]], enzyme_df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze if there are distinct lineage groups based on different locations.

        Key principle: Different locations (tables/figures) indicate different model reactions.
        """
        # Group locations by their base identifier
        location_groups = {}

        for loc in locations:
            location_id = loc['location']
            base_location = self._get_base_location(location_id)

            if base_location not in location_groups:
                location_groups[base_location] = []
            location_groups[base_location].append(loc)

        # Each unique base location represents a potential lineage group
        lineage_groups = []

        for base_loc, locs in location_groups.items():
            # Use the location with highest confidence as primary
            primary_loc = max(locs, key=lambda x: x.get('confidence', 0))

            # Create a group for this location
            group = {
                'group_id': base_loc,
                'data_location': primary_loc['location'],
                'all_locations': [l['location'] for l in locs],
                'lineage_hint': primary_loc.get('lineage_hint', ''),
                'caption': primary_loc.get('caption', ''),
                'confidence': primary_loc.get('confidence', 0)
            }
            lineage_groups.append(group)

        # Multiple distinct base locations = multiple model reactions
        has_multiple = len(location_groups) > 1

        LOGGER.info("Location-based lineage analysis: %d distinct base locations found",
                    len(location_groups))
        for group in lineage_groups:
            LOGGER.info("  - %s: %s", group['group_id'], group['data_location'])

        return {
            'has_multiple_lineages': has_multiple,
            'lineage_groups': lineage_groups,
            'confidence': 95
        }

    def find_lineage_model_reaction(self, location: str, group_context: str) -> Dict[str, Any]:
        """Find the model reaction for a specific lineage group."""
        # Gather relevant text near this location
        page_text = self._page_with_reference(location) or ""

        # Also check manuscript introduction for model reaction info
        intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""

        prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
            location=location,
            group_context=group_context
        )
        prompt += f"\n\nText near {location}:\n{page_text[:3000]}"
        prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.model_reaction_temperature,
                debug_dir=self.debug_dir,
                tag=f"lineage_model_reaction_{location.replace(' ', '_')}"
            )
            return data if isinstance(data, dict) else {}
        except Exception as e:
            LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
            return {}

    # ------------------------------------------------------------------
    # 6.2 Figure / Table context helpers
    # ------------------------------------------------------------------

    def _page_with_reference(self, ref_id: str) -> Optional[str]:
        for page in self.all_pages:
            if ref_id.lower() in page.lower():
                return page
        return None

    # ---- Table text helper - now returns full page ----
    def _extract_table_context(self, ref: str) -> str:
        page = self._page_with_reference(ref)
        if not page:
            return ""
        # Return the entire page content for better table extraction
        return page

    # ---- Figure caption helper (text fallback) ----
    def _extract_figure_caption(self, ref: str) -> str:
        page = self._page_with_reference(ref)
        if not page:
            return ""
        m = re.search(rf"({re.escape(ref)}[\s\S]{{0,800}}?\.)", page, re.I)
        if m:
            return m.group(1)
        for line in page.split("\n"):
            if ref.lower() in line.lower():
                return line
        return page[:800]

    # ---- NEW: Page image helper for both figures and tables ----
    def _extract_page_png(self, ref: str, extract_figure_only: bool = True) -> Optional[str]:
        """Export the page containing the reference as PNG.
        If extract_figure_only=True, extracts just the figure above the caption.
        If False, extracts the entire page (useful for tables).
        Returns a base64-encoded PNG or None."""

        # For table extraction (whole pages), use the multi-page approach
        if not extract_figure_only:
            pages_with_ref = self._find_pages_with_reference(ref)
            if pages_with_ref:
                LOGGER.debug(f"Found {len(pages_with_ref)} pages containing {ref}")
                return self._extract_multiple_pages_png(pages_with_ref)
            return None

        # For figure extraction, search both documents
        for doc in filter(None, [self.ms_doc, self.si_doc]):
            for page_number in range(doc.page_count):
                page = doc.load_page(page_number)
                page_text = page.get_text()
                if ref.lower() not in page_text.lower():
                    continue
                # Get caption bbox
                text_instances = page.search_for(ref, quads=False)
                if not text_instances:
                    continue
                cap_rect = text_instances[0]  # first match

                # Pair each embedded image with its placement rectangle and
                # sort top-to-bottom; get_images() itself carries no
                # coordinates, so we look up rects before sorting
                placed_images = []
                for img in page.get_images(full=True):
                    xref = img[0]
                    img_rects = page.get_image_rects(xref)
                    if img_rects:
                        placed_images.append((img_rects[0], xref))
                placed_images.sort(key=lambda item: item[0].y0)

                # Find first image whose bottom y is **above** the caption top y
                for img_rect, xref in placed_images:
                    if img_rect.y1 < cap_rect.y0:  # fully above caption
                        # Extract image bytes
                        pix = fitz.Pixmap(doc, xref)
                        if pix.alpha:  # RGBA -> RGB
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        img_bytes = pix.tobytes("png")
                        return b64encode(img_bytes).decode()
        return None

    def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
        """Find all pages containing the reference across documents.
        Returns list of (document, page_number) tuples."""
        pages_found = []

        for doc in filter(None, [self.ms_doc, self.si_doc]):
            for page_number in range(doc.page_count):
                page = doc.load_page(page_number)
                page_text = page.get_text()
                if ref.lower() in page_text.lower():
                    pages_found.append((doc, page_number))

        return pages_found

    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
        """Extract multiple pages as a combined PNG image."""
        if not pages:
            return None

        # Sort pages by document and page number
        pages.sort(key=lambda x: (id(x[0]), x[1]))

        # Extract the range of pages, including one page after the last hit
        all_images = []
        mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
        for i, (doc, page_num) in enumerate(pages):
            # Add the current page
            pix = doc.load_page(page_num).get_pixmap(matrix=mat)
            all_images.append(pix)

            # If this is the last page with the reference, also add the next page
            if i == len(pages) - 1 and page_num + 1 < doc.page_count:
                next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
                all_images.append(next_pix)
                LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users

        if not all_images:
            return None

        # If only one page, return it directly
        if len(all_images) == 1:
            return b64encode(all_images[0].tobytes("png")).decode()

        # Combine multiple pages vertically.
        # Calculate dimensions for the combined image
        total_height = sum(pix.height for pix in all_images)
        max_width = max(pix.width for pix in all_images)

        LOGGER.info(f"Combining {len(all_images)} pages into single image ({max_width}x{total_height})")

        # Create a new document with a single page that can hold all images
        output_doc = fitz.open()

        # Create a page with the combined dimensions.
        # Note: PDF pages have a max size, so we scale if needed
        max_pdf_dimension = 14400  # PDF max is ~200 inches at 72 DPI
        scale = 1.0
        if total_height > max_pdf_dimension or max_width > max_pdf_dimension:
            scale = min(max_pdf_dimension / total_height, max_pdf_dimension / max_width)
            total_height = int(total_height * scale)
            max_width = int(max_width * scale)
            LOGGER.warning(f"Scaling down by {scale:.2f} to fit PDF limits")

        page = output_doc.new_page(width=max_width, height=total_height)

        # Insert each image into the page
        y_offset = 0
        for i, pix in enumerate(all_images):
            # Center each image horizontally
            x_offset = (max_width - pix.width * scale) / 2

            # Create rect for image placement
            rect = fitz.Rect(x_offset, y_offset,
                             x_offset + pix.width * scale,
                             y_offset + pix.height * scale)

            # Insert the image
            page.insert_image(rect, pixmap=pix)
            y_offset += pix.height * scale

        # Convert the page to a pixmap
        mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for quality
        combined_pix = page.get_pixmap(matrix=mat)

        # Convert to PNG and return
        img_bytes = combined_pix.tobytes("png")
        output_doc.close()

        return b64encode(img_bytes).decode()

    # ------------------------------------------------------------------
    # 6.3 Extract metrics in batch
    # ------------------------------------------------------------------

    def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
        """Extract performance metrics for multiple enzymes from the identified location in batch."""
        ref_lc = ref.lower()
        image_b64: Optional[str] = None

        # Add campaign context if available
        campaign_context = ""
        if self.campaign_filter:
            campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\n"

        if self._TAB_RE.search(ref_lc):
            # For tables, try to extract the page as an image first
            image_b64 = self._extract_page_png(ref, extract_figure_only=False)
            if not image_b64:
                LOGGER.debug("No page image found for %s - using full page text", ref)
                snippet = self._extract_table_context(ref)
        elif self._FIG_RE.search(ref_lc):
            # For figures, extract just the figure image
            image_b64 = self._extract_page_png(ref, extract_figure_only=True)
            if not image_b64:
                LOGGER.debug("No figure image found for %s - using caption text", ref)
                snippet = self._extract_figure_caption(ref)
        else:
            snippet = self._page_with_reference(ref) or ""

        enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])

        if image_b64:
            # Use batch extraction prompt for image analysis
            prompt = campaign_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
            LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
            tag = "extract_metrics_batch_vision"
        else:
            # Add enzyme names to prompt for batch extraction
            prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\n=== CONTEXT ===\n" + snippet[:4000]
            LOGGER.info("Gemini: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
            tag = "extract_metrics_batch"

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.extract_temperature,
                debug_dir=self.debug_dir,
                tag=tag,
                image_b64=image_b64
            )

            # Handle the response format - expecting a dict with enzyme names as keys
            results = []
            if isinstance(data, dict):
                for enzyme in enzyme_list:
                    enzyme_data = data.get(enzyme, {})
                    if not isinstance(enzyme_data, dict):
                        enzyme_data = {"error": "No data found"}

                    # Normalize keys ("yield" is already used as-is)
                    if "TTN" in enzyme_data and "ttn" not in enzyme_data:
                        enzyme_data["ttn"] = enzyme_data.pop("TTN")

                    # Add metadata
                    enzyme_data["enzyme"] = enzyme
                    enzyme_data["location_ref"] = ref
                    enzyme_data["used_image"] = bool(image_b64)
                    results.append(enzyme_data)
            else:
                # Fallback if response format is unexpected
                LOGGER.warning("Unexpected response format from batch extraction")
                for enzyme in enzyme_list:
                    results.append({
                        "enzyme": enzyme,
                        "location_ref": ref,
                        "used_image": bool(image_b64),
                        "error": "Invalid response format"
                    })

        except Exception as e:
            LOGGER.warning("Failed to extract metrics batch: %s", e)
            results = []
            for enzyme in enzyme_list:
                results.append({
                    "enzyme": enzyme,
                    "location_ref": ref,
                    "used_image": bool(image_b64),
                    "error": str(e)
                })

        return results

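    # Illustrative shape of one record returned above (metric values are
    # hypothetical; metric keys follow PROMPT_EXTRACT_METRICS, the last three
    # keys are the metadata added in this method):
    #   {"yield": 23.5, "ttn": None, "selectivity": "98% ee",
    #    "enzyme": "ApePgb LVQ", "location_ref": "Table S1", "used_image": True}
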
    # Removed extract_iupac_names - substrate scope IUPAC extraction no longer needed

    # ------------------------------------------------------------------
    # 6.4 Model reaction with location finding
    # ------------------------------------------------------------------

    def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
        """Find locations for model reaction scheme, conditions, and IUPAC names."""
        # Collect all text including section titles, captions, and schemes
        all_text = self._collect_captions_and_titles()

        # Also add first few pages of main text and SI
        ms_preview = "\n".join(self.ms_pages[:5])[:5000]
        si_preview = "\n".join(self.si_pages[:10])[:5000] if self.si_pages else ""

        # Add enzyme context if provided
        enzyme_context = ""
        if enzyme_variants and self.campaign_filter:
            enzyme_context = f"""
IMPORTANT CONTEXT:
You are looking for the model reaction used specifically for these enzyme variants:
{', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}

These variants belong to campaign: {self.campaign_filter}

Focus on finding the model reaction that was used to evaluate THESE specific variants.
Different campaigns may use different model reactions.
"""

        prompt = enzyme_context + PROMPT_FIND_MODEL_REACTION_LOCATION + "\n\n=== CAPTIONS AND SECTIONS ===\n" + all_text + "\n\n=== MANUSCRIPT TEXT PREVIEW ===\n" + ms_preview + "\n\n=== SI TEXT PREVIEW ===\n" + si_preview

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.location_temperature,
                debug_dir=self.debug_dir,
                tag="find_model_reaction_locations"
            )
            if not isinstance(data, dict):
                LOGGER.error("Expected dict from Gemini, got: %s", type(data))
                return None
            return data
        except Exception as e:
            LOGGER.error("Failed to find model reaction locations: %s", e)
            return None

    def _get_text_around_location(self, location: str) -> Optional[str]:
        """Extract text around a given location identifier."""
        location_lower = location.lower()

        # Search in all pages
        for page_text in self.all_pages:
            if location_lower in page_text.lower():
                # Find the location and extract context around it
                idx = page_text.lower().index(location_lower)
                start = max(0, idx - 500)
                end = min(len(page_text), idx + 3000)
                return page_text[start:end]

        # If not found in exact form, try pattern matching
        # for scheme/figure references
        if re.search(r"(scheme|figure|table)\s*\d+", location_lower):
            pattern = re.compile(location.replace(" ", r"\s*"), re.I)
            for page_text in self.all_pages:
                match = pattern.search(page_text)
                if match:
                    start = max(0, match.start() - 500)
                    end = min(len(page_text), match.end() + 3000)
                    return page_text[start:end]

        return None

    def _get_extended_text_around_location(self, location: str, before: int = 2000, after: int = 10000) -> Optional[str]:
        """Extract extended text around a given location identifier."""
        location_lower = location.lower()

        # Search in all pages
        for i, page_text in enumerate(self.all_pages):
            if location_lower in page_text.lower():
                # Find the location
                idx = page_text.lower().index(location_lower)

                # Collect text from multiple pages if needed
                result = []

                # Start from current page
                start = max(0, idx - before)
                result.append(page_text[start:])

                # Add subsequent pages up to 'after' characters
                chars_collected = len(page_text) - start
                page_idx = i + 1

                while chars_collected < after + before and page_idx < len(self.all_pages):
                    next_page = self.all_pages[page_idx]
                    chars_to_take = min(len(next_page), after + before - chars_collected)
                    result.append(next_page[:chars_to_take])
                    chars_collected += chars_to_take
                    page_idx += 1

                return "\n".join(result)

        return None

    def _extract_sections_by_title(self, sections: List[str], max_chars_per_section: int = 5000) -> str:
        """Extract text from sections with specific titles."""
        extracted_text = []

        for section_title in sections:
            pattern = re.compile(rf"{re.escape(section_title)}.*?(?=\n\n[A-Z]|\Z)", re.I | re.S)

            # Search in all pages
            for page in self.all_pages:
                match = pattern.search(page)
                if match:
                    section_text = match.group(0)[:max_chars_per_section]
                    extracted_text.append(f"=== {section_title} ===\n{section_text}")
                    break

        return "\n\n".join(extracted_text)

    def _extract_compound_mappings_from_text(
        self,
        extraction_text: str,
        compound_ids: Optional[List[str]] = None,
        tag_suffix: str = "",
    ) -> Dict[str, CompoundMapping]:
        """Helper function to extract compound mappings from provided text."""
        prompt = PROMPT_COMPOUND_MAPPING
        if compound_ids:
            prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
        prompt += "\n\nTEXT:\n" + extraction_text

        tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"

        try:
            data = generate_json_with_retry(
                self.model,
                prompt,
                temperature=self.cfg.model_reaction_temperature,
                debug_dir=self.debug_dir,
                tag=tag,
            )

            mappings = {}
            for item in data.get("compound_mappings", []):
                # Handle both old format (with identifiers list) and new format (with identifier string)
                identifiers = item.get("identifiers", [])
                if not identifiers and item.get("identifier"):
                    identifiers = [item.get("identifier")]

                mapping = CompoundMapping(
                    identifiers=identifiers,
                    iupac_name=item.get("iupac_name", ""),
                    common_names=item.get("common_names", []),
                    compound_type=item.get("compound_type", "unknown"),
                    source_location=item.get("source_location")
                )

                # Create lookup entries for all identifiers and common names
                for identifier in mapping.identifiers + mapping.common_names:
                    if identifier:
                        mappings[identifier.lower().strip()] = mapping

            return mappings

        except Exception as exc:
            LOGGER.error("Failed to extract compound mappings: %s", exc)
            return {}

def _extract_compound_mappings_with_figures(
|
1162
|
+
self,
|
1163
|
+
text: str,
|
1164
|
+
compound_ids: List[str],
|
1165
|
+
figure_images: Dict[str, str],
|
1166
|
+
tag_suffix: str = "",
|
1167
|
+
) -> Dict[str, CompoundMapping]:
|
1168
|
+
"""Extract compound mappings using multimodal approach with figures."""
|
1169
|
+
# Enhanced prompt for figure-based extraction
|
1170
|
+
prompt = """You are analyzing chemical figures and manuscript text to identify compound IUPAC names.
|
1171
|
+
|
1172
|
+
TASK: Find the IUPAC names for these specific compound identifiers: """ + ", ".join(sorted(compound_ids)) + """
|
1173
|
+
|
1174
|
+
Use your best knowledge, Look carefully in:
|
1175
|
+
1. The chemical structures shown in figures - infer IUPAC names from drawn structures
|
1176
|
+
2. Figure captions that may define compounds
|
1177
|
+
3. Text that refers to these compound numbers
|
1178
|
+
4. Reaction schemes showing transformations
|
1179
|
+
|
1180
|
+
|
1181
|
+
IMPORTANT:
|
1182
|
+
- Only provide IUPAC names you can determine from the figures or text
|
1183
|
+
- If a structure is clearly shown in a figure, derive the IUPAC name from it
|
1184
|
+
|
1185
|
+
Return as JSON:
|
1186
|
+
{
|
1187
|
+
"compound_mappings": [
|
1188
|
+
{
|
1189
|
+
"identifier": "compound identifier",
|
1190
|
+
"iupac_name": "IUPAC name",
|
1191
|
+
"common_names": ["common names if any"],
|
1192
|
+
"compound_type": "substrate/product/reagent",
|
1193
|
+
"source_location": "where found (e.g., Figure 3, manuscript text)"
|
1194
|
+
}
|
1195
|
+
]
|
1196
|
+
}
|
1197
|
+
|
1198
|
+
TEXT FROM MANUSCRIPT:
|
1199
|
+
""" + text
|
1200
|
+
|
1201
|
+
# Prepare multimodal content
|
1202
|
+
content_parts = [prompt]
|
1203
|
+
|
1204
|
+
# Add figure images
|
1205
|
+
if figure_images:
|
1206
|
+
for fig_ref, fig_base64 in figure_images.items():
|
1207
|
+
try:
|
1208
|
+
img_bytes = b64decode(fig_base64)
|
1209
|
+
image = PIL.Image.open(io.BytesIO(img_bytes))
|
1210
|
+
content_parts.append(f"\n[Figure: {fig_ref}]")
|
1211
|
+
content_parts.append(image)
|
1212
|
+
LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
|
1213
|
+
except Exception as e:
|
1214
|
+
LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)
|
1215
|
+
|
1216
|
+
tag = f"compound_mapping_{tag_suffix}" if tag_suffix else "compound_mapping"
|
1217
|
+
|
1218
|
+
try:
|
1219
|
+
# Log multimodal call
|
1220
|
+
LOGGER.info("=== GEMINI MULTIMODAL API CALL: COMPOUND_MAPPING_WITH_FIGURES ===")
|
1221
|
+
LOGGER.info("Text prompt length: %d characters", len(prompt))
|
1222
|
+
LOGGER.info("Number of images: %d", len(content_parts) - 1)
|
1223
|
+
LOGGER.info("Compounds to find: %s", ", ".join(sorted(compound_ids)))
|
1224
|
+
|
1225
|
+
# Save debug info
|
1226
|
+
if self.debug_dir:
|
1227
|
+
prompt_file = self.debug_dir / f"{tag}_prompt_{int(time.time())}.txt"
|
1228
|
+
with open(prompt_file, 'w') as f:
|
1229
|
+
f.write(f"=== PROMPT FOR {tag.upper()} ===\n")
|
1230
|
+
f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
1231
|
+
f.write(f"Text length: {len(prompt)} characters\n")
|
1232
|
+
f.write(f"Images included: {len(content_parts) - 1}\n")
|
1233
|
+
for fig_ref in figure_images.keys():
|
1234
|
+
f.write(f" - {fig_ref}\n")
|
1235
|
+
f.write("="*80 + "\n\n")
|
1236
|
+
f.write(prompt)
|
1237
|
+
LOGGER.info("Full prompt saved to: %s", prompt_file)
|
1238
|
+
|
1239
|
+
# Make multimodal API call
|
1240
|
+
response = self.model.generate_content(content_parts)
|
1241
|
+
raw_text = response.text.strip()
|
1242
|
+
|
1243
|
+
# Log response
|
1244
|
+
LOGGER.info("Gemini multimodal response length: %d characters", len(raw_text))
|
1245
|
+
|
1246
|
+
if self.debug_dir:
|
1247
|
+
response_file = self.debug_dir / f"{tag}_response_{int(time.time())}.txt"
|
1248
|
+
with open(response_file, 'w') as f:
|
1249
|
+
f.write(f"=== RESPONSE FOR {tag.upper()} ===\n")
|
1250
|
+
f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
|
1251
|
+
f.write(f"Length: {len(raw_text)} characters\n")
|
1252
|
+
f.write("="*80 + "\n\n")
|
1253
|
+
f.write(raw_text)
|
1254
|
+
LOGGER.info("Full response saved to: %s", response_file)
|
1255
|
+
|
1256
|
+
# Parse JSON
|
1257
|
+
data = json.loads(raw_text.strip('```json').strip('```').strip())
|
1258
|
+
|
1259
|
+
mappings = {}
|
1260
|
+
for item in data.get("compound_mappings", []):
|
1261
|
+
identifiers = item.get("identifiers", [])
|
1262
|
+
if not identifiers and item.get("identifier"):
|
1263
|
+
identifiers = [item.get("identifier")]
|
1264
|
+
|
1265
|
+
mapping = CompoundMapping(
|
1266
|
+
identifiers=identifiers,
|
1267
|
+
iupac_name=item.get("iupac_name", ""),
|
1268
|
+
common_names=item.get("common_names", []),
|
1269
|
+
compound_type=item.get("compound_type", "unknown"),
|
1270
|
+
source_location=item.get("source_location")
|
1271
|
+
)
|
1272
|
+
|
1273
|
+
for identifier in mapping.identifiers + mapping.common_names:
|
1274
|
+
if identifier:
|
1275
|
+
mappings[identifier.lower().strip()] = mapping
|
1276
|
+
|
1277
|
+
return mappings
|
1278
|
+
|
1279
|
+
except Exception as exc:
|
1280
|
+
LOGGER.error("Failed to extract compound mappings with figures: %s", exc)
|
1281
|
+
return {}
|
1282
|
+
|
1283
|
+
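    # A sketch of how the method above is meant to be called and what it
    # returns (the figure dict and values below are hypothetical, not from a
    # real manuscript):
    #
    #   >>> figs = {"Figure 3": fig3_png_b64}   # base64-encoded PNG bytes
    #   >>> maps = extractor._extract_compound_mappings_with_figures(
    #   ...     text="Compound 6a was prepared...", compound_ids=["6a"],
    #   ...     figure_images=figs)
    #   >>> maps["6a"].iupac_name               # keyed by lower-cased identifier
    #
    # Note that content_parts is a flat list mixing strings and PIL.Image
    # objects; the google-generativeai SDK accepts such mixed lists for
    # vision-capable models.
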
    def _extract_compound_mappings_adaptive(
        self,
        compound_ids: List[str],
        initial_sections: Optional[List[str]] = None,
    ) -> Dict[str, CompoundMapping]:
        """Extract compound ID to IUPAC name mappings using an adaptive 3-tier strategy.

        1. First attempts extraction from standard sections
        2. Expands the search to additional sections if compounds are missing
        3. Uses multimodal figure analysis as the final fallback
        """
        if not compound_ids:
            return {}

        LOGGER.info("Starting adaptive compound mapping for %d compounds: %s",
                    len(compound_ids), sorted(compound_ids))

        # Tier 1: Standard sections (manuscript + initial SI sections)
        initial_sections = initial_sections or [
            "General procedure", "Compound characterization",
            "Synthesis", "Experimental", "Materials and methods"
        ]

        # Include manuscript pages (first 10) for model reaction context
        manuscript_text = "\n\n".join(self.ms_pages[:10])

        # Extract from initial sections
        extraction_text = self._extract_sections_by_title(initial_sections)
        if extraction_text:
            extraction_text = manuscript_text + "\n\n" + extraction_text
        else:
            extraction_text = manuscript_text

        # First extraction attempt
        mappings = self._extract_compound_mappings_from_text(
            extraction_text[:50000], compound_ids, tag_suffix="initial"
        )
        LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))

        # Check for missing compounds
        missing_compounds = []
        for cid in compound_ids:
            mapping = mappings.get(cid.lower().strip())
            if not mapping or not mapping.iupac_name:
                missing_compounds.append(cid)

        # Tier 2: Expanded search + multimodal with figures
        if missing_compounds:
            LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
                        len(missing_compounds), sorted(missing_compounds))

            # Additional sections to search
            additional_sections = [
                "Engineering strategy", "Evolution campaign",
                "Screening", "Optimization", "Substrate synthesis",
                "Supporting Information", "Supplementary Methods"
            ]

            # Extract from additional sections
            additional_text = self._extract_sections_by_title(additional_sections)

            # Also extract any figures that might contain compound structures
            figure_images = {}
            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Scheme 1", "Scheme 2"]
            for ref in figure_refs:
                img_b64 = self._extract_page_png(ref, extract_figure_only=True)
                if img_b64:
                    figure_images[ref] = img_b64
                    LOGGER.info("Extracted %s for compound mapping", ref)

            # Try the multimodal approach with figures and expanded text
            if figure_images or additional_text:
                combined_text = additional_text[:30000] if additional_text else ""
                expanded_mappings = self._extract_compound_mappings_with_figures(
                    combined_text, missing_compounds, figure_images, tag_suffix="tier2"
                )

                # Merge new mappings
                new_found = 0
                for key, mapping in expanded_mappings.items():
                    if key not in mappings or not mappings[key].iupac_name:
                        if mapping.iupac_name:
                            mappings[key] = mapping
                            new_found += 1
                            LOGGER.info("Found IUPAC name for '%s': %s",
                                        key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)

                LOGGER.info("Tier 2: Found %d additional compound mappings", new_found)

            # Check again for still-missing compounds
            still_missing = []
            for cid in missing_compounds:
                mapping = mappings.get(cid.lower().strip())
                if not mapping or not mapping.iupac_name:
                    still_missing.append(cid)

            # Tier 3: Full manuscript search with all available figures
            if still_missing:
                LOGGER.info("Tier 3: %d compounds still missing, trying full manuscript search",
                            len(still_missing))

                # Get all SI figures
                si_figure_refs = []
                for page in self.si_pages[:5]:  # Check first 5 SI pages
                    matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
                    si_figure_refs.extend(matches[:5])  # Limit to 5 figures

                # Extract SI figures
                for ref in set(si_figure_refs):
                    if ref not in figure_images:
                        img_b64 = self._extract_page_png(ref, extract_figure_only=True)
                        if img_b64:
                            figure_images[ref] = img_b64
                            LOGGER.info("Extracted %s for final compound mapping", ref)

                # Full-text search including all pages
                full_text = "\n\n".join(self.all_pages[:30])  # First 30 pages

                final_mappings = self._extract_compound_mappings_with_figures(
                    full_text[:50000], still_missing, figure_images, tag_suffix="tier3"
                )

                # Merge final mappings
                final_found = 0
                for key, mapping in final_mappings.items():
                    if key not in mappings or not mappings[key].iupac_name:
                        if mapping.iupac_name:
                            mappings[key] = mapping
                            final_found += 1
                            LOGGER.info("Found IUPAC name for '%s' in final search: %s",
                                        key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)

                LOGGER.info("Tier 3: Found %d additional compound mappings", final_found)

        LOGGER.info("Adaptive compound mapping complete: %d total mappings", len(mappings))
        return mappings

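    # The tiers above escalate only for identifiers that still lack an IUPAC
    # name after the previous tier; the merge rule is always "a new mapping
    # wins only if the existing entry has no iupac_name yet". A hypothetical
    # trace for two compound IDs:
    #
    #   >>> extractor._extract_compound_mappings_adaptive(["6a", "7b"])
    #   # Tier 1: text-only over standard sections        -> resolves "6a"
    #   # Tier 2: extra sections + Figures/Schemes 1-3    -> retries "7b"
    #   # Tier 3: first 30 pages + SI figures (vision)    -> last resort for "7b"
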
    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
        """Extract model reaction information using identified locations and 3-tier compound mapping."""
        # First find the best locations
        locations = self.find_model_reaction_locations(enzyme_variants)
        if not locations:
            LOGGER.warning("Could not find model reaction locations, using fallback approach")
            # Fall back to the old approach but include more manuscript text
            pattern = re.compile(r"(model reaction|general procedure|typical .*run|standard conditions|scheme 1|figure 1)", re.I)
            snippets: List[str] = []
            # Search both manuscript and SI
            for page in self.all_pages:
                if pattern.search(page):
                    para_match = re.search(r"(.{0,3000}?\n\n)", page)
                    if para_match:
                        snippets.append(para_match.group(0))
                if len(snippets) >= 5:
                    break
            text_context = "\n---\n".join(snippets)[:10000]
        else:
            # Gather text from identified locations
            text_snippets = []

            # Always include the manuscript abstract and introduction for context
            if self.ms_pages:
                # The first 3 pages typically contain the abstract, introduction, and model reaction info
                manuscript_intro = "\n\n".join(self.ms_pages[:3])
                text_snippets.append(f"=== MANUSCRIPT INTRODUCTION ===\n{manuscript_intro}")

            # Get model reaction context
            if locations.get("model_reaction_location", {}).get("location"):
                model_loc = locations["model_reaction_location"]["location"]
                LOGGER.info("Looking for model reaction at: %s", model_loc)
                model_text = self._get_text_around_location(model_loc)
                if model_text:
                    text_snippets.append(f"=== {model_loc} ===\n{model_text}")

            # Get conditions context
            if locations.get("conditions_location", {}).get("location"):
                cond_loc = locations["conditions_location"]["location"]
                LOGGER.info("Looking for reaction conditions at: %s", cond_loc)
                cond_text = self._get_text_around_location(cond_loc)
                if cond_text:
                    text_snippets.append(f"=== {cond_loc} ===\n{cond_text}")

            # Get IUPAC-name context from the specific location identified
            if locations.get("iupac_location", {}).get("location"):
                iupac_loc = locations["iupac_location"]["location"]
                LOGGER.info("Looking for IUPAC names at: %s", iupac_loc)

                # If we have compound IDs from the model reaction location, search for them specifically
                compound_ids = locations.get("model_reaction_location", {}).get("compound_ids", [])
                if compound_ids:
                    LOGGER.info("Looking for specific compound IDs: %s", compound_ids)
                    # Search for each compound ID in the SI
                    for compound_id in compound_ids:
                        # Search patterns for compound characterization
                        patterns = [
                            rf"(?:compound\s+)?{re.escape(compound_id)}[:\s]*\([^)]+\)",  # 6a: (IUPAC name)
                            rf"(?:compound\s+)?{re.escape(compound_id)}[.\s]+[A-Z][^.]+",  # 6a. IUPAC name
                            rf"{re.escape(compound_id)}[^:]*:\s*[^.]+",  # Any format with colon
                        ]

                        for page in self.si_pages:
                            for pattern in patterns:
                                match = re.search(pattern, page, re.I)
                                if match:
                                    # Get extended context around the match
                                    start = max(0, match.start() - 200)
                                    end = min(len(page), match.end() + 500)
                                    text_snippets.append(f"=== Compound {compound_id} characterization ===\n{page[start:end]}")
                                    break

                # Also search for substrate names mentioned in the reaction to find their IUPAC equivalents
                # Look for common substrate patterns in compound listings
                substrate_patterns = [
                    r"(?:substrate|reactant|reagent)s?\s*:?\s*([^.]+)",
                    r"(?:starting\s+material)s?\s*:?\s*([^.]+)",
                    r"\d+\.\s*([A-Za-z\s\-]+)(?:\s*\([^)]+\))?",  # numbered compound lists
                ]

                for pattern in substrate_patterns:
                    for page in self.si_pages[:5]:  # Check the first few SI pages
                        matches = re.finditer(pattern, page, re.I)
                        for match in matches:
                            text = match.group(0)
                            if len(text) < 200:  # Reasonable length check
                                start = max(0, match.start() - 100)
                                end = min(len(page), match.end() + 300)
                                snippet = page[start:end]
                                if "prop-2-enoate" in snippet or "diazirin" in snippet:
                                    text_snippets.append(f"=== Substrate characterization ===\n{snippet}")
                                    break

                # Also get general IUPAC context
                iupac_text = self._get_text_around_location(iupac_loc)
                if iupac_text:
                    # Get more context around the identified location
                    extended_iupac_text = self._get_extended_text_around_location(iupac_loc, before=2000, after=10000)
                    if extended_iupac_text:
                        text_snippets.append(f"=== {iupac_loc} ===\n{extended_iupac_text}")
                    else:
                        text_snippets.append(f"=== {iupac_loc} ===\n{iupac_text}")

            text_context = "\n\n".join(text_snippets)[:35000]  # Increased limit for more context

        # Extract figure images for the model reaction if identified
        figure_images = {}
        if locations:
            # Extract images from the model reaction and conditions locations
            for loc_key in ["model_reaction_location", "conditions_location"]:
                loc_info = locations.get(loc_key, {})
                location = loc_info.get("location", "")
                if location and ("figure" in location.lower() or "fig" in location.lower()):
                    # Extract just the figure reference (e.g., "Figure 2" from "Figure 2. Caption...")
                    fig_match = re.search(r"(Figure\s+\d+|Fig\s+\d+|Scheme\s+\d+)", location, re.I)
                    if fig_match:
                        fig_ref = fig_match.group(1)
                        LOGGER.info("Extracting image for %s from %s", fig_ref, loc_key)
                        img_b64 = self._extract_page_png(fig_ref, extract_figure_only=True)
                        if img_b64:
                            figure_images[fig_ref] = img_b64
                            LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)

        # Extract compound IDs from locations
        compound_ids = []
        if locations and locations.get("model_reaction_location", {}).get("compound_ids"):
            compound_ids = locations["model_reaction_location"]["compound_ids"]
            LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)

        # Use the 3-tier compound mapping approach if we have compound IDs
        compound_mappings = {}
        if compound_ids:
            LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)

            # Add the mapped IUPAC names to the context for better extraction
            if compound_mappings:
                mapping_text = "\n\n=== COMPOUND MAPPINGS ===\n"
                for cid in compound_ids:
                    mapping = compound_mappings.get(cid.lower().strip())
                    if mapping and mapping.iupac_name:
                        mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
                text_context += mapping_text

        # Include both manuscript and SI text for better coverage
        prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context

        try:
            # Use multimodal extraction if we have figure images
            if figure_images:
                LOGGER.info("Using multimodal extraction with %d figure images", len(figure_images))
                # Prepare multimodal content
                content_parts = [prompt]

                # Add figure images
                for fig_ref, fig_base64 in figure_images.items():
                    try:
                        img_bytes = b64decode(fig_base64)
                        image = PIL.Image.open(io.BytesIO(img_bytes))
                        content_parts.append(f"\n[Figure: {fig_ref}]")
                        content_parts.append(image)
                    except Exception as e:
                        LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)

                # Use the multimodal model if we have valid images
                if len(content_parts) > 1:
                    # Create multimodal request
                    model = genai.GenerativeModel(
                        model_name=self.cfg.model_name,
                        generation_config={
                            "temperature": self.cfg.model_reaction_temperature,
                            "top_p": self.cfg.top_p,
                            "top_k": 1,
                            "max_output_tokens": self.cfg.max_tokens,
                        }
                    )

                    response = model.generate_content(content_parts)

                    # Parse JSON from the response
                    if response and response.text:
                        # Save debug output
                        if self.debug_dir:
                            timestamp = int(time.time())
                            _dump(prompt, self.debug_dir / f"model_reaction_multimodal_prompt_{timestamp}.txt")
                            _dump(response.text, self.debug_dir / f"model_reaction_multimodal_response_{timestamp}.txt")

                        # Extract JSON from the response
                        text = response.text.strip()
                        if text.startswith("```json"):
                            text = text[7:]
                        if text.endswith("```"):
                            text = text[:-3]
                        data = json.loads(text.strip())
                    else:
                        raise ValueError("Empty response from multimodal model")
                else:
                    # Fall back to text-only extraction
                    data = generate_json_with_retry(
                        self.model,
                        prompt,
                        temperature=self.cfg.model_reaction_temperature,
                        debug_dir=self.debug_dir,
                        tag="model_reaction"
                    )
            else:
                # Standard text-only extraction
                data = generate_json_with_retry(
                    self.model,
                    prompt,
                    temperature=self.cfg.model_reaction_temperature,
                    debug_dir=self.debug_dir,
                    tag="model_reaction"
                )

            # Handle the new array format for substrates/products
            if isinstance(data, dict):
                # If we have compound mappings, enhance the IUPAC names
                if compound_ids and compound_mappings:
                    # Try to map substrate/product lists through compound IDs
                    substrate_list = data.get("substrate_iupac_list", [])
                    if isinstance(substrate_list, list):
                        enhanced_substrates = []
                        for item in substrate_list:
                            # Check if it's a compound ID that we can map
                            mapping = compound_mappings.get(str(item).lower().strip())
                            if mapping and mapping.iupac_name:
                                enhanced_substrates.append(mapping.iupac_name)
                            elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
                                # Keep valid IUPAC names
                                enhanced_substrates.append(str(item))
                        data["substrate_iupac_list"] = enhanced_substrates

                    product_list = data.get("product_iupac_list", [])
                    if isinstance(product_list, list):
                        enhanced_products = []
                        for item in product_list:
                            # Check if it's a compound ID that we can map
                            mapping = compound_mappings.get(str(item).lower().strip())
                            if mapping and mapping.iupac_name:
                                enhanced_products.append(mapping.iupac_name)
                            elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
                                # Keep valid IUPAC names
                                enhanced_products.append(str(item))
                        data["product_iupac_list"] = enhanced_products

                # Validate and convert arrays to semicolon-separated strings for CSV compatibility
                if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
                    # Filter out non-IUPAC names (abbreviations like "1a", "S1", etc.)
                    valid_substrates = [s for s in data["substrate_iupac_list"]
                                        if s and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', s)]
                    # Join with semicolons instead of JSON encoding
                    data["substrate_iupac_list"] = "; ".join(valid_substrates) if valid_substrates else ""
                else:
                    data["substrate_iupac_list"] = ""

                if "product_iupac_list" in data and isinstance(data["product_iupac_list"], list):
                    # Filter out non-IUPAC names
                    valid_products = [p for p in data["product_iupac_list"]
                                      if p and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', p)]
                    # Join with semicolons instead of JSON encoding
                    data["product_iupac_list"] = "; ".join(valid_products) if valid_products else ""
                else:
                    data["product_iupac_list"] = ""

        except Exception as exc:
            LOGGER.error("Failed to extract model reaction: %s", exc)
            data = {
                "substrate_iupac_list": None,
                "product_iupac_list": None,
                "reaction_substrate_concentration": None,
                "cofactor": None,
                "reaction_temperature": None,
                "reaction_ph": None,
                "reaction_buffer": None,
                "reaction_other_conditions": None,
                "error": str(exc)
            }

        # Ensure all expected keys are present
        expected_keys = [
            "substrate_list", "substrate_iupac_list", "product_list", "product_iupac_list",
            "reaction_substrate_concentration", "cofactor", "reaction_temperature",
            "reaction_ph", "reaction_buffer", "reaction_other_conditions"
        ]
        for key in expected_keys:
            data.setdefault(key, None)

        return data

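    # The ID-versus-IUPAC filter above hinges on one regex: bare compound
    # labels such as "1a" or "S3" are rejected, anything else is assumed to
    # be a real chemical name. A standalone illustration:
    #
    #   >>> import re
    #   >>> keep = [s for s in ["1a", "S3", "ethyl 2-diazoacetate"]
    #   ...         if s and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', s)]
    #   >>> keep
    #   ['ethyl 2-diazoacetate']
    #
    # The survivors are then joined with "; " so each CSV cell stays a plain
    # string rather than a JSON-encoded list.
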
    def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
        """Process a single lineage case - still extract based on location."""
        # Even for a single lineage, use location-based extraction
        lineage_analysis = {
            'has_multiple_lineages': False,
            'lineage_groups': [{
                'group_id': self._get_base_location(location['location']),
                'data_location': location['location'],
                'lineage_hint': location.get('lineage_hint', ''),
                'caption': location.get('caption', ''),
                'confidence': location.get('confidence', 0)
            }]
        }

        return self._process_multiple_lineages([location], enzyme_df, lineage_analysis)

    def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
                                                 enzyme_df: pd.DataFrame,
                                                 lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
        """Process multiple lineages by confidence, detecting which enzymes belong to which campaign."""
        # Get all enzyme IDs
        all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else enzyme_df['enzyme'].tolist()
        all_variants = set(all_enzyme_ids)
        variants_with_data = set()
        all_results = []

        # If enzyme_df has a campaign_id column, we can use it to filter
        has_campaign_info = 'campaign_id' in enzyme_df.columns

        # Process locations in order of confidence
        for location in locations:
            if len(variants_with_data) >= len(all_variants):
                LOGGER.info("All variants have data, stopping extraction")
                break

            LOGGER.info("\nProcessing location %s (confidence: %d%%)",
                        location['location'], location.get('confidence', 0))

            # Extract metrics from this location for ALL enzymes
            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, location['location'])

            # Filter to valid metrics
            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]

            if not valid_metrics:
                LOGGER.warning("No valid metrics found in %s", location['location'])
                continue

            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), location['location'])

            # Create a DataFrame for this location
            df_location = pd.DataFrame(valid_metrics)

            # Track which variants we got data for
            new_variants = set(df_location['enzyme'].tolist()) - variants_with_data
            LOGGER.info("Found data for %d new variants in %s", len(new_variants), location['location'])
            variants_with_data.update(new_variants)

            # Determine which campaign/lineage this location represents
            # by checking which variants are present
            location_variants = set(df_location['enzyme'].tolist())

            # If we have campaign info, determine the campaign for this location
            campaign_id = None
            if has_campaign_info:
                # Find which campaign(s) these variants belong to
                if 'enzyme_id' in enzyme_df.columns:
                    variant_campaigns = enzyme_df[enzyme_df['enzyme_id'].isin(location_variants)]['campaign_id'].unique()
                else:
                    variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
                if len(variant_campaigns) == 1:
                    campaign_id = variant_campaigns[0]
                    LOGGER.info("Location %s contains variants from campaign: %s",
                                location['location'], campaign_id)
                elif len(variant_campaigns) > 1:
                    LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
                                   location['location'], variant_campaigns)

            # Extract the model reaction specific to this location/campaign
            location_context = f"Location: {location['location']}"
            if location.get('caption'):
                location_context += f"\nCaption: {location['caption']}"

            # Try to find the model reaction for this specific lineage
            location_model_reaction = self.find_lineage_model_reaction(location['location'], location_context)

            # Get full model reaction info with IUPAC names
            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
                model_info = self._extract_lineage_model_info(location_model_reaction)
            else:
                # Fall back to general model reaction extraction
                # Pass the enzyme variants from this location
                location_enzymes = df_location['enzyme'].unique().tolist()
                model_info = self.gather_model_reaction_info(location_enzymes)

            # Add model reaction info to all enzymes from this location
            for key, value in model_info.items():
                if isinstance(value, list):
                    value = "; ".join(str(v) for v in value) if value else None
                df_location[key] = value

            # Add location and campaign info
            df_location['data_location'] = location['location']
            df_location['location_type'] = location.get('type', 'unknown')
            df_location['location_confidence'] = location.get('confidence', 0)
            # Removed lineage_group column - not working properly
            # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')

            all_results.append(df_location)

            # Log progress
            LOGGER.info("Progress: %d/%d variants have data",
                        len(variants_with_data), len(all_variants))

        if all_results:
            # Combine all results
            df_combined = pd.concat(all_results, ignore_index=True)

            # If we have duplicates (same variant in multiple locations), keep the one with the highest confidence
            if df_combined.duplicated(subset=['enzyme']).any():
                LOGGER.info("Removing duplicates, keeping highest confidence data")
                df_combined = df_combined.sort_values(
                    ['enzyme', 'location_confidence'],
                    ascending=[True, False]
                ).drop_duplicates(subset=['enzyme'], keep='first')

            # Log extraction summary
            LOGGER.info("Extraction complete: %d unique variants from %d locations",
                        len(df_combined), len(all_results))

            if 'data_location' in df_combined.columns:
                for location in df_combined['data_location'].unique():
                    location_enzymes = df_combined[df_combined['data_location'] == location]
                    LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))

            return df_combined
        else:
            LOGGER.warning("No metrics extracted from any location")
            return pd.DataFrame()

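    # Duplicate resolution above uses the standard pandas idiom: sort so the
    # preferred row comes first, then drop later duplicates. A toy example:
    #
    #   >>> import pandas as pd
    #   >>> df = pd.DataFrame({"enzyme": ["A", "A"],
    #   ...                    "location_confidence": [60, 90]})
    #   >>> df.sort_values(["enzyme", "location_confidence"],
    #   ...                ascending=[True, False]
    #   ...     ).drop_duplicates(subset=["enzyme"], keep="first")
    #   # -> the row with confidence 90 is kept for enzyme "A"
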
    def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
                                   enzyme_df: pd.DataFrame,
                                   lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
        """Process multiple lineages where each location represents a different model reaction."""
        all_metrics = []
        lineage_groups = lineage_analysis.get('lineage_groups', [])

        # Get all enzyme IDs for extraction attempts
        all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []

        for group in lineage_groups:
            group_location = group.get('data_location')
            group_id = group.get('group_id')

            # Find the location info
            location_info = next((loc for loc in locations if loc['location'] == group_location), None)
            if not location_info:
                LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
                continue

            LOGGER.info("Processing location %s (%s)", group_location, group_id)

            # Extract metrics from this location for ALL enzymes
            # The extractor will return only those that actually have data
            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)

            # Filter to enzymes that actually had data in this location
            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]

            if not valid_metrics:
                LOGGER.warning("No valid metrics found in %s", group_location)
                continue

            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)

            # Create a DataFrame for this location
            df_location = pd.DataFrame(valid_metrics)

            # Extract the model reaction specific to this location
            # Different locations = different model reactions
            location_context = f"Location: {group_location}"
            if group.get('caption'):
                location_context += f"\nCaption: {group['caption']}"

            location_model_reaction = self.find_lineage_model_reaction(group_location, location_context)

            # Get full model reaction info with IUPAC names
            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
                model_info = self._extract_lineage_model_info(location_model_reaction)
            else:
                # Try to extract the model reaction from this specific location
                # Pass the enzyme variants that have data in this location
                location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
                model_info = self.gather_model_reaction_info(location_enzymes)

            # Add model reaction info to all enzymes from this location
            for key, value in model_info.items():
                if isinstance(value, list):
                    value = "; ".join(str(v) for v in value) if value else None
                df_location[key] = value

            # Add the location identifier
            df_location['data_location'] = group_location
            # Removed lineage_group column - not working properly
            # df_location['lineage_group'] = group.get('lineage_hint', group_id)

            all_metrics.append(df_location)

        if all_metrics:
            # Combine all metrics
            df_combined = pd.concat(all_metrics, ignore_index=True)

            # Log extraction summary
            LOGGER.info("Extraction complete: %d total enzymes from %d locations",
                        len(df_combined), len(all_metrics))

            if 'data_location' in df_combined.columns:
                for location in df_combined['data_location'].unique():
                    location_enzymes = df_combined[df_combined['data_location'] == location]
                    LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))

            return df_combined
        else:
            LOGGER.warning("No metrics extracted from any location")
            return pd.DataFrame()

    def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
        """Check if a metrics row contains any valid performance data."""
        metric_fields = ['yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity']

        for field in metric_fields:
            if metrics_row.get(field) is not None:
                return True

        # Also check other_metrics
        if metrics_row.get('other_metrics') and isinstance(metrics_row['other_metrics'], dict):
            if metrics_row['other_metrics']:  # Non-empty dict
                return True

        return False

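    # Example rows against the check above (hypothetical dicts):
    #
    #   >>> extractor._has_valid_metrics({"yield": 82.0})                 # True
    #   >>> extractor._has_valid_metrics({"other_metrics": {"ee": 99}})   # True
    #   >>> extractor._has_valid_metrics({"yield": None, "ttn": None})    # False
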
    def _filter_locations_by_campaign(self, locations: List[Dict[str, Any]],
                                      enzyme_df: pd.DataFrame) -> List[Dict[str, Any]]:
        """Filter locations to only those relevant to the current campaign."""
        if not self.campaign_filter or 'campaign_id' not in enzyme_df.columns:
            return locations

        # Get enzyme names for this campaign
        id_col = 'enzyme_id' if 'enzyme_id' in enzyme_df.columns else 'enzyme'
        campaign_enzymes = enzyme_df[enzyme_df['campaign_id'] == self.campaign_filter][id_col].tolist()

        # Extract any common patterns from enzyme names
        enzyme_patterns = set()
        for enzyme in campaign_enzymes:
            # Extract any uppercase abbreviations (e.g., 'PYS', 'INS')
            matches = re.findall(r'[A-Z]{2,}', enzyme)
            enzyme_patterns.update(matches)

        LOGGER.info("Campaign %s has enzyme patterns: %s", self.campaign_filter, enzyme_patterns)

        # Get campaign description keywords from the campaign data if available
        campaign_keywords = set()
        # Extract keywords from campaign_id; generic tokens are dropped below
        # (e.g., 'pyrrolidine_synthase_evolution' -> {'pyrrolidine'})
        words = self.campaign_filter.lower().replace('_', ' ').split()
        # Filter out generic words
        generic_words = {'evolution', 'campaign', 'synthase', 'enzyme', 'variant'}
        campaign_keywords.update(word for word in words if word not in generic_words and len(word) > 3)

        LOGGER.info("Campaign keywords: %s", campaign_keywords)

        # Filter locations based on campaign clues
        filtered = []
        for loc in locations:
            # Check the caption and clues for campaign indicators
            caption = loc.get('caption', '').lower()
            campaign_clues = loc.get('campaign_clues', '').lower()
            lineage_hint = loc.get('lineage_hint', '').lower()
            combined_text = caption + ' ' + campaign_clues + ' ' + lineage_hint

            # Check whether the location is relevant to this campaign
            is_relevant = False

            # Check for enzyme patterns
            for pattern in enzyme_patterns:
                if pattern.lower() in combined_text:
                    is_relevant = True
                    break

            # Check for campaign keywords
            if not is_relevant:
                for keyword in campaign_keywords:
                    if keyword in combined_text:
                        is_relevant = True
                        break

            # Check whether any campaign enzymes are explicitly mentioned
            if not is_relevant:
                for enzyme in campaign_enzymes[:5]:  # Check the first few enzymes
                    if enzyme.lower() in combined_text:
                        is_relevant = True
                        break

            if is_relevant:
                filtered.append(loc)
                LOGGER.info("Location %s is relevant to campaign %s",
                            loc.get('location'), self.campaign_filter)
            else:
                LOGGER.debug("Location %s filtered out for campaign %s",
                             loc.get('location'), self.campaign_filter)

        return filtered

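    # Keyword derivation used above, shown standalone: generic tokens are
    # dropped and words of three characters or fewer are ignored.
    #
    #   >>> words = "pyrrolidine_synthase_evolution".replace("_", " ").lower().split()
    #   >>> generic = {"evolution", "campaign", "synthase", "enzyme", "variant"}
    #   >>> {w for w in words if w not in generic and len(w) > 3}
    #   {'pyrrolidine'}
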
    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
        """Extract full model reaction info including IUPAC names for a lineage."""
        # Get substrate/product IDs from the lineage-specific extraction
        substrate_ids = lineage_reaction.get('substrate_ids', [])
        product_ids = lineage_reaction.get('product_ids', [])

        # Get general model reaction info for the conditions
        general_info = self.gather_model_reaction_info()

        # Override the substrate/product lists with lineage-specific ones only if they contain actual compound IDs
        model_info = general_info.copy()

        # Check if substrate_ids contain actual compound IDs (not generic terms like "alkyl azide")
        if substrate_ids and any(re.match(r'^[0-9]+[a-z]?$|^[A-Z][0-9]+$', sid) for sid in substrate_ids):
            model_info['substrate_list'] = substrate_ids
        elif not substrate_ids and general_info.get('substrate_list'):
            # Keep the general info if lineage extraction found nothing
            pass
        else:
            model_info['substrate_list'] = substrate_ids

        # Check if product_ids contain actual compound IDs (not generic terms like "pyrrolidine")
        if product_ids and any(re.match(r'^[0-9]+[a-z]?$|^[A-Z][0-9]+$', pid) for pid in product_ids):
            model_info['product_list'] = product_ids
        elif not product_ids and general_info.get('product_list'):
            # Keep the general info if lineage extraction found nothing
            pass
        else:
            # If we only have generic terms, try to keep the general info if available
            if general_info.get('product_list') and all(len(pid) > 5 for pid in product_ids):
                # Likely generic terms like "pyrrolidine", keep the general info
                pass
            else:
                model_info['product_list'] = product_ids

        # Extract IUPAC names for the compounds we're actually using
        # Use the IDs from model_info (which may have been preserved from the general extraction)
        final_substrate_ids = model_info.get('substrate_list', [])
        final_product_ids = model_info.get('product_list', [])
        all_compound_ids = final_substrate_ids + final_product_ids

        if all_compound_ids:
            compound_mappings = self._extract_compound_mappings_adaptive(all_compound_ids)

            # Map substrate IUPAC names
            substrate_iupacs = []
            for sid in final_substrate_ids:
                mapping = compound_mappings.get(str(sid).lower().strip())
                if mapping and mapping.iupac_name:
                    substrate_iupacs.append(mapping.iupac_name)
            # Only update if we found IUPAC names
            if substrate_iupacs:
                model_info['substrate_iupac_list'] = substrate_iupacs

            # Map product IUPAC names
            product_iupacs = []
            for pid in final_product_ids:
                mapping = compound_mappings.get(str(pid).lower().strip())
                if mapping and mapping.iupac_name:
                    product_iupacs.append(mapping.iupac_name)
            # Only update if we found IUPAC names
            if product_iupacs:
                model_info['product_iupac_list'] = product_iupacs

        return model_info

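    # The override rule above treats "6a"-style or "S1"-style tokens as real
    # compound IDs and leaves generic nouns alone:
    #
    #   >>> import re
    #   >>> [bool(re.match(r'^[0-9]+[a-z]?$|^[A-Z][0-9]+$', s))
    #   ...  for s in ["6a", "S1", "alkyl azide"]]
    #   [True, True, False]
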
    def _process_single_lineage_by_confidence(self, locations: List[Dict[str, Any]],
                                              enzyme_df: pd.DataFrame) -> pd.DataFrame:
        """Process a single lineage by confidence, stopping when all variants have data."""
        # Get the list of all variants we need data for
        all_variants = set(enzyme_df['enzyme'].tolist() if 'enzyme' in enzyme_df.columns else
                           enzyme_df['enzyme_id'].tolist())
        variants_with_data = set()
        all_results = []

        # Process locations in order of confidence
        for location in locations:
            if len(variants_with_data) >= len(all_variants):
                LOGGER.info("All variants have data, stopping extraction")
                break

            LOGGER.info("\nProcessing location %s (confidence: %d%%)",
                        location['location'], location.get('confidence', 0))

            # Extract metrics from this location
            metrics_rows = self.extract_metrics_batch(list(all_variants), location['location'])

            # Filter to valid metrics
            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]

            if not valid_metrics:
                LOGGER.warning("No valid metrics found in %s", location['location'])
                continue

            # Create a DataFrame for this location
            df_location = pd.DataFrame(valid_metrics)

            # Track which variants we got data for
            new_variants = set(df_location['enzyme'].tolist()) - variants_with_data
            LOGGER.info("Found data for %d new variants in %s", len(new_variants), location['location'])
            variants_with_data.update(new_variants)

            # Add location info
            df_location['data_location'] = location['location']
            df_location['location_type'] = location.get('type', 'unknown')
            df_location['location_confidence'] = location.get('confidence', 0)

            all_results.append(df_location)

            # Log progress
            LOGGER.info("Progress: %d/%d variants have data",
                        len(variants_with_data), len(all_variants))

        if all_results:
            # Combine all results
            df_combined = pd.concat(all_results, ignore_index=True)

            # If we have duplicates (same variant in multiple locations), keep the one with the highest confidence
            if df_combined.duplicated(subset=['enzyme']).any():
                LOGGER.info("Removing duplicates, keeping highest confidence data")
                df_combined = df_combined.sort_values(
                    ['enzyme', 'location_confidence'],
                    ascending=[True, False]
                ).drop_duplicates(subset=['enzyme'], keep='first')

            # Extract model reaction info once
            # Pass the enzyme variants we're processing
            enzyme_list = df_combined['enzyme'].unique().tolist()
            model_info = self.gather_model_reaction_info(enzyme_list)

            # Add model reaction info to all rows
            for key, value in model_info.items():
                if isinstance(value, list):
                    value = "; ".join(str(v) for v in value) if value else None
                df_combined[key] = value

            LOGGER.info("Extraction complete: %d unique variants with data", len(df_combined))

            return df_combined
        else:
            LOGGER.warning("No metrics extracted from any location")
            return pd.DataFrame()

    # ------------------------------------------------------------------
    # 6.5 Public orchestrator
    # ------------------------------------------------------------------

    def run(self, enzyme_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        # This module should always have an enzyme CSV provided
        if enzyme_df is None:
            LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
            return pd.DataFrame()

        # Check if we have a campaign_id column - if so, process each campaign separately
        if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
            campaigns = enzyme_df['campaign_id'].unique()
            if len(campaigns) > 1:
                LOGGER.info("Detected %d campaigns in enzyme data - processing each separately", len(campaigns))
                all_campaign_results = []

                for campaign_id in campaigns:
                    LOGGER.info("\n" + "=" * 60)
                    LOGGER.info("Processing campaign: %s", campaign_id)
                    LOGGER.info("=" * 60)

                    # Create a new extractor instance for this campaign
                    campaign_extractor = ReactionExtractor(
                        manuscript=self.manuscript,
                        si=self.si,
                        cfg=self.cfg,
                        debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
                        campaign_filter=campaign_id
                    )

                    # Run extraction for this campaign
                    campaign_df = campaign_extractor.run(enzyme_df)

                    if not campaign_df.empty:
                        # Add the campaign identifier
                        campaign_df['campaign_id'] = campaign_id
                        all_campaign_results.append(campaign_df)
                        LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)

                # Combine results from all campaigns
                if all_campaign_results:
                    combined_df = pd.concat(all_campaign_results, ignore_index=True)
                    LOGGER.info("\nCombined extraction complete: %d total reactions across %d campaigns",
                                len(combined_df), len(campaigns))
                    return combined_df
                else:
                    LOGGER.warning("No reactions extracted from any campaign")
                    return pd.DataFrame()

        # Filter by campaign if specified
        if self.campaign_filter and 'campaign_id' in enzyme_df.columns:
            LOGGER.info("Filtering enzymes for campaign: %s", self.campaign_filter)
            enzyme_df = enzyme_df[enzyme_df['campaign_id'] == self.campaign_filter].copy()
            LOGGER.info("Found %d enzymes for campaign %s", len(enzyme_df), self.campaign_filter)
            if len(enzyme_df) == 0:
                LOGGER.warning("No enzymes found for campaign %s", self.campaign_filter)
                return pd.DataFrame()

        # Find all locations with performance data
        locations = self.find_reaction_locations()
        if not locations:
            LOGGER.error("Failed to find reaction data locations")
            return pd.DataFrame()

        # Filter locations by campaign if specified
        if self.campaign_filter:
            filtered_locations = self._filter_locations_by_campaign(locations, enzyme_df)
            if filtered_locations:
                LOGGER.info("Filtered to %d locations for campaign %s",
                            len(filtered_locations), self.campaign_filter)
                locations = filtered_locations
            else:
                LOGGER.warning("No locations found specifically for campaign %s, using all locations",
                               self.campaign_filter)

        # Sort locations by confidence (highest first) and prefer tables over figures
        locations_sorted = sorted(locations, key=lambda x: (
            x.get('confidence', 0),
            1 if x.get('type') == 'table' else 0  # Prefer tables when confidence is equal
        ), reverse=True)

        LOGGER.info("Found %d reaction data location(s), sorted by confidence:", len(locations_sorted))
        for loc in locations_sorted:
            LOGGER.info("  - %s (%s, confidence: %d%%)",
                        loc.get('location'),
                        loc.get('type'),
                        loc.get('confidence', 0))

        # Analyze whether we have multiple lineages
        lineage_analysis = self.analyze_lineage_groups(locations_sorted, enzyme_df)
        has_multiple_lineages = lineage_analysis.get('has_multiple_lineages', False)

        if has_multiple_lineages:
            LOGGER.info("Multiple lineage groups detected")
            return self._process_multiple_lineages_by_confidence(locations_sorted, enzyme_df, lineage_analysis)
        else:
            LOGGER.info("Single lineage detected, using confidence-based processing")
            return self._process_single_lineage_by_confidence(locations_sorted, enzyme_df)

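    # Location ordering used by run(): sort on a (confidence, table-flag)
    # tuple and reverse it, so ties break toward tables. Standalone sketch
    # with hypothetical locations:
    #
    #   >>> locs = [{"location": "Figure 2", "type": "figure", "confidence": 80},
    #   ...         {"location": "Table 1", "type": "table", "confidence": 80}]
    #   >>> sorted(locs, key=lambda x: (x.get("confidence", 0),
    #   ...                             1 if x.get("type") == "table" else 0),
    #   ...        reverse=True)[0]["location"]
    #   'Table 1'
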
###############################################################################
# 7 - MERGE WITH LINEAGE CSV + SAVE
###############################################################################

def merge_with_lineage_data(
    df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
) -> pd.DataFrame:
    """Left-merge on the 'enzyme' column; the lineage CSV defines the row order."""

    # Handle both 'enzyme' and 'enzyme_id' column names
    if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
        df_lineage = df_lineage.rename(columns={"enzyme_id": "enzyme"})

    if "enzyme" not in df_lineage.columns:
        raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")

    merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
    return merged

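# Example of the merge contract (hypothetical frames): every lineage row is
# kept in its original order, and metrics attach where enzyme names match.
#
#   >>> df_lineage = pd.DataFrame({"enzyme": ["P1", "P2"]})
#   >>> df_metrics = pd.DataFrame({"enzyme": ["P2"], "yield": [91.0]})
#   >>> merge_with_lineage_data(df_lineage, df_metrics)["yield"].tolist()
#   [nan, 91.0]
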
###############################################################################
# 8 - CLI ENTRY-POINT
###############################################################################

def build_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(
        description="Extract enzyme reaction metrics from chemistry PDFs",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("--manuscript", required=True, type=Path)
    p.add_argument("--si", type=Path, help="Supporting-information PDF")
    p.add_argument("--lineage-csv", type=Path)
    p.add_argument("--output", type=Path, default=Path("reaction_metrics.csv"))
    p.add_argument("--verbose", action="store_true")
    p.add_argument(
        "--debug-dir",
        metavar="DIR",
        type=Path,
        help="Write ALL intermediate artefacts (prompts, raw Gemini replies) to DIR",
    )
    return p

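# Typical invocation (file names are placeholders):
#
#   python reaction_info_extractor.py \
#       --manuscript paper.pdf --si si.pdf \
#       --lineage-csv enzymes.csv --output reaction_metrics.csv \
#       --debug-dir ./debug --verbose
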
def main() -> None:
    args = build_parser().parse_args()
    if args.verbose:
        LOGGER.setLevel(logging.DEBUG)
    cfg = Config()
    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)

    # Load enzyme data from the CSV if provided
    enzyme_df = None
    if args.lineage_csv and args.lineage_csv.exists():
        LOGGER.info("Loading enzyme data from CSV…")
        enzyme_df = pd.read_csv(args.lineage_csv)

    # Run extraction with the enzyme data
    df_metrics = extractor.run(enzyme_df)

    if args.lineage_csv and args.lineage_csv.exists() and not df_metrics.empty:
        LOGGER.info("Merging with lineage CSV…")
        df_final = merge_with_lineage_data(enzyme_df, df_metrics)
    else:
        df_final = df_metrics

    df_final.to_csv(args.output, index=False)
    LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)

if __name__ == "__main__":
    main()