debase 0.1.18__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +40 -8
- debase/enzyme_lineage_extractor.py +153 -9
- debase/reaction_info_extractor.py +1181 -493
- debase/substrate_scope_extractor.py +83 -34
- debase/wrapper.py +75 -0
- {debase-0.1.18.dist-info → debase-0.4.0.dist-info}/METADATA +1 -1
- debase-0.4.0.dist-info/RECORD +16 -0
- debase/PIPELINE_FLOW.md +0 -100
- debase-0.1.18.dist-info/RECORD +0 -17
- {debase-0.1.18.dist-info → debase-0.4.0.dist-info}/WHEEL +0 -0
- {debase-0.1.18.dist-info → debase-0.4.0.dist-info}/entry_points.txt +0 -0
- {debase-0.1.18.dist-info → debase-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.18.dist-info → debase-0.4.0.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ Key June 2025 additions
 from __future__ import annotations
 
 import argparse
+import hashlib
 import json
 import logging
 import os
@@ -31,7 +32,9 @@ import re
 import sys
 import time
 from base64 import b64encode, b64decode
+from collections import OrderedDict
 from dataclasses import dataclass, field
+from functools import lru_cache
 from pathlib import Path
 from textwrap import dedent
 from typing import Any, Dict, List, Optional, Tuple
@@ -40,7 +43,6 @@ import fitz  # PyMuPDF - for image extraction
 import google.generativeai as genai  # type: ignore
 import pandas as pd
 from PyPDF2 import PdfReader
-import PIL.Image
 import io
 
 ###############################################################################
@@ -51,7 +53,7 @@ import io
 class Config:
     """Centralised tunables so tests can override them easily."""
 
-    model_name: str = "gemini-
+    model_name: str = "gemini-2.5-flash"
     location_temperature: float = 0.2
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
@@ -117,6 +119,144 @@ def get_model(cfg: Config):
     genai.configure(api_key=api_key)
     return genai.GenerativeModel(cfg.model_name)
 
+# Bounded LRU caches to store prompt/image content by hash (prevents memory leaks)
+
+class LRUCache:
+    """Simple LRU cache implementation."""
+    def __init__(self, maxsize: int):
+        self.maxsize = maxsize
+        self.cache = OrderedDict()
+
+    def get(self, key: str) -> Optional[str]:
+        if key in self.cache:
+            # Move to end (most recently used)
+            self.cache.move_to_end(key)
+            return self.cache[key]
+        return None
+
+    def put(self, key: str, value: str) -> None:
+        if key in self.cache:
+            # Update existing
+            self.cache.move_to_end(key)
+        else:
+            # Add new, evict oldest if needed
+            if len(self.cache) >= self.maxsize:
+                self.cache.popitem(last=False)
+        self.cache[key] = value
+
+    def __len__(self) -> int:
+        return len(self.cache)
+
+# Global bounded caches
+_PROMPT_CACHE = LRUCache(maxsize=1000)
+_IMAGE_CACHE = LRUCache(maxsize=500)  # Images are larger, so smaller cache
+
+def get_cache_stats() -> Dict[str, Any]:
+    """Get cache statistics for debugging."""
+    return {
+        "gemini_cache_info": _cached_gemini_call.cache_info(),
+        "prompt_cache_size": len(_PROMPT_CACHE),
+        "image_cache_size": len(_IMAGE_CACHE),
+    }
+
+@lru_cache(maxsize=1000)
+def _cached_gemini_call(
+    model_name: str,
+    prompt_hash: str,
+    image_hash: Optional[str],
+    temperature: float,
+    max_retries: int,
+) -> str:
+    """Pure cached function for Gemini API calls using only hash keys.
+
+    Args:
+        model_name: Name of the Gemini model
+        prompt_hash: SHA256 hash of the prompt
+        image_hash: SHA256 hash of the image (if any)
+        temperature: Temperature for generation
+        max_retries: Maximum number of retries
+
+    Returns:
+        Raw response text from Gemini
+    """
+    # Retrieve actual content from LRU cache
+    prompt = _PROMPT_CACHE.get(prompt_hash)
+    image_b64 = _IMAGE_CACHE.get(image_hash) if image_hash else None
+
+    if prompt is None:
+        raise RuntimeError(f"Prompt content not found for hash {prompt_hash}")
+
+    # Configure API key (this is idempotent)
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
+    genai.configure(api_key=api_key)
+
+    # Create model instance (not cached since it's lightweight)
+    model = genai.GenerativeModel(model_name)
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Handle image if provided
+            if image_b64:
+                # Decode base64 string to bytes for Gemini API
+                image_bytes = b64decode(image_b64)
+                parts = [prompt, {"mime_type": "image/png", "data": image_bytes}]
+            else:
+                parts = [prompt]
+
+            resp = model.generate_content(
+                parts,
+                generation_config={
+                    "temperature": temperature,
+                    "max_output_tokens": 8192,
+                }
+            )
+            # Track token usage if available
+            try:
+                if hasattr(resp, 'usage_metadata'):
+                    input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
+            return resp.text.strip()
+        except Exception as exc:
+            if attempt == max_retries:
+                raise
+            time.sleep(2 ** attempt)
+
+    # Should never reach here
+    raise RuntimeError("Max retries exceeded")
+
+def _normalize_prompt_for_caching(prompt: str) -> str:
+    """Normalize prompt for better cache hit rates by removing boilerplate and collapsing whitespace."""
+    # Remove common boilerplate lines that don't affect the core query
+    lines = prompt.split('\n')
+    normalized_lines = []
+
+    for line in lines:
+        # Skip timestamp and debug lines
+        if any(skip in line.lower() for skip in ['timestamp:', 'length:', 'characters', '===', '***']):
+            continue
+        # Skip lines that are just separators
+        if line.strip() and not line.strip().replace('=', '').replace('-', '').replace('*', ''):
+            continue
+        # Collapse whitespace but preserve structure
+        normalized_lines.append(' '.join(line.split()))
+
+    # Join and collapse multiple newlines
+    normalized = '\n'.join(normalized_lines)
+    normalized = re.sub(r'\n\s*\n+', '\n\n', normalized)
+
+    return normalized.strip()
+
 def generate_json_with_retry(
     model,
     prompt: str,
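The hunk above splits what is cached from how it is keyed: `functools.lru_cache` memoizes `_cached_gemini_call` on small hash strings, while the full prompt/image payloads live in the bounded `LRUCache` stores. A minimal self-contained sketch of that two-level pattern (the `expensive_call` body is a placeholder, not the package's API call):

import hashlib
from collections import OrderedDict
from functools import lru_cache

_CONTENT = OrderedDict()  # bounded store: digest -> full content
_MAX = 1000

def _put(text: str) -> str:
    digest = hashlib.sha256(text.encode()).hexdigest()
    _CONTENT[digest] = text
    _CONTENT.move_to_end(digest)
    if len(_CONTENT) > _MAX:          # evict least recently used
        _CONTENT.popitem(last=False)
    return digest

@lru_cache(maxsize=1000)
def expensive_call(digest: str) -> str:
    text = _CONTENT.get(digest)       # hash keys keep lru_cache arguments small and hashable
    if text is None:
        raise RuntimeError(f"content for {digest} evicted before use")
    return text.upper()               # stand-in for the real API round-trip

digest = _put("same prompt")
assert expensive_call(digest) is expensive_call(digest)  # second call is a cache hit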
@@ -129,9 +269,17 @@ def generate_json_with_retry(
     image_b64: Optional[str] = None,
 ):
     """Call Gemini with retries & exponential back-off, returning parsed JSON."""
+    # Generate cache keys based on normalized prompt and image content
+    normalized_prompt = _normalize_prompt_for_caching(prompt)
+    prompt_hash = hashlib.sha256(normalized_prompt.encode()).hexdigest()
+    image_hash = hashlib.sha256(image_b64.encode()).hexdigest() if image_b64 else None
+
     # Log prompt details
     LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
     LOGGER.info("Prompt length: %d characters", len(prompt))
+    LOGGER.info("Prompt hash: %s", prompt_hash[:16])
+    if image_hash:
+        LOGGER.info("Image hash: %s", image_hash[:16])
     LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])
 
     # Save full prompt to debug directory if provided
@@ -139,107 +287,142 @@ def generate_json_with_retry(
         debug_path = Path(debug_dir)
         debug_path.mkdir(parents=True, exist_ok=True)
         prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
-        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\n{'='*80}\n\n{prompt}",
+        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{prompt}",
              prompt_file)
         LOGGER.info("Full prompt saved to: %s", prompt_file)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        LOGGER.info("
-
-
-
-
-
-
-
+    try:
+        # Store content in bounded LRU caches for the cached function to retrieve
+        _PROMPT_CACHE.put(prompt_hash, prompt)
+        if image_hash and image_b64:
+            _IMAGE_CACHE.put(image_hash, image_b64)
+
+        # Check if this will be a cache hit
+        cache_info_before = _cached_gemini_call.cache_info()
+
+        # Use cached Gemini call (only with hash keys)
+        LOGGER.info("Calling cached Gemini API...")
+        raw = _cached_gemini_call(
+            model_name=model.model_name,
+            prompt_hash=prompt_hash,
+            image_hash=image_hash,
+            temperature=temperature,
+            max_retries=max_retries,
+        )
+
+        # Log cache performance
+        cache_info_after = _cached_gemini_call.cache_info()
+        if cache_info_after.hits > cache_info_before.hits:
+            LOGGER.info("✓ Cache HIT for prompt hash %s", prompt_hash[:16])
+        else:
+            LOGGER.info("✗ Cache MISS for prompt hash %s", prompt_hash[:16])
+
+        # Log response
+        LOGGER.info("Gemini response length: %d characters", len(raw))
+        LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
+
+        # Save full response to debug directory
+        if debug_dir:
+            response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+            _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{raw}",
+                  response_file)
+            LOGGER.info("Full response saved to: %s", response_file)
 
-
-
-
-
-
-
+        # Remove common Markdown fences more carefully
+        if raw.startswith("```json"):
+            raw = raw[7:].strip()  # Remove ```json
+        elif raw.startswith("```"):
+            raw = raw[3:].strip()  # Remove ```
+
+        if raw.endswith("```"):
+            raw = raw[:-3].strip()  # Remove trailing ```
+
+
+        # Simple JSON parsing approach
+        # Try direct parsing first
+        LOGGER.debug(f"Raw JSON length: {len(raw)}")
+        LOGGER.debug(f"Raw JSON first 200 chars: {raw[:200]}")
+        LOGGER.debug(f"Raw JSON last 200 chars: {raw[-200:]}")
+
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError as e:
+            LOGGER.error(f"JSON parsing failed at position {e.pos}: {e}")
+            LOGGER.error(f"Character at error: {repr(raw[e.pos] if e.pos < len(raw) else 'END')}")
+            LOGGER.error(f"Context: {repr(raw[max(0, e.pos-20):e.pos+20])}")
+
+            # Count braces and quotes for debugging
+            open_braces = raw.count('{')
+            close_braces = raw.count('}')
+            quotes = raw.count('"')
+            LOGGER.error(f"Braces: {open_braces} open, {close_braces} close. Quotes: {quotes}")
+
+            # If that fails, try to extract JSON from the response using a simpler method
             try:
-
-
-
-
-                json_start = -1
-                json_end = -1
-                bracket_stack = []
-                in_string = False
-                escape_next = False
+                # Look for the JSON object start and end
+                start_idx = raw.find('{')
+                if start_idx == -1:
+                    raise json.JSONDecodeError("No JSON object found", raw, 0)
 
-
-
-
-
+                # Find the matching closing brace by counting
+                brace_count = 0
+                end_idx = -1
+                for i in range(start_idx, len(raw)):
+                    if raw[i] == '{':
+                        brace_count += 1
+                    elif raw[i] == '}':
+                        brace_count -= 1
+                        if brace_count == 0:
+                            end_idx = i + 1
+                            break
+
+                if end_idx == -1:
+                    raise json.JSONDecodeError("No matching closing brace found", raw, 0)
+
+                json_str = raw[start_idx:end_idx]
+                LOGGER.debug(f"Extracted JSON string: {json_str[:200]}...")
+                parsed = json.loads(json_str)
+
+            except json.JSONDecodeError:
+                # Final fallback - try to use eval as a last resort (unsafe but functional)
+                try:
+                    # Replace problematic characters and try to parse as Python dict
+                    safe_raw = raw.replace('null', 'None').replace('true', 'True').replace('false', 'False')
+                    start_idx = safe_raw.find('{')
+                    if start_idx == -1:
+                        raise ValueError("No dict found")
 
-
-
-
-
-
-
-
+                    brace_count = 0
+                    end_idx = -1
+                    for i in range(start_idx, len(safe_raw)):
+                        if safe_raw[i] == '{':
+                            brace_count += 1
+                        elif safe_raw[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                end_idx = i + 1
+                                break
 
-            if
-
+                    if end_idx == -1:
+                        raise ValueError("No matching closing brace found")
 
-
-
-
-
-
-
-
-                    if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
-                        if not bracket_stack:  # Found complete JSON
-                            json_end = i + 1
-                            break
-
-            if json_start >= 0 and json_end > json_start:
-                # Extract the JSON portion
-                json_str = raw[json_start:json_end]
-                parsed = json.loads(json_str)
-            else:
-                # Look for simple [] in the response
+                    dict_str = safe_raw[start_idx:end_idx]
+                    parsed = eval(dict_str)  # This is unsafe but we trust our own generated content
+                    LOGGER.warning("Used eval() fallback for JSON parsing")
+
+                except Exception:
+                    # If all else fails, return empty dict
+                    LOGGER.error("All JSON parsing methods failed")
                     if '[]' in raw:
                         parsed = []
                     else:
-                        # No JSON structure found, re-raise the original error
                         raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
-
-
-
-
-
-
-            )
-            if attempt == max_retries:
-                raise
-            time.sleep(2 ** attempt)
+
+        LOGGER.info("Successfully parsed JSON response")
+        return parsed
+    except Exception as exc:
+        LOGGER.error("Cached Gemini call failed: %s", exc)
+        raise
 
 
 ###############################################################################
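The last-resort branch above runs `eval()` on the cleaned model output. `ast.literal_eval` recovers the same dict from literal-only payloads without executing arbitrary expressions; a sketch of that safer alternative (a suggestion only, not what the released code does):

import ast

def parse_dict_fallback(raw: str) -> dict:
    # Map JSON literals to their Python equivalents, as the hunk above does
    safe_raw = raw.replace('null', 'None').replace('true', 'True').replace('false', 'False')
    start = safe_raw.find('{')
    if start == -1:
        raise ValueError("No dict found")
    # literal_eval only accepts literals, so a malicious payload raises instead of running
    return ast.literal_eval(safe_raw[start:safe_raw.rfind('}') + 1])

print(parse_dict_fallback('noise {"yield": 42.0, "ee": null} trailing'))
# {'yield': 42.0, 'ee': None}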
@@ -248,14 +431,14 @@ def generate_json_with_retry(
 
 PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
-Given the following article captions and section titles, identify
+Given the following article captions and section titles, identify most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants.
+activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
 
 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
 - The caption text to identify which campaign/lineage the data is for
-- Enzyme name prefixes
+- Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions
 
 Respond with a JSON array where each element contains:
@@ -267,8 +450,10 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
 
-Tables are preferred over figures
-
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Do not include too much sources, just return 2 or 3 sources.
+Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
+When returning confidence scores, be more accurate and avoid scores that are too close together.
 Respond ONLY with **minified JSON**. NO markdown fences.
 
 Example:
@@ -280,7 +465,8 @@ You are given either (a) the PNG image of a figure panel, or (b) the caption /
 text excerpt that contains numeric reaction performance data for an enzyme.
 
 Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
-
+
+Return a JSON object with the following keys (use **null** only if the value is not mentioned at all):
 * "yield" - yield as percentage with ONE decimal place precision
 * "ttn" - turnover number (total turnovers)
 * "ton" - turnover number if TTN not available
@@ -296,45 +482,66 @@ IMPORTANT:
 - Do NOT extract substrate/product names - these will come from SI
 - Do NOT extract reaction conditions (temperature, pH, time, solvent)
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
+- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
 
 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)
 
 PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
-
-
-
+STEP 1: First, identify ALL X-axis labels in the figure
+- Read each X-axis label from left to right
+- List exactly what text appears under each bar/data point
+- Note: Labels may be abbreviated or use different naming conventions
+
+STEP 2: Match X-axis labels to target enzyme variants
+- Compare each X-axis label against the target enzyme list below
+- Look for partial matches, abbreviations, or similar naming patterns
+- If an X-axis label doesn't match any target enzyme, still include it for completeness
+
+STEP 3: Identify Y-axis scales and what they measure
+- Look at the Y-axis labels and tick marks to understand what each axis measures
+- If there are multiple Y-axes (left and right), read the axis labels and units
+- Note the minimum and maximum values on each axis scale
+- Identify which visual elements (bars, dots, lines) correspond to which axis
+
+STEP 4: Extract values for each matched variant
+- For each X-axis position, identify which visual elements belong to that position
+- LEFT Y-axis (bars): Measure bar height against the left scale by reading tick marks
+- RIGHT Y-axis (dots): Measure dot position against the right scale by reading tick marks
+- CRITICAL: Read actual scale values from the axis labels and tick marks
+- Verify: taller bars should have higher values, higher dots should have higher values
+
+Target enzymes to find and extract:
 {enzyme_names}
 
-
-1.
-2.
-
-
-
-
-
-
-* "
-* "
-* "
-* "
-* "
-* "
-* "
-* "
-
-
-
-
-
--
--
-
-
-
-
-Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
+Instructions:
+1. First, list ALL X-axis labels you can see in the figure
+2. Match each X-axis label to the target enzyme variants
+3. For matched variants, extract both bar heights (left Y-axis) and dot positions (right Y-axis)
+4. Return data only for variants that have clear X-axis labels and are matched to targets
+
+Return JSON with the identified enzyme variant names as keys containing:
+* "x_axis_label" - the exact text from the X-axis for this variant
+* "yield" - percentage from left Y-axis bar height measurement
+* "ttn" - turnover number from right Y-axis dot position measurement
+* "ton" - if TTN not available
+* "selectivity" - if shown
+* "conversion" - if different from yield
+* "tof" - if provided
+* "activity" - if provided
+* "other_metrics" - other metrics
+* "notes" - REQUIRED: Describe the X-axis label, bar position, and dot position (e.g., "X-axis shows P411-CIS, leftmost bar is very short, dot is at bottom")
+
+CRITICAL: Return ONLY valid JSON in this exact format:
+{{"enzyme_name": {{"x_axis_label": "label", "yield": number, "ttn": number, "notes": "description"}}}}
+
+Rules:
+- Use double quotes for all strings
+- No markdown, no commentary, no explanations
+- All values must be properly formatted
+- Ensure JSON is complete and valid
+- Do not truncate or cut off the response
+- IMPORTANT: When extracting data, prioritize the most complete source that shows data for ALL variants. If there are conflicting values between different sources (e.g., bar graph vs text values), use the source that provides complete data for all target enzymes and ignore partial or conflicting values from other sources
 """)
 
 # Removed substrate scope IUPAC extraction - now handled in model reaction only
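The doubled braces in the new JSON template survive `str.format`, which substitutes only `{enzyme_names}`; a quick check of that behaviour:

from textwrap import dedent

template = dedent("""
Target enzymes to find and extract:
{enzyme_names}
CRITICAL: Return ONLY valid JSON in this exact format:
{{"enzyme_name": {{"x_axis_label": "label", "yield": number}}}}
""")

# The doubled braces render as literal {...} in the final prompt
print(template.format(enzyme_names="- P411-CIS\n- P411-gen5"))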
@@ -519,13 +726,25 @@ class ReactionExtractor:
     _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)
 
     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
-                 campaign_filter: Optional[str] = None):
+                 campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
         self.manuscript = manuscript
         self.si = si
         self.cfg = cfg
         self.model = get_model(cfg)
         self.debug_dir = debug_dir
         self.campaign_filter = campaign_filter  # Filter for specific campaign
+        self.all_campaigns = all_campaigns or []  # List of all campaigns for context
+
+        # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
+        self._figure_cache = LRUCache(maxsize=100)  # Figures are large, so smaller cache
+        self._model_reaction_locations_cache = LRUCache(maxsize=50)
+
+        # Cache for compound mappings to avoid repeated API calls (bounded to prevent memory leaks)
+        self._compound_mapping_cache = LRUCache(maxsize=1000)
+        self._compound_mapping_text_cache = LRUCache(maxsize=500)  # Cache text extractions too
+
+        # Cache for reaction locations to avoid repeated API calls (bounded to prevent memory leaks)
+        self._reaction_locations_cache = LRUCache(maxsize=50)
 
         # Create debug directory if specified
         if self.debug_dir:
@@ -551,23 +770,40 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
 
     def _collect_captions_and_titles(self) -> str:
-        #
-        # This catches all variations
-
+        # Pattern to match Table or Figure with optional leading whitespace
+        # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
+        # Also handles cases where there's whitespace before the caption
+        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
        captions: List[str] = []
 
         # Collect from all pages
         all_text = "\n".join(self.all_pages)
 
-        # Find all figure/table captions
+        # Find all figure/table captions with more context
         for match in cap_pattern.finditer(all_text):
             caption_start = match.start()
-
+
+            # Include some context before the caption (up to 200 chars)
+            context_start = max(0, caption_start - 200)
+            # Find the start of the sentence/paragraph before the caption
+            context_text = all_text[context_start:caption_start]
+            last_period = context_text.rfind('.')
+            if last_period != -1:
+                context_start = context_start + last_period + 1
+
+            # For tables, include much more content after the caption to show actual table data
+            # For figures, keep the original limit
+            is_table = match.group(1).lower() == 'table'
+            max_chars = 5000 if is_table else 3000
+
+            # Get up to max_chars or until double newline
             caption_end = all_text.find("\n\n", caption_start)
-            if caption_end == -1 or caption_end - caption_start >
-                caption_end = caption_start +
-
-
+            if caption_end == -1 or caption_end - caption_start > max_chars:
+                caption_end = caption_start + max_chars
+
+            # Include the context and full caption with table content
+            full_caption = all_text[context_start:caption_end].strip()
+            captions.append(full_caption)
 
         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
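The widened `cap_pattern` in this hunk anchors at line start (with optional leading whitespace) and adds a `Supplementary Table` alternative, so mid-sentence mentions are skipped; a small demonstration (sample text is illustrative):

import re

cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)

sample = "intro text.\n  Supplementary Table S2. Yields.\nFigure 3: Lineage.\nsee Table 1 inline"
print([m.group(0).strip() for m in cap_pattern.finditer(sample)])
# ['Supplementary Table S2. Yields.', 'Figure 3: Lineage.'] - the mid-line mention is not matched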
@@ -584,15 +820,47 @@ class ReactionExtractor:
 
     def find_reaction_locations(self) -> List[Dict[str, Any]]:
         """Find all locations containing reaction performance data."""
-        #
+        # Create cache key based on campaign filter
+        cache_key = f"locations_{self.campaign_filter or 'all'}"
+
+        # Check cache first
+        cached_result = self._reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+            return cached_result
+
+        # Add campaign context - always provide context to help model understanding
         campaign_context = ""
         if self.campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+Be extremely careful to only extract data for the {self.campaign_filter} campaign.
+"""
+
             campaign_context = f"""
-IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
-Only return locations that contain data for this specific campaign.
-Ignore locations that contain data for other campaigns.
+IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
+Only return locations that contain data for this specific campaign.
+Ignore locations that contain data for other campaigns.
+{campaigns_warning}
 
-"""
+"""
+        else:
+            # Even for single campaigns, provide context about what to look for
+            campaign_context = f"""
+IMPORTANT: You are looking for performance data showing enzyme evolution progression.
+Look for locations that contain actual performance metrics (yield, TTN, TON, activity, etc.)
+for multiple enzyme variants, not just mutation lists or method descriptions.
+
+Tables may only contain mutation information without performance data - check the actual
+table content below the caption to verify if performance metrics are present.
+Figures with evolutionary lineage data often contain the actual performance matrix.
+
+"""
 
         prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
         try:
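Inside these f-string prompt blocks, `chr(10)` stands in for a newline because backslashes were not permitted inside f-string expressions before Python 3.12; a quick check:

all_campaigns = ["campaign_1", "campaign_2"]
block = f"""ALL CAMPAIGNS IN THIS PAPER:
{chr(10).join([f"- {c}" for c in all_campaigns])}
"""
print(block)
# ALL CAMPAIGNS IN THIS PAPER:
# - campaign_1
# - campaign_2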
@@ -604,13 +872,20 @@ Ignore locations that contain data for other campaigns.
                 tag="find_locations"
             )
             # Handle both single dict (backwards compatibility) and list
+            result = []
             if isinstance(data, dict):
-
+                result = [data]
             elif isinstance(data, list):
-
+                result = data
             else:
                 LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
-
+                result = []
+
+            # Cache the result
+            self._reaction_locations_cache.put(cache_key, result)
+            LOGGER.info("Cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+
+            return result
         except Exception as e:
             LOGGER.error("Failed to find reaction locations: %s", e)
             return []
@@ -686,13 +961,27 @@ Ignore locations that contain data for other campaigns.
         }
 
     def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        """Find the model reaction for a specific lineage group.
+        """Find the model reaction for a specific lineage group.
+        Returns early if no relevant text is found to avoid unnecessary API calls."""
+
         # Gather relevant text near this location
         page_text = self._page_with_reference(location) or ""
 
+        # Early exit if no text found for this location
+        if not page_text or len(page_text.strip()) < 100:
+            LOGGER.info("No sufficient text found for location %s, skipping lineage-specific extraction", location)
+            return {}
+
         # Also check manuscript introduction for model reaction info
         intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""
 
+        # Quick relevance check - look for reaction-related keywords
+        reaction_keywords = ["substrate", "product", "reaction", "compound", "synthesis", "procedure", "method"]
+        combined_text = (page_text + intro_text).lower()
+        if not any(keyword in combined_text for keyword in reaction_keywords):
+            LOGGER.info("No reaction-related keywords found for location %s, skipping lineage extraction", location)
+            return {}
+
         # Build the prompt with location and context
         prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
             location=location,
@@ -702,6 +991,7 @@ Ignore locations that contain data for other campaigns.
             prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
 
         # If we have model reaction locations, include text from those locations too
+        text_added = False
         if model_reaction_locations:
             # Add text from model reaction location
             if model_reaction_locations.get("model_reaction_location", {}).get("location"):
@@ -709,6 +999,7 @@ Ignore locations that contain data for other campaigns.
                 model_text = self._get_text_around_location(model_loc)
                 if model_text:
                     prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
+                    text_added = True
 
             # Add text from conditions location (often contains reaction details)
             if model_reaction_locations.get("conditions_location", {}).get("location"):
@@ -716,8 +1007,15 @@ Ignore locations that contain data for other campaigns.
                 cond_text = self._get_text_around_location(cond_loc)
                 if cond_text:
                     prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
+                    text_added = True
+
+        # If we didn't find any model reaction locations and the page text is sparse, skip
+        if not text_added and len(page_text.strip()) < 500:
+            LOGGER.info("Insufficient context for lineage model reaction extraction at %s", location)
+            return {}
 
         try:
+            LOGGER.info("Attempting lineage-specific model reaction extraction for %s", location)
             data = generate_json_with_retry(
                 self.model,
                 prompt,
@@ -725,7 +1023,15 @@ Ignore locations that contain data for other campaigns.
                 debug_dir=self.debug_dir,
                 tag=f"lineage_model_reaction_{location.replace(' ', '_')}"
             )
-
+
+            # Validate the response has useful information
+            if isinstance(data, dict) and (data.get('substrate_ids') or data.get('product_ids')):
+                LOGGER.info("Lineage model reaction extraction successful for %s", location)
+                return data
+            else:
+                LOGGER.info("Lineage model reaction extraction returned empty results for %s", location)
+                return {}
+
         except Exception as e:
             LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
             return {}
@@ -777,67 +1083,174 @@ Ignore locations that contain data for other campaigns.
         If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""
 
+        # Check cache first
+        cache_key = f"{ref}_{extract_figure_only}"
+        cached_result = self._figure_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.debug("Using cached figure for %s", ref)
+            return cached_result
+
         # For table extraction, use multi-page approach
         if not extract_figure_only:
             pages_with_ref = self._find_pages_with_reference(ref)
             if pages_with_ref:
                 LOGGER.debug(f"Found {len(pages_with_ref)} pages containing {ref}")
-                return self._extract_multiple_pages_png(pages_with_ref)
+                return self._extract_multiple_pages_png(pages_with_ref, ref)
             return None
 
-        # For figure extraction, search both documents
+        # For figure extraction, search both documents for actual figure captions
         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
-
-
-                #
-
-
+
+                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
+                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
+                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                main_figure_num = re.match(r'^(\d+)', figure_num)
+                if main_figure_num:
+                    main_figure_num = main_figure_num.group(1)
+                else:
+                    main_figure_num = figure_num
+
+                caption_patterns = [
+                    rf"^Figure\s+{re.escape(main_figure_num)}\.",  # "Figure 1."
+                    rf"^Figure\s+{re.escape(main_figure_num)}:",  # "Figure 1:"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
+                ]
+
+                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
+                             main_figure_num, ref, caption_patterns)
+
+                caption_found = False
+                cap_rect = None
+
+                for pattern in caption_patterns:
+                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                    if matches:
+                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
+                        # Found actual figure caption, get its position
+                        caption_text = matches.group(0)
+                        text_instances = page.search_for(caption_text, quads=False)
+                        if text_instances:
+                            cap_rect = text_instances[0]
+                            caption_found = True
+                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
+                            break
+
+                if not caption_found:
+                    # Debug: show what figure-related text is actually on this page
+                    figure_mentions = [line.strip() for line in page_text.split('\n')
+                                       if 'figure' in line.lower() and main_figure_num.lower() in line.lower()]
+                    if figure_mentions:
+                        LOGGER.debug("Page %d has figure mentions but no caption match: %s",
+                                     page_number, figure_mentions[:3])
                     continue
-                cap_rect = text_instances[0]  # first match
 
                 if extract_figure_only:
-                    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    # Extract only the area above the caption (the actual figure)
+                    # This excludes caption text and focuses on visual elements
+                    LOGGER.info("Extracting figure area above caption for %s", ref)
+
+                    # Get the page dimensions
+                    page_rect = page.rect
+
+                    # Extract the area above the caption
+                    if cap_rect:
+                        # Extract from top of page to top of caption
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
+                        LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
+                    else:
+                        # If no caption found, use top 80% of page
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
+                        LOGGER.debug("No caption found, using top 80% of page: %s", figure_rect)
+
+                    # Extract the figure area only
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat, clip=figure_rect)
+                    pix = self._ensure_rgb_pixmap(pix)
+                    img_bytes = pix.tobytes("png")
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"figure_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved figure page to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
                 else:
                     # Extract the entire page as an image
-                    mat = fitz.Matrix(
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
                     pix = page.get_pixmap(matrix=mat)
                     pix = self._ensure_rgb_pixmap(pix)
                     img_bytes = pix.tobytes("png")
-
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved page image to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
         return None
 
     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
         """Find all pages containing the reference across documents.
+        Prioritizes pages with actual captions over just references.
         Returns list of (document, page_number) tuples."""
         pages_found = []
+        caption_pages = []
 
         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+
+                # Check for actual figure caption first
+                if ref.lower().startswith('figure'):
+                    figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                    # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                    main_figure_num = re.match(r'^(\d+)', figure_num)
+                    if main_figure_num:
+                        main_figure_num = main_figure_num.group(1)
+                    else:
+                        main_figure_num = figure_num
+
+                    caption_patterns = [
+                        rf"^Figure\s+{re.escape(main_figure_num)}\.",
+                        rf"^Figure\s+{re.escape(main_figure_num)}:",
+                        rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]"
+                    ]
+
+                    for pattern in caption_patterns:
+                        if re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE):
+                            caption_pages.append((doc, page_number))
+                            break
+
+                # Fallback to any mention of the reference
                 if ref.lower() in page_text.lower():
                     pages_found.append((doc, page_number))
-
-
+
+        # Return caption pages first, then other pages
+        return caption_pages + [p for p in pages_found if p not in caption_pages]
 
-    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
+    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]], ref: str = "unknown") -> Optional[str]:
         """Extract multiple pages as a combined PNG image."""
         if not pages:
             return None
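The caption search above first collapses subfigure references to the parent figure number before trying the regex patterns; a sketch of that normalization step (inputs are illustrative):

import re

def main_figure(ref: str) -> str:
    # "Figure 1C" -> "1"; non-numeric refs (e.g. "Figure S3") fall through unchanged
    figure_num = ref.replace('Figure ', '').replace('figure ', '')
    m = re.match(r'^(\d+)', figure_num)
    return m.group(1) if m else figure_num

for ref in ("Figure 1C", "Figure 2", "Figure S3"):
    print(ref, "->", main_figure(ref))
# Figure 1C -> 1, Figure 2 -> 2, Figure S3 -> S3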
@@ -854,12 +1267,7 @@ Ignore locations that contain data for other campaigns.
             pix = self._ensure_rgb_pixmap(pix)
             all_images.append(pix)
 
-            #
-            if i == len(pages) - 1 and page_num + 1 < doc.page_count:
-                next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
-                next_pix = self._ensure_rgb_pixmap(next_pix)
-                all_images.append(next_pix)
-                LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
+            # Only extract the page containing the reference (removed next page logic)
 
         if not all_images:
             return None
@@ -867,7 +1275,17 @@ Ignore locations that contain data for other campaigns.
         # If only one page, return it directly
         if len(all_images) == 1:
             pix = self._ensure_rgb_pixmap(all_images[0])
-
+            img_bytes = pix.tobytes("png")
+
+            # Save debug file if available
+            if self.debug_dir:
+                timestamp = int(time.time())
+                png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                with open(png_file, 'wb') as f:
+                    f.write(img_bytes)
+                LOGGER.info("Saved multi-page image to: %s", png_file)
+
+            return b64encode(img_bytes).decode()
 
         # Combine multiple pages vertically
         if not all_images:
@@ -914,7 +1332,7 @@ Ignore locations that contain data for other campaigns.
             y_offset += pix.height * scale
 
         # Convert the page to a pixmap
-        mat = fitz.Matrix(
+        mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for quality
         combined_pix = page.get_pixmap(matrix=mat)
         combined_pix = self._ensure_rgb_pixmap(combined_pix)
 
@@ -922,17 +1340,95 @@ Ignore locations that contain data for other campaigns.
         img_bytes = combined_pix.tobytes("png")
         output_doc.close()
 
+        # Save debug file if available
+        if self.debug_dir:
+            timestamp = int(time.time())
+            png_file = self.debug_dir / f"combined_pages_{ref.replace(' ', '_')}_{timestamp}.png"
+            with open(png_file, 'wb') as f:
+                f.write(img_bytes)
+            LOGGER.info("Saved combined multi-page image to: %s", png_file)
+
         return b64encode(img_bytes).decode()
 
     # ------------------------------------------------------------------
     # 6.3 Extract metrics in batch
     # ------------------------------------------------------------------
+
+    def _validate_location_exists(self, ref: str) -> bool:
+        """Verify that the referenced location actually exists in the document."""
+        # Search for the actual reference in both manuscript and SI documents
+        docs_to_check = [self.ms_doc]
+        if self.si_doc:
+            docs_to_check.append(self.si_doc)
+
+        for doc in docs_to_check:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+
+                # Look for table references like "Table 1", "Table S1", etc.
+                if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
+                    return True
+
+        return False
+
+    def _validate_context(self, snippet: str, enzyme_list: List[str], ref: str) -> bool:
+        """Validate that the context contains meaningful content for extraction."""
+        if not snippet or len(snippet.strip()) < 50:
+            LOGGER.warning("Insufficient context for extraction from %s - skipping", ref)
+            return False
+
+        # Check if context actually mentions the enzymes we're looking for
+        enzyme_mentions = sum(1 for enzyme in enzyme_list if enzyme.lower() in snippet.lower())
+        if enzyme_mentions == 0:
+            LOGGER.warning("No enzyme mentions found in context for %s - skipping", ref)
+            return False
+
+        # Check for performance-related keywords
+        performance_keywords = ['yield', 'selectivity', 'conversion', 'ee', 'er', 'ttn', 'ton', 'tof', '%', 'percent']
+        has_performance_data = any(keyword in snippet.lower() for keyword in performance_keywords)
+
+        if not has_performance_data:
+            LOGGER.warning("No performance metrics found in context for %s - skipping", ref)
+            return False
+
+        LOGGER.info("Context validated for %s: %d chars, %d enzyme mentions", ref, len(snippet), enzyme_mentions)
+        return True
+
+    def _validate_response(self, data: Dict, enzyme_list: List[str], ref: str) -> bool:
+        """Validate that the response contains meaningful data for the requested enzymes."""
+        if not data or not isinstance(data, dict):
+            LOGGER.warning("Invalid response format from %s - skipping", ref)
+            return False
+
+        # Check if we got data for at least one enzyme
+        enzymes_with_data = 0
+        for enzyme in enzyme_list:
+            enzyme_data = data.get(enzyme, {})
+            if isinstance(enzyme_data, dict) and enzyme_data:
+                # Check if there's at least one non-null metric
+                metrics = ['yield', 'ttn', 'ton', 'selectivity', 'conversion', 'tof', 'activity']
+                has_metric = any(enzyme_data.get(metric) is not None for metric in metrics)
+                if has_metric:
+                    enzymes_with_data += 1
+
+        if enzymes_with_data == 0:
+            LOGGER.warning("No valid metrics found in response from %s - skipping", ref)
+            return False
+
+        LOGGER.info("Response validated for %s: %d enzymes with data", ref, enzymes_with_data)
+        return True
 
     def extract_metrics_batch(self, enzyme_list: List[str], ref: str) -> List[Dict[str, Any]]:
         """Extract performance metrics for multiple enzymes from the identified location in batch."""
         ref_lc = ref.lower()
         image_b64: Optional[str] = None
 
+        # First, validate that the location actually exists in the document
+        if not self._validate_location_exists(ref):
+            LOGGER.warning("Location %s not found in document - skipping", ref)
+            return []
+
         # Add campaign context if available
         campaign_context = ""
         if self.campaign_filter:
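The new `_validate_*` helpers gate each extraction cheapest-first, so a missing reference or an enzyme-free snippet never costs an API call. A standalone sketch of the context gate, with the length threshold and keyword idea taken from the hunk (the sample snippet is illustrative):

def validate_context(snippet: str, enzymes: list) -> bool:
    if not snippet or len(snippet.strip()) < 50:
        return False                  # too little text to be a real table/figure context
    if not any(e.lower() in snippet.lower() for e in enzymes):
        return False                  # none of the requested variants appear
    keywords = ['yield', 'selectivity', 'conversion', 'ee', 'ttn', 'ton', '%']
    return any(k in snippet.lower() for k in keywords)

print(validate_context("Table 1. Variant P411-gen5 gave 92% yield and 4100 TTN under standard conditions.",
                       ["P411-gen5"]))  # True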
@@ -945,19 +1441,55 @@ Ignore locations that contain data for other campaigns.
             LOGGER.debug("No page image found for %s - using full page text", ref)
             snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
-            # For figures, extract just the figure image
+            # For figures, extract just the figure image (same logic as compound mapping)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
                 LOGGER.debug("No figure image found for %s - using caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+            else:
+                # If figure is found, ignore text information - use image only
+                snippet = ""
         else:
             snippet = self._page_with_reference(ref) or ""
 
-
+        # For figures with images, skip text validation and proceed with image extraction
+        if image_b64 and self._FIG_RE.search(ref_lc):
+            LOGGER.info("Using figure image for %s - ignoring text context", ref)
+        elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
+            return []
+
+        # Create enhanced enzyme descriptions with parent/mutation context
+        if hasattr(self, 'enzyme_df') and self.enzyme_df is not None:
+            enzyme_descriptions = []
+            for enzyme in enzyme_list:
+                # Find this enzyme in the dataframe
+                enzyme_row = None
+                if 'enzyme_id' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme_id'] == enzyme]
+                elif 'enzyme' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme'] == enzyme]
+
+                if enzyme_row is not None and len(enzyme_row) > 0:
+                    row = enzyme_row.iloc[0]
+                    parent = row.get('parent_enzyme_id', '')
+                    mutations = row.get('mutations', '')
+
+                    desc = f"- {enzyme}"
+                    if parent and str(parent).strip() and str(parent) != 'nan':
+                        desc += f" (parent: {parent})"
+                    if mutations and str(mutations).strip() and str(mutations) != 'nan':
+                        desc += f" (mutations: {mutations})"
+                    enzyme_descriptions.append(desc)
+                else:
+                    enzyme_descriptions.append(f"- {enzyme}")
+            enzyme_names = "\n".join(enzyme_descriptions)
+        else:
+            enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])
 
         if image_b64:
             # Use batch extraction prompt for image analysis
-
+            location_context = f"\n\nIMPORTANT: You are extracting data from {ref}, which has been identified as the PRIMARY LOCATION containing the most reliable performance data for these enzymes.\n"
+            prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
         else:
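When an `enzyme_df` is attached, each variant line in the prompt is enriched from the parent/mutation columns; a minimal sketch of that description build (column names taken from the hunk, data illustrative):

import pandas as pd

enzyme_df = pd.DataFrame({
    "enzyme_id": ["P411-gen5"],
    "parent_enzyme_id": ["P411-gen4"],
    "mutations": ["A82L S438T"],
})

def describe(enzyme: str) -> str:
    row = enzyme_df[enzyme_df["enzyme_id"] == enzyme]
    if len(row) == 0:
        return f"- {enzyme}"          # unknown variants fall back to the bare name
    r = row.iloc[0]
    desc = f"- {enzyme}"
    if str(r.get("parent_enzyme_id", "")).strip() not in ("", "nan"):
        desc += f" (parent: {r['parent_enzyme_id']})"
    if str(r.get("mutations", "")).strip() not in ("", "nan"):
        desc += f" (mutations: {r['mutations']})"
    return desc

print(describe("P411-gen5"))  # - P411-gen5 (parent: P411-gen4) (mutations: A82L S438T)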
@@ -977,6 +1509,35 @@ Ignore locations that contain data for other campaigns.
             image_b64=image_b64
         )
 
+        # Validate response has meaningful data
+        if not self._validate_response(data, enzyme_list, ref):
+            # If figure extraction failed and we have a figure, try falling back to text
+            if image_b64 and self._FIG_RE.search(ref_lc):
+                LOGGER.warning("Figure extraction from %s returned empty results - falling back to text", ref)
+                snippet = self._extract_figure_caption(ref)
+                if self._validate_context(snippet, enzyme_list, ref):
+                    # Retry with text extraction
+                    format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
+                    prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
+                    LOGGER.info("Gemini: retrying with text extraction for %d enzymes from %s…", len(enzyme_list), ref)
+
+                    data = generate_json_with_retry(
+                        self.model,
+                        prompt,
+                        temperature=self.cfg.extract_temperature,
+                        debug_dir=self.debug_dir,
+                        tag=f"extract_metrics_batch_text_fallback",
+                        image_b64=None
+                    )
+
+                    # Validate the text extraction response
+                    if not self._validate_response(data, enzyme_list, ref):
+                        return []
+                else:
+                    return []
+            else:
+                return []
+
         # Handle the response format - expecting a dict with enzyme names as keys
         results = []
         if isinstance(data, dict):
@@ -1027,6 +1588,15 @@ Ignore locations that contain data for other campaigns.
 
     def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
         """Find locations for model reaction scheme, conditions, and IUPAC names."""
+        # Create cache key based on campaign filter and enzyme variants
+        cache_key = f"{self.campaign_filter}_{hash(tuple(sorted(enzyme_variants)) if enzyme_variants else ())}"
+
+        # Check cache first
+        cached_result = self._model_reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached model reaction locations for campaign: %s", self.campaign_filter)
+            return cached_result
+
         # Collect all text including section titles, captions, and schemes
         all_text = self._collect_captions_and_titles()
 
@@ -1037,13 +1607,25 @@ Ignore locations that contain data for other campaigns.
         # Add enzyme context if provided
         enzyme_context = ""
         if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
             enzyme_context = f"""
 IMPORTANT CONTEXT:
 You are looking for the model reaction used specifically for these enzyme variants:
 {', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
 
 These variants belong to campaign: {self.campaign_filter}
-
+{campaigns_context}
 Focus on finding the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions.
 """
@@ -1061,6 +1643,11 @@ Different campaigns may use different model reactions.
             if not isinstance(data, dict):
                 LOGGER.error("Expected dict from Gemini, got: %s", type(data))
                 return None
+
+            # Cache the result
+            self._model_reaction_locations_cache.put(cache_key, data)
+            LOGGER.info("Cached model reaction locations for campaign: %s", self.campaign_filter)
+
             return data
         except Exception as e:
             LOGGER.error("Failed to find model reaction locations: %s", e)
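
The cache key above concatenates the campaign filter with `hash()` of the sorted variant tuple, so the same variant set yields the same key regardless of input order. A minimal sketch of just the key construction (function name hypothetical):

    from typing import List, Optional

    def location_cache_key(campaign: Optional[str], enzyme_variants: Optional[List[str]]) -> str:
        # Sorting makes the key order-insensitive; hashing keeps it short.
        variants_hash = hash(tuple(sorted(enzyme_variants)) if enzyme_variants else ())
        return f"{campaign}_{variants_hash}"

    assert location_cache_key("campA", ["B", "A"]) == location_cache_key("campA", ["A", "B"])
    print(location_cache_key("campA", ["variant-1", "variant-2"]))

One caveat worth knowing: Python salts string hashes per process (`PYTHONHASHSEED`), so these keys are stable within a run but not across runs — fine for an in-memory LRU cache, unsuitable for anything persisted to disk.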
@@ -1159,9 +1746,12 @@ Different campaigns may use different model reactions.
         extraction_text: str,
         compound_ids: List[str] = None,
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Helper function to extract compound mappings from provided text."""
         prompt = PROMPT_COMPOUND_MAPPING
+        if campaign_filter:
+            prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
         if compound_ids:
             prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
         prompt += "\n\nTEXT:\n" + extraction_text
@@ -1209,6 +1799,7 @@ Different campaigns may use different model reactions.
         compound_ids: List[str],
         figure_images: Dict[str, str],
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound mappings using multimodal approach with figures."""
         # Enhanced prompt for figure-based extraction
@@ -1220,8 +1811,26 @@ Use your best knowledge, Look carefully in:
 1. The chemical structures shown in figures - infer IUPAC names from drawn structures
 2. Figure captions that may define compounds
 3. Text that refers to these compound numbers
-4. Reaction schemes showing transformations
+4. Reaction schemes showing transformations"""
+
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            prompt += f"""
 
+IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system.
+{campaigns_warning}
+Different campaigns may use different numbering systems for compounds.
+Do NOT include compound information from other campaigns."""
+
+        prompt += """
 
 IMPORTANT:
 - Only provide IUPAC names you can determine from the figures or text
@@ -1251,9 +1860,10 @@ TEXT FROM MANUSCRIPT:
         for fig_ref, fig_base64 in figure_images.items():
             try:
                 img_bytes = b64decode(fig_base64)
-                image
+                # Format image for Gemini API
+                image_part = {"mime_type": "image/png", "data": img_bytes}
                 content_parts.append(f"\n[Figure: {fig_ref}]")
-                content_parts.append(
+                content_parts.append(image_part)
                 LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
             except Exception as e:
                 LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)
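
Both multimodal call sites now build `content_parts` as a flat list mixing prompt text with inline-image dicts of the form `{"mime_type": "image/png", "data": <bytes>}`, one of the part formats `GenerativeModel.generate_content()` accepts — replacing the removed `PIL.Image` round-trip. A minimal, stdlib-only sketch of the same assembly (helper name hypothetical):

    from base64 import b64decode
    from typing import Dict, List

    def build_content_parts(prompt: str, figure_images: Dict[str, str]) -> List[object]:
        # generate_content() takes a list interleaving text strings and
        # inline-data dicts; no PIL decode step is needed.
        parts: List[object] = [prompt]
        for fig_ref, fig_base64 in figure_images.items():
            parts.append(f"\n[Figure: {fig_ref}]")
            parts.append({"mime_type": "image/png", "data": b64decode(fig_base64)})
        return parts

    # response = model.generate_content(build_content_parts(prompt, figure_images))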
@@ -1283,6 +1893,21 @@ TEXT FROM MANUSCRIPT:
 
         # Make multimodal API call
         response = self.model.generate_content(content_parts)
+
+        # Track token usage if available
+        try:
+            if hasattr(response, 'usage_metadata'):
+                input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                if input_tokens or output_tokens:
+                    try:
+                        from .wrapper import add_token_usage
+                        add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                    except ImportError:
+                        pass  # wrapper not available
+        except Exception:
+            pass  # token tracking is best-effort
+
         raw_text = response.text.strip()
 
         # Log response
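
The token accounting is deliberately best-effort: every lookup is a guarded `getattr`, and the whole block is wrapped so a missing `usage_metadata` field or an absent `wrapper` module can never break the extraction call that produced the response. A self-contained sketch with a stubbed response object (`_Usage`, `_Response`, and `sink` are stand-ins, not module names):

    class _Usage:
        prompt_token_count = 1234
        candidates_token_count = 567

    class _Response:
        usage_metadata = _Usage()

    def record_usage(response, sink) -> None:
        # Read counts defensively; bookkeeping must never raise into the
        # extraction path, so everything sits under a broad except.
        try:
            usage = getattr(response, "usage_metadata", None)
            if usage is None:
                return
            inp = getattr(usage, "prompt_token_count", 0)
            out = getattr(usage, "candidates_token_count", 0)
            if inp or out:
                sink("reaction_info_extractor", inp, out)
        except Exception:
            pass

    record_usage(_Response(), sink=lambda mod, i, o: print(f"{mod}: in={i} out={o}"))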
@@ -1329,18 +1954,37 @@ TEXT FROM MANUSCRIPT:
         self,
         compound_ids: List[str],
         initial_sections: List[str] = None,
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
-        """Extract compound ID to IUPAC name mappings using
+        """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
-        1. First attempts extraction from
-        2.
-        3. Uses multimodal figure analysis as final fallback
+        1. First attempts extraction from specific SI sections + 10 manuscript pages
+        2. If compounds missing, uses full manuscript + SI with multimodal figure analysis
         """
         if not compound_ids:
             return {}
 
-
-
+        # Check cache first - return cached results for compounds we've already processed
+        cached_mappings = {}
+        uncached_compound_ids = []
+
+        for cid in compound_ids:
+            # Include campaign filter in cache key to prevent cross-campaign contamination
+            cache_key = f"{campaign_filter}_{cid.lower().strip()}" if campaign_filter else cid.lower().strip()
+            cached_mapping = self._compound_mapping_cache.get(cache_key)
+            if cached_mapping is not None:
+                cached_mappings[cid.lower().strip()] = cached_mapping
+                LOGGER.info("Using cached compound mapping for: %s (campaign: %s)", cid, campaign_filter)
+            else:
+                uncached_compound_ids.append(cid)
+
+        # If all compounds are cached, return immediately
+        if not uncached_compound_ids:
+            LOGGER.info("All %d compounds found in cache, skipping API calls", len(compound_ids))
+            return cached_mappings
+
+        LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
+                    len(uncached_compound_ids), sorted(uncached_compound_ids))
 
         # Tier 1: Standard sections (manuscript + initial SI sections)
         initial_sections = initial_sections or [
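
The per-compound cache key mirrors the location cache above but is built per ID: normalise the label, then prefix the campaign so identical compound numbers from different campaigns cannot collide. A sketch with a plain dict standing in for the module's bounded LRUCache (names and values here are illustrative only):

    from typing import Dict, Optional

    cache: Dict[str, str] = {}

    def compound_cache_key(cid: str, campaign: Optional[str]) -> str:
        # Normalise so "6a" and " 6A " collide deliberately, and prefix the
        # campaign so the same label from another campaign does not.
        norm = cid.lower().strip()
        return f"{campaign}_{norm}" if campaign else norm

    cache[compound_cache_key(" 6A ", "campA")] = "iupac-name-for-6a"
    print(compound_cache_key("6a", "campA") in cache)   # True  (same campaign)
    print(compound_cache_key("6a", "campB") in cache)   # False (different campaign)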
@@ -1351,118 +1995,118 @@ TEXT FROM MANUSCRIPT:
         # Include manuscript pages (first 10) for model reaction context
         manuscript_text = "\n\n".join(self.ms_pages[:10])
 
+        # Add campaign context if provided
+        campaign_context = ""
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            campaign_context = f"""
+
+IMPORTANT CAMPAIGN CONTEXT:
+You are extracting compound information specifically for the {campaign_filter} campaign.
+{campaigns_warning}
+Focus ONLY on compound information relevant to the {campaign_filter} campaign/reaction system.
+Do NOT include compound information from other campaigns.
+
+"""
+
         # Extract from initial sections
         extraction_text = self._extract_sections_by_title(initial_sections)
         if extraction_text:
-            extraction_text = manuscript_text + "\n\n" + extraction_text
+            extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
-            extraction_text = manuscript_text
+            extraction_text = manuscript_text + campaign_context
 
-        # First extraction attempt
+        # First extraction attempt - only for uncached compounds
         mappings = self._extract_compound_mappings_from_text(
-            extraction_text[:50000],
+            extraction_text[:50000], uncached_compound_ids, tag_suffix="initial", campaign_filter=campaign_filter
         )
         LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))
 
         # Check for missing compounds
         missing_compounds = []
-        for cid in
+        for cid in uncached_compound_ids:
            mapping = mappings.get(cid.lower().strip())
            if not mapping or not mapping.iupac_name:
                missing_compounds.append(cid)
 
-        # Tier 2
+        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
                         len(missing_compounds), sorted(missing_compounds))
 
-            #
-            additional_sections = [
-                "Engineering strategy", "Evolution campaign",
-                "Screening", "Optimization", "Substrate synthesis",
-                "Supporting Information", "Supplementary Methods"
-            ]
-
-            # Extract from additional sections
-            additional_text = self._extract_sections_by_title(additional_sections)
-
-            # Also extract any figures that might contain compound structures
+            # Get all available figures for compound structure analysis
             figure_images = {}
-
+
+            # Extract main manuscript figures
+            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
             for ref in figure_refs:
                 img_b64 = self._extract_page_png(ref, extract_figure_only=True)
                 if img_b64:
                     figure_images[ref] = img_b64
-                    LOGGER.info("
-
-            # Try multimodal approach with figures and expanded text
-            if figure_images or additional_text:
-                combined_text = additional_text[:30000] if additional_text else ""
-                expanded_mappings = self._extract_compound_mappings_with_figures(
-                    combined_text, missing_compounds, figure_images, tag_suffix="tier2"
-                )
+                    LOGGER.info("Retrieved %s for compound mapping", ref)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            final_mappings = self._extract_compound_mappings_with_figures(
-                full_text[:50000], still_missing, figure_images, tag_suffix="tier3"
-            )
-
-            # Merge final mappings
-            final_found = 0
-            for key, mapping in final_mappings.items():
-                if key not in mappings or not mappings[key].iupac_name:
-                    if mapping.iupac_name:
-                        mappings[key] = mapping
-                        final_found += 1
-                        LOGGER.info("Found IUPAC name for '%s' in final search: %s",
-                                    key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
+            # Get SI figures
+            si_figure_refs = []
+            for page in self.si_pages[:10]:  # Check first 10 SI pages
+                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
+                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
+
+            # Extract SI figures
+            for ref in set(si_figure_refs):
+                if ref not in figure_images:
+                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
+                    if img_b64:
+                        figure_images[ref] = img_b64
+                        LOGGER.info("Extracted %s for compound mapping", ref)
+
+            # Full text search including all pages
+            full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+
+            final_mappings = self._extract_compound_mappings_with_figures(
+                full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            )
+
+            # Merge final mappings with better compound ID matching
+            final_found = 0
+            for key, mapping in final_mappings.items():
+                if key not in mappings or not mappings[key].iupac_name:
+                    if mapping.iupac_name:
+                        mappings[key] = mapping
+                        final_found += 1
+                        iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                        LOGGER.info("Found IUPAC name for '%s' in full search: %s", key, iupac_display)
+
+            LOGGER.info("Tier 2: Found %d additional compound mappings", final_found)
+
+        # Cache all newly found mappings using campaign-aware cache key
+        for key, mapping in mappings.items():
+            cache_key = f"{campaign_filter}_{key}" if campaign_filter else key
+            if self._compound_mapping_cache.get(cache_key) is None:
+                self._compound_mapping_cache.put(cache_key, mapping)
+                iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                LOGGER.info("Cached compound mapping for: %s -> %s (campaign: %s)", key, iupac_display, campaign_filter)
 
-
+                # Also cache without campaign prefix for backward compatibility during integration
+                if campaign_filter:
+                    self._compound_mapping_cache.put(key, mapping)
+
+        # Combine cached and new mappings
+        final_mappings = cached_mappings.copy()
+        final_mappings.update(mappings)
 
-        LOGGER.info("Adaptive compound mapping complete: %d total mappings",
-
+        LOGGER.info("Adaptive compound mapping complete: %d total mappings (%d cached, %d new)",
+                    len(final_mappings), len(cached_mappings), len(mappings))
+        return final_mappings
 
-    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
+    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None, lineage_compound_ids: Optional[Dict[str, List[str]]] = None) -> Dict[str, Any]:
         """Extract model reaction information using identified locations and 3-tier compound mapping."""
         # First find the best locations
         locations = self.find_model_reaction_locations(enzyme_variants)
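
With the old tier-3 pass removed, the method is now a strict two-tier funnel: a cheap pass over targeted sections, then a single expensive full-text-plus-figures pass restricted to whatever is still missing, with later results only filling empty slots. The shape of that funnel, with hypothetical tier callables standing in for the two extraction helpers:

    from typing import Callable, Dict, List

    def two_tier_mapping(
        ids: List[str],
        tier1: Callable[[List[str]], Dict[str, str]],
        tier2: Callable[[List[str]], Dict[str, str]],
    ) -> Dict[str, str]:
        mappings = tier1(ids)                              # cheap, targeted pass
        missing = [i for i in ids if not mappings.get(i)]
        if missing:
            for key, name in tier2(missing).items():       # one expensive pass
                if name and not mappings.get(key):
                    mappings[key] = name                   # fill gaps only
        return mappings

    result = two_tier_mapping(
        ["1a", "2b"],
        tier1=lambda ids: {"1a": "name-for-1a"},
        tier2=lambda ids: {i: f"name-for-{i}" for i in ids},
    )
    print(result)  # {'1a': 'name-for-1a', '2b': 'name-for-2b'}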
@@ -1585,9 +2229,15 @@ TEXT FROM MANUSCRIPT:
                     figure_images[fig_ref] = img_b64
                     LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)
 
-        # Extract compound IDs from locations
+        # Extract compound IDs from locations or use lineage-specific ones
         compound_ids = []
-        if
+        if lineage_compound_ids:
+            # Use lineage-specific compound IDs if provided
+            substrate_ids = lineage_compound_ids.get("substrate_ids", [])
+            product_ids = lineage_compound_ids.get("product_ids", [])
+            compound_ids = substrate_ids + product_ids
+            LOGGER.info("Using lineage-specific compound IDs: %s", compound_ids)
+        elif locations and locations.get("model_reaction_location", {}).get("compound_ids"):
             compound_ids = locations["model_reaction_location"]["compound_ids"]
             LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)
 
@@ -1595,7 +2245,7 @@ TEXT FROM MANUSCRIPT:
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)
+            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
@@ -1606,8 +2256,35 @@ TEXT FROM MANUSCRIPT:
                 mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
             text_context += mapping_text
 
+        # Add campaign context if available
+        campaign_context = ""
+        if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
+            campaign_context = f"""
+IMPORTANT CONTEXT:
+You are extracting the model reaction used specifically for these enzyme variants:
+{', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
+
+These variants belong to campaign: {self.campaign_filter}
+{campaigns_context}
+Focus on extracting the model reaction that was used to evaluate THESE specific variants.
+Different campaigns may use different model reactions and substrates.
+
+"""
+
         # Include both manuscript and SI text for better coverage
-        prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
+        prompt = campaign_context + PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
 
         try:
             # Use multimodal extraction if we have figure images
@@ -1620,9 +2297,10 @@ TEXT FROM MANUSCRIPT:
                 for fig_ref, fig_base64 in figure_images.items():
                     try:
                         img_bytes = b64decode(fig_base64)
-                        image
+                        # Format image for Gemini API
+                        image_part = {"mime_type": "image/png", "data": img_bytes}
                         content_parts.append(f"\n[Figure: {fig_ref}]")
-                        content_parts.append(
+                        content_parts.append(image_part)
                     except Exception as e:
                         LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)
 
@@ -1641,6 +2319,20 @@ TEXT FROM MANUSCRIPT:
 
             response = model.generate_content(content_parts)
 
+            # Track token usage if available
+            try:
+                if hasattr(response, 'usage_metadata'):
+                    input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
             # Parse JSON from response
             if response and response.text:
                 # Save debug output
@@ -1681,32 +2373,59 @@ TEXT FROM MANUSCRIPT:
             if isinstance(data, dict):
                 # If we have compound mappings, enhance the IUPAC names
                 if compound_ids and compound_mappings:
+                    LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
+                                list(compound_mappings.keys()))
+
                     # Try to map substrate/product lists through compound IDs
-                    substrate_list = data.get("substrate_iupac_list", [])
+                    substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                     if isinstance(substrate_list, list):
                         enhanced_substrates = []
                         for item in substrate_list:
+                            item_str = str(item).lower().strip()
                             # Check if it's a compound ID that we can map
-                            mapping = compound_mappings.get(
+                            mapping = compound_mappings.get(item_str)
                             if mapping and mapping.iupac_name:
                                 enhanced_substrates.append(mapping.iupac_name)
+                                LOGGER.info("Mapped substrate '%s' -> '%s'", item, mapping.iupac_name)
                             elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                # Keep valid IUPAC names
+                                # Keep valid IUPAC names that aren't compound IDs
                                 enhanced_substrates.append(str(item))
+                                LOGGER.info("Kept substrate IUPAC name: '%s'", item)
+                            else:
+                                LOGGER.warning("Could not map substrate compound ID '%s'", item)
                         data["substrate_iupac_list"] = enhanced_substrates
 
-                    product_list = data.get("product_iupac_list", [])
+                    product_list = data.get("product_iupac_list", []) or data.get("product_list", [])
                     if isinstance(product_list, list):
                         enhanced_products = []
                         for item in product_list:
+                            item_str = str(item).lower().strip()
                             # Check if it's a compound ID that we can map
-                            mapping = compound_mappings.get(
+                            mapping = compound_mappings.get(item_str)
                             if mapping and mapping.iupac_name:
                                 enhanced_products.append(mapping.iupac_name)
+                                LOGGER.info("Mapped product '%s' -> '%s'", item, mapping.iupac_name)
                             elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                # Keep valid IUPAC names
+                                # Keep valid IUPAC names that aren't compound IDs
                                 enhanced_products.append(str(item))
+                                LOGGER.info("Kept product IUPAC name: '%s'", item)
+                            else:
+                                LOGGER.warning("Could not map product compound ID '%s'", item)
                         data["product_iupac_list"] = enhanced_products
+
+                    # Also try to enhance using both substrate_list and product_list if they contain compound IDs
+                    for list_key, target_key in [("substrate_list", "substrate_iupac_list"), ("product_list", "product_iupac_list")]:
+                        if list_key in data and isinstance(data[list_key], list):
+                            if target_key not in data or not data[target_key]:
+                                enhanced_list = []
+                                for item in data[list_key]:
+                                    item_str = str(item).lower().strip()
+                                    mapping = compound_mappings.get(item_str)
+                                    if mapping and mapping.iupac_name:
+                                        enhanced_list.append(mapping.iupac_name)
+                                        LOGGER.info("Enhanced %s: mapped '%s' -> '%s'", target_key, item, mapping.iupac_name)
+                                if enhanced_list:
+                                    data[target_key] = enhanced_list
 
             # Validate and convert arrays to semicolon-separated strings for CSV compatibility
             if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
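
The enhancement loop treats anything matching `^[0-9]+[a-z]?$|^S\d+$` (e.g. "6a", "S12") as a compound ID: mapped IDs are replaced by their IUPAC names, strings that don't look like IDs are kept as-is, and unmapped bare IDs are dropped rather than leaking into the CSV. A condensed sketch (`enhance_names` is a hypothetical distillation, not a module function):

    import re
    from typing import Dict, List

    # Bare labels like "6", "6a", or "S12" count as compound IDs;
    # anything else is assumed to already be an IUPAC-style name.
    _ID_RE = re.compile(r'^[0-9]+[a-z]?$|^S\d+$')

    def enhance_names(items: List[str], mappings: Dict[str, str]) -> List[str]:
        out = []
        for item in items:
            name = mappings.get(str(item).lower().strip())
            if name:
                out.append(name)          # resolved via compound mapping
            elif item and not _ID_RE.match(str(item)):
                out.append(str(item))     # keep what already looks like a name
            # bare unmapped IDs are dropped rather than emitted as-is
        return out

    print(enhance_names(["6a", "butan-2-one", "S3"], {"6a": "name-for-6a"}))
    # ['name-for-6a', 'butan-2-one']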
@@ -1753,8 +2472,8 @@ TEXT FROM MANUSCRIPT:
         return data
 
     def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
-        """Process a single lineage case -
-        #
+        """Process a single lineage case - use confidence-based processing."""
+        # Create lineage analysis for single location
         lineage_analysis = {
             'has_multiple_lineages': False,
             'lineage_groups': [{
@@ -1766,7 +2485,7 @@ TEXT FROM MANUSCRIPT:
             }]
         }
 
-        return self.
+        return self._process_multiple_lineages_by_confidence([location], enzyme_df, lineage_analysis)
 
     def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
                                                  enzyme_df: pd.DataFrame,
@@ -1781,186 +2500,69 @@ TEXT FROM MANUSCRIPT:
         # If enzyme_df has campaign_id column, we can use it to filter
         has_campaign_info = 'campaign_id' in enzyme_df.columns
 
-        #
-
-
-
-
-
-            LOGGER.info("\nProcessing location %s (confidence: %d%%)",
-                        location['location'], location.get('confidence', 0))
+        # Select the most confident source only
+        best_location = None
+        if locations:
+            # Sort by confidence only
+            locations_sorted = sorted(locations, key=lambda x: -x.get('confidence', 0))
+            best_location = locations_sorted[0]
 
-
-
+            LOGGER.info("Selected primary location: %s (type: %s, confidence: %d%%)",
+                        best_location['location'],
+                        best_location.get('type', 'unknown'),
+                        best_location.get('confidence', 0))
+
+            # Extract metrics from the most confident source only
+            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
 
             # Filter to valid metrics
             valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
 
             if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s",
-
+                LOGGER.warning("No valid metrics found in primary location %s", best_location['location'])
+                return pd.DataFrame()
 
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics),
+            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), best_location['location'])
 
-            # Create DataFrame for
+            # Create DataFrame for the single best location
             df_location = pd.DataFrame(valid_metrics)
 
-            #
-
-
-            variants_with_data.update(new_variants)
+            # Add metadata about the location
+            df_location['data_location'] = best_location['location']
+            df_location['confidence'] = best_location.get('confidence', 0)
 
-
-            # by checking which variants are present
-            location_variants = set(df_location['enzyme'].tolist())
+            LOGGER.info("Successfully extracted data for %d enzymes from primary location", len(df_location))
 
-            #
-
-            if
-
-
-
-            else:
-                variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
-                if len(variant_campaigns) == 1:
-                    campaign_id = variant_campaigns[0]
-                    LOGGER.info("Location %s contains variants from campaign: %s",
-                                location['location'], campaign_id)
-                elif len(variant_campaigns) > 1:
-                    LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
-                                   location['location'], variant_campaigns)
-
-            # Extract model reaction specific to this location/campaign
-            location_context = f"Location: {location['location']}"
-            if location.get('caption'):
-                location_context += f"\nCaption: {location['caption']}"
-
-            # First find model reaction locations for this campaign/enzyme group
+            # Extract model reaction info once for this location
+            location_context = f"Location: {best_location['location']}"
+            if best_location.get('caption'):
+                location_context += f"\nCaption: {best_location['caption']}"
+
+            # Get enzyme list for model reaction
             location_enzymes = df_location['enzyme'].unique().tolist()
+            # Get model reaction locations for this campaign
             model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
 
-            #
-
-                location['location'],
-                location_context,
-                model_reaction_locations
-            )
-
-            # Get full model reaction info with IUPAC names
-            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-                model_info = self._extract_lineage_model_info(location_model_reaction)
-            else:
-                # Fall back to general model reaction extraction
-                # Pass the enzyme variants from this location
-                model_info = self.gather_model_reaction_info(location_enzymes)
-
-            # Add model reaction info to all enzymes from this location
-            for key, value in model_info.items():
-                if isinstance(value, list):
-                    value = "; ".join(str(v) for v in value) if value else None
-                df_location[key] = value
-
-            # Add location and campaign info
-            df_location['data_location'] = location['location']
-            df_location['location_type'] = location.get('type', 'unknown')
-            df_location['location_confidence'] = location.get('confidence', 0)
-            # Remove lineage_group column - not working properly
-            # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')
-
-            all_results.append(df_location)
-
-            # Log progress
-            LOGGER.info("Progress: %d/%d variants have data",
-                        len(variants_with_data), len(all_variants))
-
-        if all_results:
-            # Combine all results
-            df_combined = pd.concat(all_results, ignore_index=True)
-
-            # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
-            if df_combined.duplicated(subset=['enzyme']).any():
-                LOGGER.info("Removing duplicates, keeping highest confidence data")
-                df_combined = df_combined.sort_values(
-                    ['enzyme', 'location_confidence'],
-                    ascending=[True, False]
-                ).drop_duplicates(subset=['enzyme'], keep='first')
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d unique variants from %d locations",
-                        len(df_combined), len(all_results))
+            # Extract model reaction for this location - use unified approach
+            LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
 
-
-            for location in df_combined['data_location'].unique():
-                location_enzymes = df_combined[df_combined['data_location'] == location]
-                LOGGER.info(" - %s: %d enzymes", location, len(location_enzymes))
-
-            return df_combined
-        else:
-            LOGGER.warning("No metrics extracted from any location")
-            return pd.DataFrame()
-
-    def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
-                                   enzyme_df: pd.DataFrame,
-                                   lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
-        """Process multiple lineages where each location represents a different model reaction."""
-        all_metrics = []
-        lineage_groups = lineage_analysis.get('lineage_groups', [])
-
-        # Get all enzyme IDs for extraction attempts
-        all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []
-
-        for group in lineage_groups:
-            group_location = group.get('data_location')
-            group_id = group.get('group_id')
-
-            # Find the location info
-            location_info = next((loc for loc in locations if loc['location'] == group_location), None)
-            if not location_info:
-                LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
-                continue
-
-            LOGGER.info("Processing location %s (%s)", group_location, group_id)
-
-            # Extract metrics from this location for ALL enzymes
-            # The extractor will return only those that actually have data
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)
-
-            # Filter to enzymes that actually had data in this location
-            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
-
-            if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s", group_location)
-                continue
-
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)
-
-            # Create DataFrame for this location
-            df_location = pd.DataFrame(valid_metrics)
-
-            # Extract model reaction specific to this location
-            # Different locations = different model reactions
-            location_context = f"Location: {group_location}"
-            if group.get('caption'):
-                location_context += f"\nCaption: {group['caption']}"
-
-            # First find model reaction locations for this enzyme group
-            location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
-            model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
-
-            # Try to find model reaction for this specific lineage, passing the locations
+            # Try lineage-specific extraction first
             location_model_reaction = self.find_lineage_model_reaction(
-
+                best_location['location'],
                 location_context,
                 model_reaction_locations
             )
 
-            #
+            # Check if lineage extraction was successful
             if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-
+                LOGGER.info("Using lineage-specific model reaction data")
+                model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
             else:
-
-                #
+                LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
+                # Use the comprehensive multimodal approach as fallback
                 model_info = self.gather_model_reaction_info(location_enzymes)
+
+            LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
 
             # Add model reaction info to all enzymes from this location
             for key, value in model_info.items():
@@ -1968,30 +2570,18 @@ TEXT FROM MANUSCRIPT:
                     value = "; ".join(str(v) for v in value) if value else None
                 df_location[key] = value
 
-            # Add location
-            df_location['
-
-            # df_location['lineage_group'] = group.get('lineage_hint', group_id)
-
-            all_metrics.append(df_location)
-
-        if all_metrics:
-            # Combine all metrics
-            df_combined = pd.concat(all_metrics, ignore_index=True)
+            # Add additional location metadata (data_location already set above)
+            df_location['location_type'] = best_location.get('type', 'unknown')
+            df_location['location_confidence'] = best_location.get('confidence', 0)
 
-
-
-                        len(df_combined), len(all_metrics))
+            LOGGER.info("Extraction complete: %d variants from primary location %s",
+                        len(df_location), best_location['location'])
 
-
-
-
-
-
-            return df_combined
-        else:
-            LOGGER.warning("No metrics extracted from any location")
-            return pd.DataFrame()
+            return df_location
+
+        # No locations found
+        LOGGER.warning("No valid locations found for extraction")
+        return pd.DataFrame()
 
     def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
         """Check if a metrics row contains any valid performance data."""
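
`_process_multiple_lineages_by_confidence` now commits to a single primary source: candidates are ordered by their `confidence` score and only the top one is extracted, replacing the old extract-everywhere-then-deduplicate loop. The selection itself is one expression; a runnable sketch with toy location dicts:

    from typing import Any, Dict, List, Optional

    def pick_best_location(locations: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        # Highest confidence wins; a missing score counts as 0.
        if not locations:
            return None
        return max(locations, key=lambda loc: loc.get('confidence', 0))

    locations = [
        {"location": "Table S2", "type": "table", "confidence": 70},
        {"location": "Figure 3", "type": "figure", "confidence": 95},
    ]
    print(pick_best_location(locations)["location"])  # Figure 3

`max` with a key is equivalent to the hunk's `sorted(..., key=lambda x: -x.get('confidence', 0))[0]` but avoids sorting the whole list.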
@@ -2078,14 +2668,18 @@ TEXT FROM MANUSCRIPT:
 
         return filtered
 
-    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
+    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any], enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
         """Extract full model reaction info including IUPAC names for a lineage."""
         # Get substrate/product IDs from lineage-specific extraction
         substrate_ids = lineage_reaction.get('substrate_ids', [])
         product_ids = lineage_reaction.get('product_ids', [])
 
-        # Get general model reaction info for conditions
-
+        # Get general model reaction info for conditions, using lineage-specific compound IDs
+        lineage_ids = {
+            "substrate_ids": substrate_ids,
+            "product_ids": product_ids
+        }
+        general_info = self.gather_model_reaction_info(enzyme_variants, lineage_compound_ids=lineage_ids)
 
         # Override substrate/product lists with lineage-specific ones only if they contain actual compound IDs
         model_info = general_info.copy()
@@ -2231,6 +2825,9 @@ TEXT FROM MANUSCRIPT:
             LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
             return pd.DataFrame()
 
+        # Store enzyme_df for use in extract_metrics_batch
+        self.enzyme_df = enzyme_df
+
         # Check if we have campaign_id column - if so, process each campaign separately
         if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
             campaigns = enzyme_df['campaign_id'].unique()
@@ -2249,15 +2846,16 @@ TEXT FROM MANUSCRIPT:
                     si=self.si,
                     cfg=self.cfg,
                     debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
-                    campaign_filter=campaign_id
+                    campaign_filter=campaign_id,
+                    all_campaigns=campaigns.tolist()
                 )
 
                 # Run extraction for this campaign
                 campaign_df = campaign_extractor.run(enzyme_df)
 
                 if not campaign_df.empty:
-                    # Add campaign identifier
-                    campaign_df['
+                    # Add a temporary campaign identifier for merging
+                    campaign_df['_extraction_campaign'] = campaign_id
                     all_campaign_results.append(campaign_df)
                     LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)
 
@@ -2328,7 +2926,7 @@ TEXT FROM MANUSCRIPT:
 def merge_with_lineage_data(
     df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
 ) -> pd.DataFrame:
-    """
+    """Merge lineage and metrics data ensuring one-to-one mapping per campaign."""
 
     # Handle both 'enzyme' and 'enzyme_id' column names
     if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
@@ -2337,7 +2935,19 @@ def merge_with_lineage_data(
     if "enzyme" not in df_lineage.columns:
         raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")
 
-
+    # Check if we have campaign information to match on
+    if "campaign_id" in df_lineage.columns and "_extraction_campaign" in df_metrics.columns:
+        # Match on both enzyme and campaign to ensure correct pairing
+        df_metrics_temp = df_metrics.copy()
+        df_metrics_temp['campaign_id'] = df_metrics_temp['_extraction_campaign']
+        df_metrics_temp = df_metrics_temp.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
+    else:
+        # Simple merge on enzyme only
+        if "_extraction_campaign" in df_metrics.columns:
+            df_metrics = df_metrics.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
+
     return merged
 
 ###############################################################################
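
The merge now keys on `(enzyme, campaign_id)` whenever the temporary `_extraction_campaign` column is present, because the same variant name can legitimately appear in several campaigns. A toy frame pair shows why a single-key merge would mispair rows:

    import pandas as pd

    # The same variant "P1" was screened in two campaigns with different results.
    df_lineage = pd.DataFrame({
        "enzyme": ["P1", "P1"],
        "campaign_id": ["campA", "campB"],
    })
    df_metrics = pd.DataFrame({
        "enzyme": ["P1", "P1"],
        "_extraction_campaign": ["campA", "campB"],
        "yield": ["91.0%", "12.0%"],
    })

    df_metrics = df_metrics.rename(columns={"_extraction_campaign": "campaign_id"})
    merged = df_lineage.merge(df_metrics, on=["enzyme", "campaign_id"], how="left")
    print(merged)
    # enzyme "P1"/campA keeps 91.0% and "P1"/campB keeps 12.0%; merging on
    # "enzyme" alone would instead produce a 4-row cross product.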
@@ -2366,25 +2976,103 @@ def main() -> None:
     if args.verbose:
         LOGGER.setLevel(logging.DEBUG)
     cfg = Config()
-    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)
 
-    # Load enzyme data from CSV if provided
+    # Load enzyme data from CSV if provided to detect campaign information
     enzyme_df = None
+    campaign_filter = None
+    all_campaigns = None
+
     if args.lineage_csv and args.lineage_csv.exists():
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
-
-
-
+
+        # Detect campaign information from the enzyme CSV
+        if 'campaign_id' in enzyme_df.columns:
+            all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
+            if len(all_campaigns) == 1:
+                campaign_filter = all_campaigns[0]
+                LOGGER.info("Detected single campaign: %s", campaign_filter)
+
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                              campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+                df_metrics = extractor.run(enzyme_df)
+
+            elif len(all_campaigns) > 1:
+                LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
+                all_results = []
+
+                # Process each campaign separately
+                for campaign in all_campaigns:
+                    LOGGER.info("Processing campaign: %s", campaign)
+
+                    # Filter enzyme_df to this campaign
+                    campaign_df = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                    LOGGER.info("Found %d enzymes for campaign %s", len(campaign_df), campaign)
+
+                    if len(campaign_df) == 0:
+                        LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
+                        continue
+
+                    # Create extractor for this campaign
+                    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                                  campaign_filter=campaign, all_campaigns=all_campaigns)
+
+                    # Run extraction for this campaign
+                    campaign_metrics = extractor.run(campaign_df)
+
+                    if not campaign_metrics.empty:
+                        # Merge with lineage data for this campaign
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if "enzyme_id" in campaign_lineage.columns and "enzyme" not in campaign_lineage.columns:
+                            campaign_lineage = campaign_lineage.rename(columns={"enzyme_id": "enzyme"})
+
+                        # Merge campaign metrics with lineage data
+                        campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+
+                        # Save campaign-specific file immediately
+                        output_dir = args.output.parent
+                        base_name = args.output.stem
+                        campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                        campaign_final.to_csv(campaign_file, index=False)
+                        LOGGER.info("Saved %d rows for campaign %s -> %s", len(campaign_final), campaign, campaign_file)
+
+                        # Add the merged data (not just metrics) to final results
+                        all_results.append(campaign_final)
+                        LOGGER.info("Added %d merged results for campaign %s", len(campaign_final), campaign)
+                    else:
+                        LOGGER.warning("No results extracted for campaign %s", campaign)
+
+                        # Still save an empty campaign file with lineage data
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if not campaign_lineage.empty:
+                            output_dir = args.output.parent
+                            base_name = args.output.stem
+                            campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                            campaign_lineage.to_csv(campaign_file, index=False)
+                            LOGGER.info("Saved %d rows (lineage only) for campaign %s -> %s", len(campaign_lineage), campaign, campaign_file)
+
+                # Combine all campaign results
+                if all_results:
+                    df_metrics = pd.concat(all_results, ignore_index=True)
+                    LOGGER.info("Combined results from %d campaigns: %d total rows", len(all_results), len(df_metrics))
+                else:
+                    LOGGER.warning("No results from any campaign")
+                    df_metrics = pd.DataFrame()
+        else:
+            # No campaign information, process all enzymes together
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                          campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+            df_metrics = extractor.run(enzyme_df)
 
-
-
-
-    else
-        df_final = df_metrics
+    # Skip final merge since campaign-specific merges already happened during processing
+    # This avoids duplicate entries when same enzyme appears in multiple campaigns
+    df_final = df_metrics
+    LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
 
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
+
+    # Campaign-specific files are already saved during processing above
 
 if __name__ == "__main__":
     main()
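
When several campaigns are detected, each campaign's merged table is also written to its own CSV next to the combined output, named `<stem>_<campaign>.csv`. The naming rule is small enough to isolate; a sketch (helper name hypothetical, mirroring the `args.output.parent` / `args.output.stem` logic above):

    from pathlib import Path

    def campaign_output_path(output: Path, campaign: str) -> Path:
        # <output dir>/<output stem>_<campaign>.csv, alongside the main file.
        return output.parent / f"{output.stem}_{campaign}.csv"

    print(campaign_output_path(Path("results/reactions.csv"), "campA"))
    # results/reactions_campA.csv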