debase 0.1.19__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +40 -8
- debase/enzyme_lineage_extractor.py +153 -9
- debase/reaction_info_extractor.py +1119 -504
- debase/substrate_scope_extractor.py +83 -34
- debase/wrapper.py +75 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/METADATA +1 -1
- debase-0.4.0.dist-info/RECORD +16 -0
- debase/PIPELINE_FLOW.md +0 -100
- debase-0.1.19.dist-info/RECORD +0 -17
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/WHEEL +0 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/entry_points.txt +0 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.1.19.dist-info → debase-0.4.0.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ Key June 2025 additions
 from __future__ import annotations

 import argparse
+import hashlib
 import json
 import logging
 import os
@@ -31,7 +32,9 @@ import re
 import sys
 import time
 from base64 import b64encode, b64decode
+from collections import OrderedDict
 from dataclasses import dataclass, field
+from functools import lru_cache
 from pathlib import Path
 from textwrap import dedent
 from typing import Any, Dict, List, Optional, Tuple
@@ -40,7 +43,6 @@ import fitz  # PyMuPDF - for image extraction
 import google.generativeai as genai  # type: ignore
 import pandas as pd
 from PyPDF2 import PdfReader
-import PIL.Image
 import io

 ###############################################################################
@@ -51,7 +53,7 @@ import io
 class Config:
     """Centralised tunables so tests can override them easily."""

-    model_name: str = "gemini-
+    model_name: str = "gemini-2.5-flash"
     location_temperature: float = 0.2
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
@@ -117,6 +119,144 @@ def get_model(cfg: Config):
     genai.configure(api_key=api_key)
     return genai.GenerativeModel(cfg.model_name)

+# Bounded LRU caches to store prompt/image content by hash (prevents memory leaks)
+
+class LRUCache:
+    """Simple LRU cache implementation."""
+    def __init__(self, maxsize: int):
+        self.maxsize = maxsize
+        self.cache = OrderedDict()
+
+    def get(self, key: str) -> Optional[str]:
+        if key in self.cache:
+            # Move to end (most recently used)
+            self.cache.move_to_end(key)
+            return self.cache[key]
+        return None
+
+    def put(self, key: str, value: str) -> None:
+        if key in self.cache:
+            # Update existing
+            self.cache.move_to_end(key)
+        else:
+            # Add new, evict oldest if needed
+            if len(self.cache) >= self.maxsize:
+                self.cache.popitem(last=False)
+        self.cache[key] = value
+
+    def __len__(self) -> int:
+        return len(self.cache)
+
+# Global bounded caches
+_PROMPT_CACHE = LRUCache(maxsize=1000)
+_IMAGE_CACHE = LRUCache(maxsize=500)  # Images are larger, so smaller cache
+
+def get_cache_stats() -> Dict[str, Any]:
+    """Get cache statistics for debugging."""
+    return {
+        "gemini_cache_info": _cached_gemini_call.cache_info(),
+        "prompt_cache_size": len(_PROMPT_CACHE),
+        "image_cache_size": len(_IMAGE_CACHE),
+    }
+
+@lru_cache(maxsize=1000)
+def _cached_gemini_call(
+    model_name: str,
+    prompt_hash: str,
+    image_hash: Optional[str],
+    temperature: float,
+    max_retries: int,
+) -> str:
+    """Pure cached function for Gemini API calls using only hash keys.
+
+    Args:
+        model_name: Name of the Gemini model
+        prompt_hash: SHA256 hash of the prompt
+        image_hash: SHA256 hash of the image (if any)
+        temperature: Temperature for generation
+        max_retries: Maximum number of retries
+
+    Returns:
+        Raw response text from Gemini
+    """
+    # Retrieve actual content from LRU cache
+    prompt = _PROMPT_CACHE.get(prompt_hash)
+    image_b64 = _IMAGE_CACHE.get(image_hash) if image_hash else None
+
+    if prompt is None:
+        raise RuntimeError(f"Prompt content not found for hash {prompt_hash}")
+
+    # Configure API key (this is idempotent)
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
+    genai.configure(api_key=api_key)
+
+    # Create model instance (not cached since it's lightweight)
+    model = genai.GenerativeModel(model_name)
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            # Handle image if provided
+            if image_b64:
+                # Decode base64 string to bytes for Gemini API
+                image_bytes = b64decode(image_b64)
+                parts = [prompt, {"mime_type": "image/png", "data": image_bytes}]
+            else:
+                parts = [prompt]
+
+            resp = model.generate_content(
+                parts,
+                generation_config={
+                    "temperature": temperature,
+                    "max_output_tokens": 8192,
+                }
+            )
+            # Track token usage if available
+            try:
+                if hasattr(resp, 'usage_metadata'):
+                    input_tokens = getattr(resp.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(resp.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
+            return resp.text.strip()
+        except Exception as exc:
+            if attempt == max_retries:
+                raise
+            time.sleep(2 ** attempt)
+
+    # Should never reach here
+    raise RuntimeError("Max retries exceeded")
+
+def _normalize_prompt_for_caching(prompt: str) -> str:
+    """Normalize prompt for better cache hit rates by removing boilerplate and collapsing whitespace."""
+    # Remove common boilerplate lines that don't affect the core query
+    lines = prompt.split('\n')
+    normalized_lines = []
+
+    for line in lines:
+        # Skip timestamp and debug lines
+        if any(skip in line.lower() for skip in ['timestamp:', 'length:', 'characters', '===', '***']):
+            continue
+        # Skip lines that are just separators
+        if line.strip() and not line.strip().replace('=', '').replace('-', '').replace('*', ''):
+            continue
+        # Collapse whitespace but preserve structure
+        normalized_lines.append(' '.join(line.split()))
+
+    # Join and collapse multiple newlines
+    normalized = '\n'.join(normalized_lines)
+    normalized = re.sub(r'\n\s*\n+', '\n\n', normalized)

+    return normalized.strip()
+
 def generate_json_with_retry(
     model,
     prompt: str,
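The caching layer added above is worth pausing on: `functools.lru_cache` can only key on small hashable arguments, so `_cached_gemini_call` receives SHA-256 digests while the bulky prompt/image payloads are parked in the bounded `LRUCache` side tables. A minimal, self-contained sketch of the same pattern follows; the names `BoundedLRU`, `expensive_call`, and `cached_call` are illustrative stand-ins, not part of the debase API.

```python
import hashlib
from collections import OrderedDict
from functools import lru_cache

class BoundedLRU:
    """Bounded LRU dict: evicts the least-recently-used entry once full."""
    def __init__(self, maxsize: int):
        self.maxsize = maxsize
        self.cache = OrderedDict()

    def get(self, key):
        if key in self.cache:
            self.cache.move_to_end(key)  # mark as most recently used
            return self.cache[key]
        return None

    def put(self, key, value):
        if key in self.cache:
            self.cache.move_to_end(key)
        elif len(self.cache) >= self.maxsize:
            self.cache.popitem(last=False)  # evict the oldest entry
        self.cache[key] = value

_PAYLOADS = BoundedLRU(maxsize=100)

def expensive_call(payload: str) -> str:
    """Stand-in for the real API request."""
    return payload.upper()

@lru_cache(maxsize=100)
def cached_call(payload_hash: str) -> str:
    # lru_cache keys on the small hash string; the bulky payload lives
    # in the side table and is only fetched on a cache miss.
    payload = _PAYLOADS.get(payload_hash)
    if payload is None:
        raise RuntimeError(f"payload missing for hash {payload_hash}")
    return expensive_call(payload)

def call(payload: str) -> str:
    h = hashlib.sha256(payload.encode()).hexdigest()
    _PAYLOADS.put(h, payload)  # stash content before the hash-keyed call
    return cached_call(h)

assert call("extract yield data") == call("extract yield data")  # second call hits the cache
```

One subtlety the real code accepts: if the side table evicts a payload whose hash has never produced an `lru_cache` hit, the next miss raises rather than silently rebuilding the content.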
@@ -129,9 +269,17 @@ def generate_json_with_retry(
     image_b64: Optional[str] = None,
 ):
     """Call Gemini with retries & exponential back-off, returning parsed JSON."""
+    # Generate cache keys based on normalized prompt and image content
+    normalized_prompt = _normalize_prompt_for_caching(prompt)
+    prompt_hash = hashlib.sha256(normalized_prompt.encode()).hexdigest()
+    image_hash = hashlib.sha256(image_b64.encode()).hexdigest() if image_b64 else None
+
     # Log prompt details
     LOGGER.info("=== GEMINI API CALL: %s ===", tag.upper())
     LOGGER.info("Prompt length: %d characters", len(prompt))
+    LOGGER.info("Prompt hash: %s", prompt_hash[:16])
+    if image_hash:
+        LOGGER.info("Image hash: %s", image_hash[:16])
     LOGGER.info("First 500 chars of prompt:\n%s\n...(truncated)", prompt[:500])

     # Save full prompt to debug directory if provided
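Cache keys are derived from a normalized prompt, so prompts that differ only in volatile boilerplate (timestamps, separators, whitespace runs) hash identically. A trimmed illustration of the idea; `normalize` is a simplified stand-in for `_normalize_prompt_for_caching`, which additionally drops separator-only lines.

```python
import hashlib
import re

def normalize(prompt: str) -> str:
    kept = []
    for line in prompt.split('\n'):
        if 'timestamp:' in line.lower():     # drop volatile debug lines
            continue
        kept.append(' '.join(line.split()))  # collapse runs of whitespace
    return re.sub(r'\n\s*\n+', '\n\n', '\n'.join(kept)).strip()

a = "Timestamp: 2025-06-01 12:00\nExtract   yield data"
b = "Timestamp: 2025-06-02 09:30\nExtract yield data"
assert hashlib.sha256(normalize(a).encode()).hexdigest() == \
       hashlib.sha256(normalize(b).encode()).hexdigest()
```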
@@ -139,107 +287,142 @@ def generate_json_with_retry(
         debug_path = Path(debug_dir)
         debug_path.mkdir(parents=True, exist_ok=True)
         prompt_file = debug_path / f"{tag}_prompt_{int(time.time())}.txt"
-        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\n{'='*80}\n\n{prompt}",
+        _dump(f"=== PROMPT FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(prompt)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{prompt}",
              prompt_file)
         LOGGER.info("Full prompt saved to: %s", prompt_file)

-[22 removed lines not preserved in this rendering]
-        LOGGER.info("
-[7 removed lines not preserved in this rendering]
+    try:
+        # Store content in bounded LRU caches for the cached function to retrieve
+        _PROMPT_CACHE.put(prompt_hash, prompt)
+        if image_hash and image_b64:
+            _IMAGE_CACHE.put(image_hash, image_b64)
+
+        # Check if this will be a cache hit
+        cache_info_before = _cached_gemini_call.cache_info()
+
+        # Use cached Gemini call (only with hash keys)
+        LOGGER.info("Calling cached Gemini API...")
+        raw = _cached_gemini_call(
+            model_name=model.model_name,
+            prompt_hash=prompt_hash,
+            image_hash=image_hash,
+            temperature=temperature,
+            max_retries=max_retries,
+        )
+
+        # Log cache performance
+        cache_info_after = _cached_gemini_call.cache_info()
+        if cache_info_after.hits > cache_info_before.hits:
+            LOGGER.info("✓ Cache HIT for prompt hash %s", prompt_hash[:16])
+        else:
+            LOGGER.info("✗ Cache MISS for prompt hash %s", prompt_hash[:16])
+
+        # Log response
+        LOGGER.info("Gemini response length: %d characters", len(raw))
+        LOGGER.info("First 500 chars of response:\n%s\n...(truncated)", raw[:500])
+
+        # Save full response to debug directory
+        if debug_dir:
+            response_file = debug_path / f"{tag}_response_{int(time.time())}.txt"
+            _dump(f"=== RESPONSE FOR {tag.upper()} ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\nHash: {prompt_hash}\n{'='*80}\n\n{raw}",
+                  response_file)
+            LOGGER.info("Full response saved to: %s", response_file)

-[6 removed lines not preserved in this rendering]
+        # Remove common Markdown fences more carefully
+        if raw.startswith("```json"):
+            raw = raw[7:].strip()  # Remove ```json
+        elif raw.startswith("```"):
+            raw = raw[3:].strip()  # Remove ```
+
+        if raw.endswith("```"):
+            raw = raw[:-3].strip()  # Remove trailing ```
+
+
+        # Simple JSON parsing approach
+        # Try direct parsing first
+        LOGGER.debug(f"Raw JSON length: {len(raw)}")
+        LOGGER.debug(f"Raw JSON first 200 chars: {raw[:200]}")
+        LOGGER.debug(f"Raw JSON last 200 chars: {raw[-200:]}")
+
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError as e:
+            LOGGER.error(f"JSON parsing failed at position {e.pos}: {e}")
+            LOGGER.error(f"Character at error: {repr(raw[e.pos] if e.pos < len(raw) else 'END')}")
+            LOGGER.error(f"Context: {repr(raw[max(0, e.pos-20):e.pos+20])}")
+
+            # Count braces and quotes for debugging
+            open_braces = raw.count('{')
+            close_braces = raw.count('}')
+            quotes = raw.count('"')
+            LOGGER.error(f"Braces: {open_braces} open, {close_braces} close. Quotes: {quotes}")
+
+            # If that fails, try to extract JSON from the response using a simpler method
             try:
-[4 removed lines not preserved in this rendering]
-            json_start = -1
-            json_end = -1
-            bracket_stack = []
-            in_string = False
-            escape_next = False
+                # Look for the JSON object start and end
+                start_idx = raw.find('{')
+                if start_idx == -1:
+                    raise json.JSONDecodeError("No JSON object found", raw, 0)

-[4 removed lines not preserved in this rendering]
+                # Find the matching closing brace by counting
+                brace_count = 0
+                end_idx = -1
+                for i in range(start_idx, len(raw)):
+                    if raw[i] == '{':
+                        brace_count += 1
+                    elif raw[i] == '}':
+                        brace_count -= 1
+                        if brace_count == 0:
+                            end_idx = i + 1
+                            break
+
+                if end_idx == -1:
+                    raise json.JSONDecodeError("No matching closing brace found", raw, 0)
+
+                json_str = raw[start_idx:end_idx]
+                LOGGER.debug(f"Extracted JSON string: {json_str[:200]}...")
+                parsed = json.loads(json_str)
+
+            except json.JSONDecodeError:
+                # Final fallback - try to use eval as a last resort (unsafe but functional)
+                try:
+                    # Replace problematic characters and try to parse as Python dict
+                    safe_raw = raw.replace('null', 'None').replace('true', 'True').replace('false', 'False')
+                    start_idx = safe_raw.find('{')
+                    if start_idx == -1:
+                        raise ValueError("No dict found")

-[7 removed lines not preserved in this rendering]
+                    brace_count = 0
+                    end_idx = -1
+                    for i in range(start_idx, len(safe_raw)):
+                        if safe_raw[i] == '{':
+                            brace_count += 1
+                        elif safe_raw[i] == '}':
+                            brace_count -= 1
+                            if brace_count == 0:
+                                end_idx = i + 1
+                                break

-            if
-
+                    if end_idx == -1:
+                        raise ValueError("No matching closing brace found")

-[7 removed lines not preserved in this rendering]
-                if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
-                    if not bracket_stack:  # Found complete JSON
-                        json_end = i + 1
-                        break
-
-            if json_start >= 0 and json_end > json_start:
-                # Extract the JSON portion
-                json_str = raw[json_start:json_end]
-                parsed = json.loads(json_str)
-            else:
-                # Look for simple [] in the response
+                    dict_str = safe_raw[start_idx:end_idx]
+                    parsed = eval(dict_str)  # This is unsafe but we trust our own generated content
+                    LOGGER.warning("Used eval() fallback for JSON parsing")
+
+                except Exception:
+                    # If all else fails, return empty dict
+                    LOGGER.error("All JSON parsing methods failed")
                    if '[]' in raw:
                        parsed = []
                    else:
-                        # No JSON structure found, re-raise the original error
                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
-[6 removed lines not preserved in this rendering]
-        )
-        if attempt == max_retries:
-            raise
-        time.sleep(2 ** attempt)
+
+        LOGGER.info("Successfully parsed JSON response")
+        return parsed
+    except Exception as exc:
+        LOGGER.error("Cached Gemini call failed: %s", exc)
+        raise


 ###############################################################################
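The fallback parser above recovers a JSON object from chatty model output by scanning from the first `{` and counting braces. Standalone, the technique looks like the sketch below; note that a `}` inside a JSON string would fool the counter, a trade-off the real code accepts before resorting to its final `eval()` fallback.

```python
import json

def extract_json_object(raw: str) -> dict:
    """Return the first balanced {...} object embedded in raw text."""
    start = raw.find('{')
    if start == -1:
        raise json.JSONDecodeError("No JSON object found", raw, 0)
    depth = 0
    for i in range(start, len(raw)):
        if raw[i] == '{':
            depth += 1
        elif raw[i] == '}':
            depth -= 1
            if depth == 0:
                return json.loads(raw[start:i + 1])
    raise json.JSONDecodeError("No matching closing brace", raw, 0)

print(extract_json_object('Sure! Here is the data: {"yield": 82.5} Hope that helps.'))
# {'yield': 82.5}
```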
@@ -248,14 +431,14 @@ def generate_json_with_retry(

 PROMPT_FIND_LOCATIONS = dedent("""
 You are an expert reader of protein engineering manuscripts.
-Given the following article captions and section titles, identify
+Given the following article captions and section titles, identify most promising locations
 (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
-activity, etc.) for enzyme variants.
+activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.

 IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
 performance data locations. Pay careful attention to:
 - The caption text to identify which campaign/lineage the data is for
-- Enzyme name prefixes
+- Enzyme name prefixes that indicate different campaigns
 - Different substrate/product types mentioned in captions

 Respond with a JSON array where each element contains:
@@ -267,8 +450,10 @@ Respond with a JSON array where each element contains:
 - "lineage_hint": any indication of which enzyme group this data is for (or null)
 - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)

-Tables are preferred over figures
-
+Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information.
+Do not include too much sources, just return 2 or 3 sources.
+Adjust confidence comparing all locations you will be returning, only rank figure the highest when you are absolutely certain table won't contain complete information.
+When returning confidence scores, be more accurate and avoid scores that are too close together.
 Respond ONLY with **minified JSON**. NO markdown fences.

 Example:
@@ -280,7 +465,8 @@ You are given either (a) the PNG image of a figure panel, or (b) the caption /
 text excerpt that contains numeric reaction performance data for an enzyme.

 Extract ONLY the performance metrics, NOT substrate/product names or reaction conditions.
-
+
+Return a JSON object with the following keys (use **null** only if the value is not mentioned at all):
 * "yield" - yield as percentage with ONE decimal place precision
 * "ttn" - turnover number (total turnovers)
 * "ton" - turnover number if TTN not available
@@ -296,45 +482,66 @@ IMPORTANT:
 - Do NOT extract substrate/product names - these will come from SI
 - Do NOT extract reaction conditions (temperature, pH, time, solvent)
 - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
+- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)

 Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
 """)

 PROMPT_EXTRACT_FIGURE_METRICS_BATCH = dedent("""
-[3 removed lines not preserved in this rendering]
+STEP 1: First, identify ALL X-axis labels in the figure
+- Read each X-axis label from left to right
+- List exactly what text appears under each bar/data point
+- Note: Labels may be abbreviated or use different naming conventions
+
+STEP 2: Match X-axis labels to target enzyme variants
+- Compare each X-axis label against the target enzyme list below
+- Look for partial matches, abbreviations, or similar naming patterns
+- If an X-axis label doesn't match any target enzyme, still include it for completeness
+
+STEP 3: Identify Y-axis scales and what they measure
+- Look at the Y-axis labels and tick marks to understand what each axis measures
+- If there are multiple Y-axes (left and right), read the axis labels and units
+- Note the minimum and maximum values on each axis scale
+- Identify which visual elements (bars, dots, lines) correspond to which axis
+
+STEP 4: Extract values for each matched variant
+- For each X-axis position, identify which visual elements belong to that position
+- LEFT Y-axis (bars): Measure bar height against the left scale by reading tick marks
+- RIGHT Y-axis (dots): Measure dot position against the right scale by reading tick marks
+- CRITICAL: Read actual scale values from the axis labels and tick marks
+- Verify: taller bars should have higher values, higher dots should have higher values
+
+Target enzymes to find and extract:
 {enzyme_names}

-[28 removed lines (earlier prompt body) not preserved in this rendering; only stubs such as "1.", "2.", and '* "' survive, plus the closing line below]
-Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
+Instructions:
+1. First, list ALL X-axis labels you can see in the figure
+2. Match each X-axis label to the target enzyme variants
+3. For matched variants, extract both bar heights (left Y-axis) and dot positions (right Y-axis)
+4. Return data only for variants that have clear X-axis labels and are matched to targets
+
+Return JSON with the identified enzyme variant names as keys containing:
+* "x_axis_label" - the exact text from the X-axis for this variant
+* "yield" - percentage from left Y-axis bar height measurement
+* "ttn" - turnover number from right Y-axis dot position measurement
+* "ton" - if TTN not available
+* "selectivity" - if shown
+* "conversion" - if different from yield
+* "tof" - if provided
+* "activity" - if provided
+* "other_metrics" - other metrics
+* "notes" - REQUIRED: Describe the X-axis label, bar position, and dot position (e.g., "X-axis shows P411-CIS, leftmost bar is very short, dot is at bottom")
+
+CRITICAL: Return ONLY valid JSON in this exact format:
+{{"enzyme_name": {{"x_axis_label": "label", "yield": number, "ttn": number, "notes": "description"}}}}
+
+Rules:
+- Use double quotes for all strings
+- No markdown, no commentary, no explanations
+- All values must be properly formatted
+- Ensure JSON is complete and valid
+- Do not truncate or cut off the response
+- IMPORTANT: When extracting data, prioritize the most complete source that shows data for ALL variants. If there are conflicting values between different sources (e.g., bar graph vs text values), use the source that provides complete data for all target enzymes and ignore partial or conflicting values from other sources
 """)

 # Removed substrate scope IUPAC extraction - now handled in model reaction only
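The batch prompt above demands minified JSON keyed by enzyme variant name. Assuming the model complies, the response flattens into per-enzyme rows in a couple of lines; the data below is invented purely for illustration.

```python
import json

raw = ('{"P411-CIS": {"x_axis_label": "P411-CIS", "yield": 71.0, "ttn": 2540, '
       '"notes": "tall bar, dot near top of right axis"}}')

rows = [
    {"enzyme": name, **metrics}          # one flat record per variant
    for name, metrics in json.loads(raw).items()
]
print(rows[0]["enzyme"], rows[0]["yield"])  # P411-CIS 71.0
```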
@@ -519,13 +726,25 @@ class ReactionExtractor:
     _TAB_RE = re.compile(r"tab(?:le)?\s+s?\d+[a-z]?", re.I)

     def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
-                 campaign_filter: Optional[str] = None):
+                 campaign_filter: Optional[str] = None, all_campaigns: Optional[List[str]] = None):
         self.manuscript = manuscript
         self.si = si
         self.cfg = cfg
         self.model = get_model(cfg)
         self.debug_dir = debug_dir
         self.campaign_filter = campaign_filter  # Filter for specific campaign
+        self.all_campaigns = all_campaigns or []  # List of all campaigns for context
+
+        # Cache for extracted figures to avoid redundant extractions (bounded to prevent memory leaks)
+        self._figure_cache = LRUCache(maxsize=100)  # Figures are large, so smaller cache
+        self._model_reaction_locations_cache = LRUCache(maxsize=50)
+
+        # Cache for compound mappings to avoid repeated API calls (bounded to prevent memory leaks)
+        self._compound_mapping_cache = LRUCache(maxsize=1000)
+        self._compound_mapping_text_cache = LRUCache(maxsize=500)  # Cache text extractions too
+
+        # Cache for reaction locations to avoid repeated API calls (bounded to prevent memory leaks)
+        self._reaction_locations_cache = LRUCache(maxsize=50)

         # Create debug directory if specified
         if self.debug_dir:
@@ -551,23 +770,40 @@ class ReactionExtractor:
     # ------------------------------------------------------------------

     def _collect_captions_and_titles(self) -> str:
-        #
-        # This catches all variations
-
+        # Pattern to match Table or Figure with optional leading whitespace
+        # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
+        # Also handles cases where there's whitespace before the caption
+        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []

         # Collect from all pages
         all_text = "\n".join(self.all_pages)

-        # Find all figure/table captions
+        # Find all figure/table captions with more context
         for match in cap_pattern.finditer(all_text):
             caption_start = match.start()
-
+
+            # Include some context before the caption (up to 200 chars)
+            context_start = max(0, caption_start - 200)
+            # Find the start of the sentence/paragraph before the caption
+            context_text = all_text[context_start:caption_start]
+            last_period = context_text.rfind('.')
+            if last_period != -1:
+                context_start = context_start + last_period + 1
+
+            # For tables, include much more content after the caption to show actual table data
+            # For figures, keep the original limit
+            is_table = match.group(1).lower() == 'table'
+            max_chars = 5000 if is_table else 3000
+
+            # Get up to max_chars or until double newline
             caption_end = all_text.find("\n\n", caption_start)
-            if caption_end == -1 or caption_end - caption_start >
-                caption_end = caption_start +
-
-
+            if caption_end == -1 or caption_end - caption_start > max_chars:
+                caption_end = caption_start + max_chars
+
+            # Include the context and full caption with table content
+            full_caption = all_text[context_start:caption_end].strip()
+            captions.append(full_caption)

         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
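The caption harvester above now keeps a window of text after each caption, larger for tables so the rows beneath a caption travel with it to the model. A compact, runnable sketch of the same loop:

```python
import re

cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)

def collect_captions(all_text: str) -> list[str]:
    captions = []
    for match in cap_pattern.finditer(all_text):
        start = match.start()
        is_table = match.group(1).lower() == 'table'
        max_chars = 5000 if is_table else 3000   # tables carry their data rows
        end = all_text.find("\n\n", start)
        if end == -1 or end - start > max_chars:
            end = start + max_chars
        captions.append(all_text[start:end].strip())
    return captions

text = "Table 1. Yields for evolved variants\nP411 82%\n\nFigure 2. Lineage map"
print(collect_captions(text))
# ['Table 1. Yields for evolved variants\nP411 82%', 'Figure 2. Lineage map']
```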
@@ -584,15 +820,47 @@ class ReactionExtractor:

     def find_reaction_locations(self) -> List[Dict[str, Any]]:
         """Find all locations containing reaction performance data."""
-        #
+        # Create cache key based on campaign filter
+        cache_key = f"locations_{self.campaign_filter or 'all'}"
+
+        # Check cache first
+        cached_result = self._reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+            return cached_result
+
+        # Add campaign context - always provide context to help model understanding
         campaign_context = ""
         if self.campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+Be extremely careful to only extract data for the {self.campaign_filter} campaign.
+"""
+
             campaign_context = f"""
-IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
-Only return locations that contain data for this specific campaign.
-Ignore locations that contain data for other campaigns.
+IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
+Only return locations that contain data for this specific campaign.
+Ignore locations that contain data for other campaigns.
+{campaigns_warning}

-"""
+"""
+        else:
+            # Even for single campaigns, provide context about what to look for
+            campaign_context = f"""
+IMPORTANT: You are looking for performance data showing enzyme evolution progression.
+Look for locations that contain actual performance metrics (yield, TTN, TON, activity, etc.)
+for multiple enzyme variants, not just mutation lists or method descriptions.
+
+Tables may only contain mutation information without performance data - check the actual
+table content below the caption to verify if performance metrics are present.
+Figures with evolutionary lineage data often contain the actual performance matrix.
+
+"""

         prompt = campaign_context + PROMPT_FIND_LOCATIONS + "\n\n" + self._collect_captions_and_titles()
         try:
@@ -604,13 +872,20 @@ Ignore locations that contain data for other campaigns.
                 tag="find_locations"
             )
             # Handle both single dict (backwards compatibility) and list
+            result = []
             if isinstance(data, dict):
-
+                result = [data]
             elif isinstance(data, list):
-
+                result = data
             else:
                 LOGGER.error("Expected list or dict from Gemini, got: %s", type(data))
-
+                result = []
+
+            # Cache the result
+            self._reaction_locations_cache.put(cache_key, result)
+            LOGGER.info("Cached reaction locations for campaign: %s", self.campaign_filter or 'all')
+
+            return result
         except Exception as e:
             LOGGER.error("Failed to find reaction locations: %s", e)
             return []
@@ -686,13 +961,27 @@ Ignore locations that contain data for other campaigns.
         }

     def find_lineage_model_reaction(self, location: str, group_context: str, model_reaction_locations: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
-        """Find the model reaction for a specific lineage group.
+        """Find the model reaction for a specific lineage group.
+        Returns early if no relevant text is found to avoid unnecessary API calls."""
+
         # Gather relevant text near this location
         page_text = self._page_with_reference(location) or ""

+        # Early exit if no text found for this location
+        if not page_text or len(page_text.strip()) < 100:
+            LOGGER.info("No sufficient text found for location %s, skipping lineage-specific extraction", location)
+            return {}
+
         # Also check manuscript introduction for model reaction info
         intro_text = "\n\n".join(self.ms_pages[:3]) if self.ms_pages else ""

+        # Quick relevance check - look for reaction-related keywords
+        reaction_keywords = ["substrate", "product", "reaction", "compound", "synthesis", "procedure", "method"]
+        combined_text = (page_text + intro_text).lower()
+        if not any(keyword in combined_text for keyword in reaction_keywords):
+            LOGGER.info("No reaction-related keywords found for location %s, skipping lineage extraction", location)
+            return {}
+
         # Build the prompt with location and context
         prompt = PROMPT_FIND_LINEAGE_MODEL_REACTION.format(
             location=location,
|
|
702
991
|
prompt += f"\n\nManuscript introduction:\n{intro_text[:3000]}"
|
703
992
|
|
704
993
|
# If we have model reaction locations, include text from those locations too
|
994
|
+
text_added = False
|
705
995
|
if model_reaction_locations:
|
706
996
|
# Add text from model reaction location
|
707
997
|
if model_reaction_locations.get("model_reaction_location", {}).get("location"):
|
@@ -709,6 +999,7 @@ Ignore locations that contain data for other campaigns.
|
|
709
999
|
model_text = self._get_text_around_location(model_loc)
|
710
1000
|
if model_text:
|
711
1001
|
prompt += f"\n\nText from {model_loc} (potential model reaction location):\n{model_text[:3000]}"
|
1002
|
+
text_added = True
|
712
1003
|
|
713
1004
|
# Add text from conditions location (often contains reaction details)
|
714
1005
|
if model_reaction_locations.get("conditions_location", {}).get("location"):
|
@@ -716,8 +1007,15 @@ Ignore locations that contain data for other campaigns.
|
|
716
1007
|
cond_text = self._get_text_around_location(cond_loc)
|
717
1008
|
if cond_text:
|
718
1009
|
prompt += f"\n\nText from {cond_loc} (reaction conditions):\n{cond_text[:3000]}"
|
1010
|
+
text_added = True
|
1011
|
+
|
1012
|
+
# If we didn't find any model reaction locations and the page text is sparse, skip
|
1013
|
+
if not text_added and len(page_text.strip()) < 500:
|
1014
|
+
LOGGER.info("Insufficient context for lineage model reaction extraction at %s", location)
|
1015
|
+
return {}
|
719
1016
|
|
720
1017
|
try:
|
1018
|
+
LOGGER.info("Attempting lineage-specific model reaction extraction for %s", location)
|
721
1019
|
data = generate_json_with_retry(
|
722
1020
|
self.model,
|
723
1021
|
prompt,
|
@@ -725,7 +1023,15 @@ Ignore locations that contain data for other campaigns.
|
|
725
1023
|
debug_dir=self.debug_dir,
|
726
1024
|
tag=f"lineage_model_reaction_{location.replace(' ', '_')}"
|
727
1025
|
)
|
728
|
-
|
1026
|
+
|
1027
|
+
# Validate the response has useful information
|
1028
|
+
if isinstance(data, dict) and (data.get('substrate_ids') or data.get('product_ids')):
|
1029
|
+
LOGGER.info("Lineage model reaction extraction successful for %s", location)
|
1030
|
+
return data
|
1031
|
+
else:
|
1032
|
+
LOGGER.info("Lineage model reaction extraction returned empty results for %s", location)
|
1033
|
+
return {}
|
1034
|
+
|
729
1035
|
except Exception as e:
|
730
1036
|
LOGGER.error("Failed to find model reaction for lineage at %s: %s", location, e)
|
731
1037
|
return {}
|
@@ -777,67 +1083,174 @@ Ignore locations that contain data for other campaigns.
            If False, extracts the entire page (useful for tables).
         Returns a base64-encoded PNG or None."""

+        # Check cache first
+        cache_key = f"{ref}_{extract_figure_only}"
+        cached_result = self._figure_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.debug("Using cached figure for %s", ref)
+            return cached_result
+
         # For table extraction, use multi-page approach
         if not extract_figure_only:
             pages_with_ref = self._find_pages_with_reference(ref)
             if pages_with_ref:
                 LOGGER.debug(f"Found {len(pages_with_ref)} pages containing {ref}")
-                return self._extract_multiple_pages_png(pages_with_ref)
+                return self._extract_multiple_pages_png(pages_with_ref, ref)
             return None

-        # For figure extraction, search both documents
+        # For figure extraction, search both documents for actual figure captions
         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
-
-
-                #
-
-
+
+                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
+                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
+                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                main_figure_num = re.match(r'^(\d+)', figure_num)
+                if main_figure_num:
+                    main_figure_num = main_figure_num.group(1)
+                else:
+                    main_figure_num = figure_num
+
+                caption_patterns = [
+                    rf"^Figure\s+{re.escape(main_figure_num)}\.",  # "Figure 1."
+                    rf"^Figure\s+{re.escape(main_figure_num)}:",  # "Figure 1:"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
+                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
+                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
+                ]
+
+                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
+                             main_figure_num, ref, caption_patterns)
+
+                caption_found = False
+                cap_rect = None
+
+                for pattern in caption_patterns:
+                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
+                    if matches:
+                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
+                        # Found actual figure caption, get its position
+                        caption_text = matches.group(0)
+                        text_instances = page.search_for(caption_text, quads=False)
+                        if text_instances:
+                            cap_rect = text_instances[0]
+                            caption_found = True
+                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
+                            break
+
+                if not caption_found:
+                    # Debug: show what figure-related text is actually on this page
+                    figure_mentions = [line.strip() for line in page_text.split('\n')
+                                       if 'figure' in line.lower() and main_figure_num.lower() in line.lower()]
+                    if figure_mentions:
+                        LOGGER.debug("Page %d has figure mentions but no caption match: %s",
+                                     page_number, figure_mentions[:3])
                     continue
-                cap_rect = text_instances[0]  # first match

                 if extract_figure_only:
-                    #
-[14 removed lines not preserved in this rendering]
+                    # Extract only the area above the caption (the actual figure)
+                    # This excludes caption text and focuses on visual elements
+                    LOGGER.info("Extracting figure area above caption for %s", ref)
+
+                    # Get the page dimensions
+                    page_rect = page.rect
+
+                    # Extract the area above the caption
+                    if cap_rect:
+                        # Extract from top of page to top of caption
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, cap_rect.y0)
+                        LOGGER.debug("Extracting figure area: %s (caption at y=%f)", figure_rect, cap_rect.y0)
+                    else:
+                        # If no caption found, use top 80% of page
+                        figure_rect = fitz.Rect(0, 0, page_rect.width, page_rect.height * 0.8)
+                        LOGGER.debug("No caption found, using top 80% of page: %s", figure_rect)
+
+                    # Extract the figure area only
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat, clip=figure_rect)
+                    pix = self._ensure_rgb_pixmap(pix)
+                    img_bytes = pix.tobytes("png")
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"figure_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved figure page to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
                 else:
                     # Extract the entire page as an image
-                    mat = fitz.Matrix(
+                    mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for better quality
                     pix = page.get_pixmap(matrix=mat)
                     pix = self._ensure_rgb_pixmap(pix)
                     img_bytes = pix.tobytes("png")
-
+
+                    # Save PNG to debug directory if available
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                        with open(png_file, 'wb') as f:
+                            f.write(img_bytes)
+                        LOGGER.info("Saved page image to: %s", png_file)
+
+                    result = b64encode(img_bytes).decode()
+                    # Cache the result
+                    self._figure_cache.put(cache_key, result)
+                    return result
         return None

     def _find_pages_with_reference(self, ref: str) -> List[Tuple[fitz.Document, int]]:
         """Find all pages containing the reference across documents.
+        Prioritizes pages with actual captions over just references.
         Returns list of (document, page_number) tuples."""
         pages_found = []
+        caption_pages = []

         for doc in filter(None, [self.ms_doc, self.si_doc]):
             for page_number in range(doc.page_count):
                 page = doc.load_page(page_number)
                 page_text = page.get_text()
+
+                # Check for actual figure caption first
+                if ref.lower().startswith('figure'):
+                    figure_num = ref.replace('Figure ', '').replace('figure ', '')
+
+                    # Extract main figure number from subfigure (e.g., "1C" -> "1")
+                    main_figure_num = re.match(r'^(\d+)', figure_num)
+                    if main_figure_num:
+                        main_figure_num = main_figure_num.group(1)
+                    else:
+                        main_figure_num = figure_num
+
+                    caption_patterns = [
+                        rf"^Figure\s+{re.escape(main_figure_num)}\.",
+                        rf"^Figure\s+{re.escape(main_figure_num)}:",
+                        rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]"
+                    ]
+
+                    for pattern in caption_patterns:
+                        if re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE):
+                            caption_pages.append((doc, page_number))
+                            break
+
+                # Fallback to any mention of the reference
                 if ref.lower() in page_text.lower():
                     pages_found.append((doc, page_number))
-
-
+
+        # Return caption pages first, then other pages
+        return caption_pages + [p for p in pages_found if p not in caption_pages]

-    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]]) -> Optional[str]:
+    def _extract_multiple_pages_png(self, pages: List[Tuple[fitz.Document, int]], ref: str = "unknown") -> Optional[str]:
         """Extract multiple pages as a combined PNG image."""
         if not pages:
             return None
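The figure-cropping logic above is the heart of the new extraction path: find the caption's bounding box with PyMuPDF, then rasterize only the region above it, where the artwork sits. A condensed standalone sketch; the file name and caption string are placeholders.

```python
import fitz  # PyMuPDF

doc = fitz.open("manuscript.pdf")        # placeholder path
page = doc.load_page(0)
hits = page.search_for("Figure 1.")      # rectangles where the caption text occurs
# Crop from the top of the page down to the caption; fall back to the top 80%.
clip = (fitz.Rect(0, 0, page.rect.width, hits[0].y0)
        if hits else
        fitz.Rect(0, 0, page.rect.width, page.rect.height * 0.8))
pix = page.get_pixmap(matrix=fitz.Matrix(5.0, 5.0), clip=clip)  # 5x zoom
pix.save("figure1.png")
```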
@@ -854,12 +1267,7 @@ Ignore locations that contain data for other campaigns.
             pix = self._ensure_rgb_pixmap(pix)
             all_images.append(pix)

-            #
-            if i == len(pages) - 1 and page_num + 1 < doc.page_count:
-                next_pix = doc.load_page(page_num + 1).get_pixmap(matrix=mat)
-                next_pix = self._ensure_rgb_pixmap(next_pix)
-                all_images.append(next_pix)
-                LOGGER.info(f"Added next page: page {page_num + 2}")  # +2 because page numbers are 1-based for users
+            # Only extract the page containing the reference (removed next page logic)

         if not all_images:
             return None
@@ -867,7 +1275,17 @@ Ignore locations that contain data for other campaigns.
         # If only one page, return it directly
         if len(all_images) == 1:
             pix = self._ensure_rgb_pixmap(all_images[0])
-
+            img_bytes = pix.tobytes("png")
+
+            # Save debug file if available
+            if self.debug_dir:
+                timestamp = int(time.time())
+                png_file = self.debug_dir / f"page_{ref.replace(' ', '_')}_{timestamp}.png"
+                with open(png_file, 'wb') as f:
+                    f.write(img_bytes)
+                LOGGER.info("Saved multi-page image to: %s", png_file)
+
+            return b64encode(img_bytes).decode()

         # Combine multiple pages vertically
         if not all_images:
@@ -914,7 +1332,7 @@ Ignore locations that contain data for other campaigns.
             y_offset += pix.height * scale

         # Convert the page to a pixmap
-        mat = fitz.Matrix(
+        mat = fitz.Matrix(5.0, 5.0)  # 5x zoom for quality
         combined_pix = page.get_pixmap(matrix=mat)
         combined_pix = self._ensure_rgb_pixmap(combined_pix)

@@ -922,6 +1340,14 @@ Ignore locations that contain data for other campaigns.
         img_bytes = combined_pix.tobytes("png")
         output_doc.close()

+        # Save debug file if available
+        if self.debug_dir:
+            timestamp = int(time.time())
+            png_file = self.debug_dir / f"combined_pages_{ref.replace(' ', '_')}_{timestamp}.png"
+            with open(png_file, 'wb') as f:
+                f.write(img_bytes)
+            LOGGER.info("Saved combined multi-page image to: %s", png_file)
+
         return b64encode(img_bytes).decode()

     # ------------------------------------------------------------------
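For multi-page tables, the surrounding code stitches page renders vertically before encoding. A simplified sketch of that stitching with PyMuPDF, assuming modest page counts (it ignores the scaling and page-size limits the real code has to handle):

```python
import fitz  # PyMuPDF

def stack_pages_png(doc: fitz.Document, page_numbers: list[int]) -> bytes:
    """Render the given pages and stamp them top-to-bottom onto one tall page."""
    mat = fitz.Matrix(2.0, 2.0)  # zoom kept modest here
    pixmaps = [doc.load_page(n).get_pixmap(matrix=mat) for n in page_numbers]
    width = max(p.width for p in pixmaps)
    height = sum(p.height for p in pixmaps)
    out = fitz.open()                               # new empty PDF
    page = out.new_page(width=width, height=height)
    y = 0.0
    for p in pixmaps:
        page.insert_image(fitz.Rect(0, y, p.width, y + p.height), pixmap=p)
        y += p.height
    png = page.get_pixmap().tobytes("png")
    out.close()
    return png
```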
@@ -930,14 +1356,19 @@ Ignore locations that contain data for other campaigns.

     def _validate_location_exists(self, ref: str) -> bool:
         """Verify that the referenced location actually exists in the document."""
-        # Search for the actual reference in
-[7 removed lines not preserved in this rendering]
+        # Search for the actual reference in both manuscript and SI documents
+        docs_to_check = [self.ms_doc]
+        if self.si_doc:
+            docs_to_check.append(self.si_doc)
+
+        for doc in docs_to_check:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+
+                # Look for table references like "Table 1", "Table S1", etc.
+                if re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE):
+                    return True

         return False

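The validation above reduces to a word-boundary regex search, which is what keeps "Table 1" from matching "Table 12":

```python
import re

def location_exists(ref: str, text: str) -> bool:
    return re.search(rf'\b{re.escape(ref)}\b', text, re.IGNORECASE) is not None

print(location_exists("Table 1", "as shown in Table 1."))   # True
print(location_exists("Table 1", "as shown in Table 12."))  # False
```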
@@ -1010,23 +1441,55 @@ Ignore locations that contain data for other campaigns.
             LOGGER.debug("No page image found for %s - using full page text", ref)
             snippet = self._extract_table_context(ref)
         elif self._FIG_RE.search(ref_lc):
-            # For figures, extract just the figure image
+            # For figures, extract just the figure image (same logic as compound mapping)
             image_b64 = self._extract_page_png(ref, extract_figure_only=True)
             if not image_b64:
                 LOGGER.debug("No figure image found for %s - using caption text", ref)
                 snippet = self._extract_figure_caption(ref)
+            else:
+                # If figure is found, ignore text information - use image only
+                snippet = ""
         else:
             snippet = self._page_with_reference(ref) or ""

-        #
-        if
+        # For figures with images, skip text validation and proceed with image extraction
+        if image_b64 and self._FIG_RE.search(ref_lc):
+            LOGGER.info("Using figure image for %s - ignoring text context", ref)
+        elif not image_b64 and not self._validate_context(snippet, enzyme_list, ref):
             return []

-
+        # Create enhanced enzyme descriptions with parent/mutation context
+        if hasattr(self, 'enzyme_df') and self.enzyme_df is not None:
+            enzyme_descriptions = []
+            for enzyme in enzyme_list:
+                # Find this enzyme in the dataframe
+                enzyme_row = None
+                if 'enzyme_id' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme_id'] == enzyme]
+                elif 'enzyme' in self.enzyme_df.columns:
+                    enzyme_row = self.enzyme_df[self.enzyme_df['enzyme'] == enzyme]
+
+                if enzyme_row is not None and len(enzyme_row) > 0:
+                    row = enzyme_row.iloc[0]
+                    parent = row.get('parent_enzyme_id', '')
+                    mutations = row.get('mutations', '')
+
+                    desc = f"- {enzyme}"
+                    if parent and str(parent).strip() and str(parent) != 'nan':
+                        desc += f" (parent: {parent})"
+                    if mutations and str(mutations).strip() and str(mutations) != 'nan':
+                        desc += f" (mutations: {mutations})"
+                    enzyme_descriptions.append(desc)
+                else:
+                    enzyme_descriptions.append(f"- {enzyme}")
+            enzyme_names = "\n".join(enzyme_descriptions)
+        else:
+            enzyme_names = "\n".join([f"- {enzyme}" for enzyme in enzyme_list])

         if image_b64:
             # Use batch extraction prompt for image analysis
-
+            location_context = f"\n\nIMPORTANT: You are extracting data from {ref}, which has been identified as the PRIMARY LOCATION containing the most reliable performance data for these enzymes.\n"
+            prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
             LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
             tag = f"extract_metrics_batch_vision"
         else:
|
|
1048
1511
|
|
1049
1512
|
# Validate response has meaningful data
|
1050
1513
|
if not self._validate_response(data, enzyme_list, ref):
|
1051
|
-
|
1514
|
+
# If figure extraction failed and we have a figure, try falling back to text
|
1515
|
+
if image_b64 and self._FIG_RE.search(ref_lc):
|
1516
|
+
LOGGER.warning("Figure extraction from %s returned empty results - falling back to text", ref)
|
1517
|
+
snippet = self._extract_figure_caption(ref)
|
1518
|
+
if self._validate_context(snippet, enzyme_list, ref):
|
1519
|
+
# Retry with text extraction
|
1520
|
+
format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
|
1521
|
+
prompt = campaign_context + PROMPT_EXTRACT_METRICS + f"\n\nExtract performance data for ALL these enzyme variants:\n{enzyme_names}\n\nReturn a JSON object with enzyme names as keys, each containing the metrics.\nExample format: {format_example}\n\n=== CONTEXT ===\n" + snippet[:4000]
|
1522
|
+
LOGGER.info("Gemini: retrying with text extraction for %d enzymes from %s…", len(enzyme_list), ref)
|
1523
|
+
|
1524
|
+
data = generate_json_with_retry(
|
1525
|
+
self.model,
|
1526
|
+
prompt,
|
1527
|
+
temperature=self.cfg.extract_temperature,
|
1528
|
+
debug_dir=self.debug_dir,
|
1529
|
+
tag=f"extract_metrics_batch_text_fallback",
|
1530
|
+
image_b64=None
|
1531
|
+
)
|
1532
|
+
|
1533
|
+
# Validate the text extraction response
|
1534
|
+
if not self._validate_response(data, enzyme_list, ref):
|
1535
|
+
return []
|
1536
|
+
else:
|
1537
|
+
return []
|
1538
|
+
else:
|
1539
|
+
return []
|
1052
1540
|
|
1053
1541
|
# Handle the response format - expecting a dict with enzyme names as keys
|
1054
1542
|
results = []
|
@@ -1100,6 +1588,15 @@ Ignore locations that contain data for other campaigns.

     def find_model_reaction_locations(self, enzyme_variants: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
         """Find locations for model reaction scheme, conditions, and IUPAC names."""
+        # Create cache key based on campaign filter and enzyme variants
+        cache_key = f"{self.campaign_filter}_{hash(tuple(sorted(enzyme_variants)) if enzyme_variants else ())}"
+
+        # Check cache first
+        cached_result = self._model_reaction_locations_cache.get(cache_key)
+        if cached_result is not None:
+            LOGGER.info("Using cached model reaction locations for campaign: %s", self.campaign_filter)
+            return cached_result
+
         # Collect all text including section titles, captions, and schemes
         all_text = self._collect_captions_and_titles()

@@ -1110,13 +1607,25 @@ Ignore locations that contain data for other campaigns.
         # Add enzyme context if provided
         enzyme_context = ""
         if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
             enzyme_context = f"""
 IMPORTANT CONTEXT:
 You are looking for the model reaction used specifically for these enzyme variants:
 {', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}

 These variants belong to campaign: {self.campaign_filter}
-
+{campaigns_context}
 Focus on finding the model reaction that was used to evaluate THESE specific variants.
 Different campaigns may use different model reactions.
 """
|
|
1134
1643
|
if not isinstance(data, dict):
|
1135
1644
|
LOGGER.error("Expected dict from Gemini, got: %s", type(data))
|
1136
1645
|
return None
|
1646
|
+
|
1647
|
+
# Cache the result
|
1648
|
+
self._model_reaction_locations_cache.put(cache_key, data)
|
1649
|
+
LOGGER.info("Cached model reaction locations for campaign: %s", self.campaign_filter)
|
1650
|
+
|
1137
1651
|
return data
|
1138
1652
|
except Exception as e:
|
1139
1653
|
LOGGER.error("Failed to find model reaction locations: %s", e)
|
@@ -1232,9 +1746,12 @@ Different campaigns may use different model reactions.
         extraction_text: str,
         compound_ids: List[str] = None,
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Helper function to extract compound mappings from provided text."""
         prompt = PROMPT_COMPOUND_MAPPING
+        if campaign_filter:
+            prompt += f"\n\nIMPORTANT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system."
         if compound_ids:
             prompt += "\n\nCOMPOUNDS TO MAP: " + ", ".join(sorted(compound_ids))
         prompt += "\n\nTEXT:\n" + extraction_text
@@ -1282,6 +1799,7 @@ Different campaigns may use different model reactions.
         compound_ids: List[str],
         figure_images: Dict[str, str],
         tag_suffix: str = "",
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
         """Extract compound mappings using multimodal approach with figures."""
         # Enhanced prompt for figure-based extraction
@@ -1293,8 +1811,26 @@ Use your best knowledge, Look carefully in:
 1. The chemical structures shown in figures - infer IUPAC names from drawn structures
 2. Figure captions that may define compounds
 3. Text that refers to these compound numbers
-4. Reaction schemes showing transformations
+4. Reaction schemes showing transformations"""
+
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            prompt += f"""
+
+IMPORTANT CAMPAIGN CONTEXT: Focus on compound information relevant to the {campaign_filter} campaign/reaction system.
+{campaigns_warning}
+Different campaigns may use different numbering systems for compounds.
+Do NOT include compound information from other campaigns."""
 
+        prompt += """
 
 IMPORTANT:
 - Only provide IUPAC names you can determine from the figures or text
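The added lines splice an optional campaign warning into the compound-mapping prompt before the general instructions resume. A sketch of the same assembly pattern, with hypothetical helper and argument names:

```python
from typing import List, Optional

def build_compound_prompt(base: str,
                          campaign_filter: Optional[str] = None,
                          all_campaigns: Optional[List[str]] = None) -> str:
    prompt = base
    if campaign_filter:
        warning = ""
        if all_campaigns:
            bullets = "\n".join(f"- {c}" for c in all_campaigns)
            warning = f"\nALL CAMPAIGNS IN THIS PAPER:\n{bullets}\n"
        prompt += (
            f"\n\nIMPORTANT CAMPAIGN CONTEXT: focus on the {campaign_filter} campaign."
            + warning
            + "\nDo NOT include compound information from other campaigns."
        )
    return prompt

print(build_compound_prompt("Map compound IDs to IUPAC names.",
                            "campaign_A", ["campaign_A", "campaign_B"]))
```

The diff builds its bullet list with `{chr(10).join(...)}` because f-string expressions could not contain backslashes before Python 3.12; joining outside the f-string, as here, avoids that restriction entirely.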
@@ -1324,9 +1860,10 @@ TEXT FROM MANUSCRIPT:
             for fig_ref, fig_base64 in figure_images.items():
                 try:
                     img_bytes = b64decode(fig_base64)
-                    image
+                    # Format image for Gemini API
+                    image_part = {"mime_type": "image/png", "data": img_bytes}
                     content_parts.append(f"\n[Figure: {fig_ref}]")
-                    content_parts.append(
+                    content_parts.append(image_part)
                     LOGGER.info("Added figure %s to multimodal compound mapping", fig_ref)
                 except Exception as e:
                     LOGGER.warning("Failed to add figure %s: %s", fig_ref, e)
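The fix above replaces a PIL-based image object with the inline-data dict that google-generativeai accepts directly in a content list. A sketch of the interleaved text-and-image parts, assuming PNG figures arrive base64-encoded:

```python
from base64 import b64decode

def build_content_parts(prompt: str, figure_images: dict) -> list:
    """Interleave the prompt, figure labels, and inline image parts."""
    parts = [prompt]
    for fig_ref, fig_base64 in figure_images.items():
        try:
            img_bytes = b64decode(fig_base64)
        except Exception:
            continue  # skip figures that fail to decode
        parts.append(f"\n[Figure: {fig_ref}]")  # label so the model can cite it
        parts.append({"mime_type": "image/png", "data": img_bytes})
    return parts

parts = build_content_parts("Identify compound 1a.", {"Figure 1": "iVBORw0KGgo="})
```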
@@ -1356,6 +1893,21 @@ TEXT FROM MANUSCRIPT:
 
             # Make multimodal API call
             response = self.model.generate_content(content_parts)
+
+            # Track token usage if available
+            try:
+                if hasattr(response, 'usage_metadata'):
+                    input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
             raw_text = response.text.strip()
 
             # Log response
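The token-tracking block is deliberately double-wrapped so that neither a missing `usage_metadata` attribute nor an absent `wrapper` module can abort an extraction. Roughly the same best-effort pattern as a standalone helper (the module path is assumed from the `from .wrapper import` line in the diff):

```python
def record_usage(response, component: str = "reaction_info_extractor") -> None:
    """Best-effort token accounting; must never raise into the caller."""
    try:
        meta = getattr(response, "usage_metadata", None)
        if meta is None:
            return
        input_tokens = getattr(meta, "prompt_token_count", 0)
        output_tokens = getattr(meta, "candidates_token_count", 0)
        if input_tokens or output_tokens:
            try:
                from debase.wrapper import add_token_usage  # optional dependency
            except ImportError:
                return  # wrapper not installed; skip accounting
            add_token_usage(component, input_tokens, output_tokens)
    except Exception:
        pass  # accounting is strictly best-effort
```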
@@ -1402,18 +1954,37 @@ TEXT FROM MANUSCRIPT:
         self,
         compound_ids: List[str],
         initial_sections: List[str] = None,
+        campaign_filter: Optional[str] = None,
     ) -> Dict[str, CompoundMapping]:
-        """Extract compound ID to IUPAC name mappings using
+        """Extract compound ID to IUPAC name mappings using simplified 2-tier strategy.
 
-        1. First attempts extraction from
-        2.
-        3. Uses multimodal figure analysis as final fallback
+        1. First attempts extraction from specific SI sections + 10 manuscript pages
+        2. If compounds missing, uses full manuscript + SI with multimodal figure analysis
         """
         if not compound_ids:
             return {}
 
-
-
+        # Check cache first - return cached results for compounds we've already processed
+        cached_mappings = {}
+        uncached_compound_ids = []
+
+        for cid in compound_ids:
+            # Include campaign filter in cache key to prevent cross-campaign contamination
+            cache_key = f"{campaign_filter}_{cid.lower().strip()}" if campaign_filter else cid.lower().strip()
+            cached_mapping = self._compound_mapping_cache.get(cache_key)
+            if cached_mapping is not None:
+                cached_mappings[cid.lower().strip()] = cached_mapping
+                LOGGER.info("Using cached compound mapping for: %s (campaign: %s)", cid, campaign_filter)
+            else:
+                uncached_compound_ids.append(cid)
+
+        # If all compounds are cached, return immediately
+        if not uncached_compound_ids:
+            LOGGER.info("All %d compounds found in cache, skipping API calls", len(compound_ids))
+            return cached_mappings
+
+        LOGGER.info("Starting adaptive compound mapping for %d uncached compounds: %s",
+                    len(uncached_compound_ids), sorted(uncached_compound_ids))
 
         # Tier 1: Standard sections (manuscript + initial SI sections)
         initial_sections = initial_sections or [
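The cache check added here normalises each compound ID and scopes the key by campaign before deciding which IDs still need an API call. A self-contained sketch of that partitioning step:

```python
from typing import Dict, List, Optional, Tuple

def partition_by_cache(cache: Dict[str, object],
                       compound_ids: List[str],
                       campaign_filter: Optional[str] = None) -> Tuple[Dict[str, object], List[str]]:
    """Split IDs into (already-cached mappings, IDs still needing extraction)."""
    cached: Dict[str, object] = {}
    uncached: List[str] = []
    for cid in compound_ids:
        norm = cid.lower().strip()
        key = f"{campaign_filter}_{norm}" if campaign_filter else norm
        hit = cache.get(key)
        if hit is not None:
            cached[norm] = hit
        else:
            uncached.append(cid)
    return cached, uncached

# Illustrative values only:
cache = {"campaign_A_1a": "ethyl 2-oxopropanoate"}
cached, uncached = partition_by_cache(cache, ["1a", "2b"], "campaign_A")
assert list(cached) == ["1a"] and uncached == ["2b"]
```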
@@ -1424,118 +1995,118 @@ TEXT FROM MANUSCRIPT:
         # Include manuscript pages (first 10) for model reaction context
         manuscript_text = "\n\n".join(self.ms_pages[:10])
 
+        # Add campaign context if provided
+        campaign_context = ""
+        if campaign_filter:
+            campaigns_warning = ""
+            if self.all_campaigns:
+                campaigns_warning = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates.
+"""
+
+            campaign_context = f"""
+
+IMPORTANT CAMPAIGN CONTEXT:
+You are extracting compound information specifically for the {campaign_filter} campaign.
+{campaigns_warning}
+Focus ONLY on compound information relevant to the {campaign_filter} campaign/reaction system.
+Do NOT include compound information from other campaigns.
+
+"""
+
         # Extract from initial sections
         extraction_text = self._extract_sections_by_title(initial_sections)
         if extraction_text:
-            extraction_text = manuscript_text + "\n\n" + extraction_text
+            extraction_text = manuscript_text + campaign_context + "\n\n" + extraction_text
         else:
-            extraction_text = manuscript_text
+            extraction_text = manuscript_text + campaign_context
 
-        # First extraction attempt
+        # First extraction attempt - only for uncached compounds
         mappings = self._extract_compound_mappings_from_text(
-            extraction_text[:50000],
+            extraction_text[:50000], uncached_compound_ids, tag_suffix="initial", campaign_filter=campaign_filter
         )
         LOGGER.info("Tier 1: Found %d compound mappings from standard sections", len(mappings))
 
         # Check for missing compounds
         missing_compounds = []
-        for cid in
+        for cid in uncached_compound_ids:
             mapping = mappings.get(cid.lower().strip())
             if not mapping or not mapping.iupac_name:
                 missing_compounds.append(cid)
 
-        # Tier 2
+        # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
         if missing_compounds:
-            LOGGER.info("Tier 2: %d compounds still missing IUPAC names: %s",
+            LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
                         len(missing_compounds), sorted(missing_compounds))
 
-            #
-            additional_sections = [
-                "Engineering strategy", "Evolution campaign",
-                "Screening", "Optimization", "Substrate synthesis",
-                "Supporting Information", "Supplementary Methods"
-            ]
-
-            # Extract from additional sections
-            additional_text = self._extract_sections_by_title(additional_sections)
-
-            # Also extract any figures that might contain compound structures
+            # Get all available figures for compound structure analysis
             figure_images = {}
-
+
+            # Extract main manuscript figures
+            figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
             for ref in figure_refs:
                 img_b64 = self._extract_page_png(ref, extract_figure_only=True)
                 if img_b64:
                     figure_images[ref] = img_b64
-                    LOGGER.info("
-
-            # Try multimodal approach with figures and expanded text
-            if figure_images or additional_text:
-                combined_text = additional_text[:30000] if additional_text else ""
-                expanded_mappings = self._extract_compound_mappings_with_figures(
-                    combined_text, missing_compounds, figure_images, tag_suffix="tier2"
-                )
-
-                # Merge new mappings
-                new_found = 0
-                for key, mapping in expanded_mappings.items():
-                    if key not in mappings or not mappings[key].iupac_name:
-                        if mapping.iupac_name:
-                            mappings[key] = mapping
-                            new_found += 1
-                            LOGGER.info("Found IUPAC name for '%s': %s",
-                                        key, mapping.iupac_name[:50] + "..." if len(mapping.iupac_name) > 50 else mapping.iupac_name)
-
-                LOGGER.info("Tier 2: Found %d additional compound mappings", new_found)
-
-            # Check again for still missing compounds
-            still_missing = []
-            for cid in missing_compounds:
-                mapping = mappings.get(cid.lower().strip())
-                if not mapping or not mapping.iupac_name:
-                    still_missing.append(cid)
-
-            # Tier 3: Full manuscript search with all available figures
-            if still_missing:
-                LOGGER.info("Tier 3: %d compounds still missing, trying full manuscript search",
-                            len(still_missing))
-
-                # Get all SI figures
-                si_figure_refs = []
-                for page in self.si_pages[:5]:  # Check first 5 SI pages
-                    matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
-                    si_figure_refs.extend(matches[:5])  # Limit to 5 figures
+                    LOGGER.info("Retrieved %s for compound mapping", ref)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Get SI figures
+            si_figure_refs = []
+            for page in self.si_pages[:10]:  # Check first 10 SI pages
+                matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
+                si_figure_refs.extend(matches[:10])  # Limit to 10 figures
+
+            # Extract SI figures
+            for ref in set(si_figure_refs):
+                if ref not in figure_images:
+                    img_b64 = self._extract_page_png(ref, extract_figure_only=True)
+                    if img_b64:
+                        figure_images[ref] = img_b64
+                        LOGGER.info("Extracted %s for compound mapping", ref)
+
+            # Full text search including all pages
+            full_text = "\n\n".join(self.all_pages[:40])  # First 40 pages (more comprehensive)
+
+            final_mappings = self._extract_compound_mappings_with_figures(
+                full_text[:60000], missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
+            )
+
+            # Merge final mappings with better compound ID matching
+            final_found = 0
+            for key, mapping in final_mappings.items():
+                if key not in mappings or not mappings[key].iupac_name:
+                    if mapping.iupac_name:
+                        mappings[key] = mapping
+                        final_found += 1
+                        iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                        LOGGER.info("Found IUPAC name for '%s' in full search: %s", key, iupac_display)
+
+            LOGGER.info("Tier 2: Found %d additional compound mappings", final_found)
+
+        # Cache all newly found mappings using campaign-aware cache key
+        for key, mapping in mappings.items():
+            cache_key = f"{campaign_filter}_{key}" if campaign_filter else key
+            if self._compound_mapping_cache.get(cache_key) is None:
+                self._compound_mapping_cache.put(cache_key, mapping)
+                iupac_display = mapping.iupac_name[:50] + "..." if mapping.iupac_name and len(mapping.iupac_name) > 50 else (mapping.iupac_name or "None")
+                LOGGER.info("Cached compound mapping for: %s -> %s (campaign: %s)", key, iupac_display, campaign_filter)
 
-
+            # Also cache without campaign prefix for backward compatibility during integration
+            if campaign_filter:
+                self._compound_mapping_cache.put(key, mapping)
+
+        # Combine cached and new mappings
+        final_mappings = cached_mappings.copy()
+        final_mappings.update(mappings)
 
-        LOGGER.info("Adaptive compound mapping complete: %d total mappings",
-
+        LOGGER.info("Adaptive compound mapping complete: %d total mappings (%d cached, %d new)",
+                    len(final_mappings), len(cached_mappings), len(mappings))
+        return final_mappings
 
-    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
+    def gather_model_reaction_info(self, enzyme_variants: Optional[List[str]] = None, lineage_compound_ids: Optional[Dict[str, List[str]]] = None) -> Dict[str, Any]:
         """Extract model reaction information using identified locations and 3-tier compound mapping."""
         # First find the best locations
         locations = self.find_model_reaction_locations(enzyme_variants)
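The hunk collapses the old three-tier cascade into two tiers: targeted sections first, then a single full-text-plus-figures pass for whatever is still missing. A control-flow sketch with the two extraction calls abstracted into callables (all names hypothetical):

```python
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional

@dataclass
class Mapping:
    iupac_name: Optional[str] = None

def map_compounds(ids: List[str],
                  tier1: Callable[[List[str]], Dict[str, Mapping]],
                  tier2: Callable[[List[str]], Dict[str, Mapping]]) -> Dict[str, Mapping]:
    mappings = tier1(ids)  # Tier 1: targeted sections
    missing = [i for i in ids
               if not (mappings.get(i.lower().strip())
                       and mappings[i.lower().strip()].iupac_name)]
    if missing:  # Tier 2: one full-text + figures pass
        for key, m in tier2(missing).items():
            if m.iupac_name and not (mappings.get(key) and mappings[key].iupac_name):
                mappings[key] = m
    return mappings

demo = map_compounds(["1a"], lambda ids: {}, lambda ids: {"1a": Mapping("propan-2-ol")})
assert demo["1a"].iupac_name == "propan-2-ol"
```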
@@ -1658,9 +2229,15 @@ TEXT FROM MANUSCRIPT:
                     figure_images[fig_ref] = img_b64
                     LOGGER.info("Successfully extracted %s image for model reaction analysis", fig_ref)
 
-        # Extract compound IDs from locations
+        # Extract compound IDs from locations or use lineage-specific ones
         compound_ids = []
-        if
+        if lineage_compound_ids:
+            # Use lineage-specific compound IDs if provided
+            substrate_ids = lineage_compound_ids.get("substrate_ids", [])
+            product_ids = lineage_compound_ids.get("product_ids", [])
+            compound_ids = substrate_ids + product_ids
+            LOGGER.info("Using lineage-specific compound IDs: %s", compound_ids)
+        elif locations and locations.get("model_reaction_location", {}).get("compound_ids"):
             compound_ids = locations["model_reaction_location"]["compound_ids"]
             LOGGER.info("Found compound IDs in model reaction: %s", compound_ids)
 
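The new `lineage_compound_ids` parameter is a plain dict of ID lists; when present it short-circuits the location-based compound discovery. Its expected shape, with illustrative IDs:

```python
lineage_ids = {
    "substrate_ids": ["1a", "2b"],  # illustrative compound IDs
    "product_ids": ["3a"],
}
compound_ids = lineage_ids.get("substrate_ids", []) + lineage_ids.get("product_ids", [])
```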
@@ -1668,7 +2245,7 @@ TEXT FROM MANUSCRIPT:
         compound_mappings = {}
         if compound_ids:
             LOGGER.info("Using 3-tier compound mapping approach for compounds: %s", compound_ids)
-            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids)
+            compound_mappings = self._extract_compound_mappings_adaptive(compound_ids, campaign_filter=self.campaign_filter)
 
         # Add the mapped IUPAC names to the context for better extraction
         if compound_mappings:
@@ -1679,8 +2256,35 @@ TEXT FROM MANUSCRIPT:
                 mapping_text += f"Compound {cid}: {mapping.iupac_name}\n"
             text_context += mapping_text
 
+        # Add campaign context if available
+        campaign_context = ""
+        if enzyme_variants and self.campaign_filter:
+            campaigns_context = ""
+            if self.all_campaigns:
+                campaigns_context = f"""
+ALL CAMPAIGNS IN THIS PAPER:
+{chr(10).join([f"- {campaign}" for campaign in self.all_campaigns])}
+
+CRITICAL WARNING: Do NOT confuse campaigns! Each campaign uses completely different substrates:
+- Different campaigns may use similar enzyme names but different substrates
+- Be extremely careful to only extract data for the {self.campaign_filter} campaign
+- Ignore data from other campaigns even if they seem similar
+"""
+
+            campaign_context = f"""
+IMPORTANT CONTEXT:
+You are extracting the model reaction used specifically for these enzyme variants:
+{', '.join(enzyme_variants[:10])}{'...' if len(enzyme_variants) > 10 else ''}
+
+These variants belong to campaign: {self.campaign_filter}
+{campaigns_context}
+Focus on extracting the model reaction that was used to evaluate THESE specific variants.
+Different campaigns may use different model reactions and substrates.
+
+"""
+
         # Include both manuscript and SI text for better coverage
-        prompt = PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
+        prompt = campaign_context + PROMPT_MODEL_REACTION + "\n\n=== CONTEXT ===\n" + text_context
 
         try:
             # Use multimodal extraction if we have figure images
@@ -1693,9 +2297,10 @@ TEXT FROM MANUSCRIPT:
                 for fig_ref, fig_base64 in figure_images.items():
                     try:
                         img_bytes = b64decode(fig_base64)
-                        image
+                        # Format image for Gemini API
+                        image_part = {"mime_type": "image/png", "data": img_bytes}
                         content_parts.append(f"\n[Figure: {fig_ref}]")
-                        content_parts.append(
+                        content_parts.append(image_part)
                     except Exception as e:
                         LOGGER.warning("Failed to process figure %s: %s", fig_ref, e)
 
@@ -1714,6 +2319,20 @@ TEXT FROM MANUSCRIPT:
 
             response = model.generate_content(content_parts)
 
+            # Track token usage if available
+            try:
+                if hasattr(response, 'usage_metadata'):
+                    input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                    output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                    if input_tokens or output_tokens:
+                        try:
+                            from .wrapper import add_token_usage
+                            add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                        except ImportError:
+                            pass  # wrapper not available
+            except Exception:
+                pass  # token tracking is best-effort
+
             # Parse JSON from response
             if response and response.text:
                 # Save debug output
@@ -1754,32 +2373,59 @@ TEXT FROM MANUSCRIPT:
                 if isinstance(data, dict):
                     # If we have compound mappings, enhance the IUPAC names
                     if compound_ids and compound_mappings:
+                        LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
+                                    list(compound_mappings.keys()))
+
                         # Try to map substrate/product lists through compound IDs
-                        substrate_list = data.get("substrate_iupac_list", [])
+                        substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
                         if isinstance(substrate_list, list):
                             enhanced_substrates = []
                             for item in substrate_list:
+                                item_str = str(item).lower().strip()
                                 # Check if it's a compound ID that we can map
-                                mapping = compound_mappings.get(
+                                mapping = compound_mappings.get(item_str)
                                 if mapping and mapping.iupac_name:
                                     enhanced_substrates.append(mapping.iupac_name)
+                                    LOGGER.info("Mapped substrate '%s' -> '%s'", item, mapping.iupac_name)
                                 elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                    # Keep valid IUPAC names
+                                    # Keep valid IUPAC names that aren't compound IDs
                                     enhanced_substrates.append(str(item))
+                                    LOGGER.info("Kept substrate IUPAC name: '%s'", item)
+                                else:
+                                    LOGGER.warning("Could not map substrate compound ID '%s'", item)
                             data["substrate_iupac_list"] = enhanced_substrates
 
-                        product_list = data.get("product_iupac_list", [])
+                        product_list = data.get("product_iupac_list", []) or data.get("product_list", [])
                         if isinstance(product_list, list):
                             enhanced_products = []
                             for item in product_list:
+                                item_str = str(item).lower().strip()
                                 # Check if it's a compound ID that we can map
-                                mapping = compound_mappings.get(
+                                mapping = compound_mappings.get(item_str)
                                 if mapping and mapping.iupac_name:
                                     enhanced_products.append(mapping.iupac_name)
+                                    LOGGER.info("Mapped product '%s' -> '%s'", item, mapping.iupac_name)
                                 elif item and not re.match(r'^[0-9]+[a-z]?$|^S\d+$', str(item)):
-                                    # Keep valid IUPAC names
+                                    # Keep valid IUPAC names that aren't compound IDs
                                     enhanced_products.append(str(item))
+                                    LOGGER.info("Kept product IUPAC name: '%s'", item)
+                                else:
+                                    LOGGER.warning("Could not map product compound ID '%s'", item)
                             data["product_iupac_list"] = enhanced_products
+
+                        # Also try to enhance using both substrate_list and product_list if they contain compound IDs
+                        for list_key, target_key in [("substrate_list", "substrate_iupac_list"), ("product_list", "product_iupac_list")]:
+                            if list_key in data and isinstance(data[list_key], list):
+                                if target_key not in data or not data[target_key]:
+                                    enhanced_list = []
+                                    for item in data[list_key]:
+                                        item_str = str(item).lower().strip()
+                                        mapping = compound_mappings.get(item_str)
+                                        if mapping and mapping.iupac_name:
+                                            enhanced_list.append(mapping.iupac_name)
+                                            LOGGER.info("Enhanced %s: mapped '%s' -> '%s'", target_key, item, mapping.iupac_name)
+                                    if enhanced_list:
+                                        data[target_key] = enhanced_list
 
                     # Validate and convert arrays to semicolon-separated strings for CSV compatibility
                     if "substrate_iupac_list" in data and isinstance(data["substrate_iupac_list"], list):
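The enhancement loops apply one rule three times: resolve known compound IDs to IUPAC names, keep strings that already look like names, and drop bare IDs such as `1a` or `S3` that cannot be resolved. The rule in isolation, with mapping objects and names purely illustrative:

```python
import re
from types import SimpleNamespace

COMPOUND_ID_RE = re.compile(r'^[0-9]+[a-z]?$|^S\d+$')

def enhance(items, compound_mappings):
    out = []
    for item in items:
        mapping = compound_mappings.get(str(item).lower().strip())
        if mapping and mapping.iupac_name:
            out.append(mapping.iupac_name)        # resolved compound ID
        elif item and not COMPOUND_ID_RE.match(str(item)):
            out.append(str(item))                 # already an IUPAC-like name
        # bare unresolvable IDs ("1a", "S3") are dropped; the diff logs a warning
    return out

mappings = {"1a": SimpleNamespace(iupac_name="(R)-1-phenylethan-1-ol")}
print(enhance(["1a", "S3", "benzaldehyde"], mappings))
# -> ['(R)-1-phenylethan-1-ol', 'benzaldehyde']
```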
@@ -1826,8 +2472,8 @@ TEXT FROM MANUSCRIPT:
         return data
 
     def _process_single_lineage(self, location: Dict[str, Any], enzyme_df: pd.DataFrame) -> pd.DataFrame:
-        """Process a single lineage case -
-        #
+        """Process a single lineage case - use confidence-based processing."""
+        # Create lineage analysis for single location
         lineage_analysis = {
             'has_multiple_lineages': False,
             'lineage_groups': [{
@@ -1839,7 +2485,7 @@ TEXT FROM MANUSCRIPT:
             }]
         }
 
-        return self.
+        return self._process_multiple_lineages_by_confidence([location], enzyme_df, lineage_analysis)
 
     def _process_multiple_lineages_by_confidence(self, locations: List[Dict[str, Any]],
                                                  enzyme_df: pd.DataFrame,
@@ -1854,186 +2500,69 @@ TEXT FROM MANUSCRIPT:
         # If enzyme_df has campaign_id column, we can use it to filter
         has_campaign_info = 'campaign_id' in enzyme_df.columns
 
-        #
-
-
-
-
-
-
-
+        # Select the most confident source only
+        best_location = None
+        if locations:
+            # Sort by confidence only
+            locations_sorted = sorted(locations, key=lambda x: -x.get('confidence', 0))
+            best_location = locations_sorted[0]
+
+            LOGGER.info("Selected primary location: %s (type: %s, confidence: %d%%)",
+                        best_location['location'],
+                        best_location.get('type', 'unknown'),
+                        best_location.get('confidence', 0))
 
-            # Extract metrics from
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids,
+            # Extract metrics from the most confident source only
+            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, best_location['location'])
 
             # Filter to valid metrics
             valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
 
             if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s",
-
+                LOGGER.warning("No valid metrics found in primary location %s", best_location['location'])
+                return pd.DataFrame()
 
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics),
+            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), best_location['location'])
 
-            # Create DataFrame for
+            # Create DataFrame for the single best location
             df_location = pd.DataFrame(valid_metrics)
 
-            #
-
-
-            variants_with_data.update(new_variants)
+            # Add metadata about the location
+            df_location['data_location'] = best_location['location']
+            df_location['confidence'] = best_location.get('confidence', 0)
 
-
-            # by checking which variants are present
-            location_variants = set(df_location['enzyme'].tolist())
+            LOGGER.info("Successfully extracted data for %d enzymes from primary location", len(df_location))
 
-            #
-            if
-
-
-
-
-            else:
-                variant_campaigns = enzyme_df[enzyme_df['enzyme'].isin(location_variants)]['campaign_id'].unique()
-                if len(variant_campaigns) == 1:
-                    campaign_id = variant_campaigns[0]
-                    LOGGER.info("Location %s contains variants from campaign: %s",
-                                location['location'], campaign_id)
-                elif len(variant_campaigns) > 1:
-                    LOGGER.warning("Location %s contains variants from multiple campaigns: %s",
-                                   location['location'], variant_campaigns)
-
-            # Extract model reaction specific to this location/campaign
-            location_context = f"Location: {location['location']}"
-            if location.get('caption'):
-                location_context += f"\nCaption: {location['caption']}"
-
-            # First find model reaction locations for this campaign/enzyme group
+            # Extract model reaction info once for this location
+            location_context = f"Location: {best_location['location']}"
+            if best_location.get('caption'):
+                location_context += f"\nCaption: {best_location['caption']}"
+
+            # Get enzyme list for model reaction
             location_enzymes = df_location['enzyme'].unique().tolist()
+            # Get model reaction locations for this campaign
             model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
 
-            #
-
-                location['location'],
-                location_context,
-                model_reaction_locations
-            )
-
-            # Get full model reaction info with IUPAC names
-            if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-                model_info = self._extract_lineage_model_info(location_model_reaction)
-            else:
-                # Fall back to general model reaction extraction
-                # Pass the enzyme variants from this location
-                model_info = self.gather_model_reaction_info(location_enzymes)
-
-            # Add model reaction info to all enzymes from this location
-            for key, value in model_info.items():
-                if isinstance(value, list):
-                    value = "; ".join(str(v) for v in value) if value else None
-                df_location[key] = value
-
-            # Add location and campaign info
-            df_location['data_location'] = location['location']
-            df_location['location_type'] = location.get('type', 'unknown')
-            df_location['location_confidence'] = location.get('confidence', 0)
-            # Remove lineage_group column - not working properly
-            # df_location['lineage_group'] = location.get('lineage_hint', campaign_id or 'unknown')
-
-            all_results.append(df_location)
+            # Extract model reaction for this location - use unified approach
+            LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
 
-            #
-            LOGGER.info("Progress: %d/%d variants have data",
-                        len(variants_with_data), len(all_variants))
-
-        if all_results:
-            # Combine all results
-            df_combined = pd.concat(all_results, ignore_index=True)
-
-            # If we have duplicates (same variant in multiple locations), keep the one with highest confidence
-            if df_combined.duplicated(subset=['enzyme']).any():
-                LOGGER.info("Removing duplicates, keeping highest confidence data")
-                df_combined = df_combined.sort_values(
-                    ['enzyme', 'location_confidence'],
-                    ascending=[True, False]
-                ).drop_duplicates(subset=['enzyme'], keep='first')
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d unique variants from %d locations",
-                        len(df_combined), len(all_results))
-
-            if 'data_location' in df_combined.columns:
-                for location in df_combined['data_location'].unique():
-                    location_enzymes = df_combined[df_combined['data_location'] == location]
-                    LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))
-
-            return df_combined
-        else:
-            LOGGER.warning("No metrics extracted from any location")
-            return pd.DataFrame()
-
-    def _process_multiple_lineages(self, locations: List[Dict[str, Any]],
-                                   enzyme_df: pd.DataFrame,
-                                   lineage_analysis: Dict[str, Any]) -> pd.DataFrame:
-        """Process multiple lineages where each location represents a different model reaction."""
-        all_metrics = []
-        lineage_groups = lineage_analysis.get('lineage_groups', [])
-
-        # Get all enzyme IDs for extraction attempts
-        all_enzyme_ids = enzyme_df['enzyme_id'].tolist() if 'enzyme_id' in enzyme_df.columns else []
-
-        for group in lineage_groups:
-            group_location = group.get('data_location')
-            group_id = group.get('group_id')
-
-            # Find the location info
-            location_info = next((loc for loc in locations if loc['location'] == group_location), None)
-            if not location_info:
-                LOGGER.warning("No location info found for group %s at %s", group_id, group_location)
-                continue
-
-            LOGGER.info("Processing location %s (%s)", group_location, group_id)
-
-            # Extract metrics from this location for ALL enzymes
-            # The extractor will return only those that actually have data
-            metrics_rows = self.extract_metrics_batch(all_enzyme_ids, group_location)
-
-            # Filter to enzymes that actually had data in this location
-            valid_metrics = [m for m in metrics_rows if self._has_valid_metrics(m)]
-
-            if not valid_metrics:
-                LOGGER.warning("No valid metrics found in %s", group_location)
-                continue
-
-            LOGGER.info("Found %d enzymes with data in %s", len(valid_metrics), group_location)
-
-            # Create DataFrame for this location
-            df_location = pd.DataFrame(valid_metrics)
-
-            # Extract model reaction specific to this location
-            # Different locations = different model reactions
-            location_context = f"Location: {group_location}"
-            if group.get('caption'):
-                location_context += f"\nCaption: {group['caption']}"
-
-            # First find model reaction locations for this enzyme group
-            location_enzymes = df_location['enzyme'].unique().tolist() if 'enzyme' in df_location.columns else all_enzyme_ids
-            model_reaction_locations = self.find_model_reaction_locations(location_enzymes)
-
-            # Try to find model reaction for this specific lineage, passing the locations
+            # Try lineage-specific extraction first
             location_model_reaction = self.find_lineage_model_reaction(
-
+                best_location['location'],
                 location_context,
                 model_reaction_locations
             )
 
-            #
+            # Check if lineage extraction was successful
             if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
-
+                LOGGER.info("Using lineage-specific model reaction data")
+                model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
             else:
-
-                #
+                LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
+                # Use the comprehensive multimodal approach as fallback
                 model_info = self.gather_model_reaction_info(location_enzymes)
+
+            LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
 
             # Add model reaction info to all enzymes from this location
             for key, value in model_info.items():
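Instead of iterating every candidate location, the rewritten method extracts from the single highest-confidence source. The selection itself is one sort, shown here with illustrative locations:

```python
locations = [
    {"location": "Table S2", "type": "table", "confidence": 70},
    {"location": "Figure 2", "type": "figure", "confidence": 95},
]
best_location = sorted(locations, key=lambda x: -x.get("confidence", 0))[0]
assert best_location["location"] == "Figure 2"
```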
@@ -2041,30 +2570,18 @@ TEXT FROM MANUSCRIPT:
                     value = "; ".join(str(v) for v in value) if value else None
                 df_location[key] = value
 
-            # Add location
-            df_location['
-
-            # df_location['lineage_group'] = group.get('lineage_hint', group_id)
-
-            all_metrics.append(df_location)
-
-        if all_metrics:
-            # Combine all metrics
-            df_combined = pd.concat(all_metrics, ignore_index=True)
-
-            # Log extraction summary
-            LOGGER.info("Extraction complete: %d total enzymes from %d locations",
-                        len(df_combined), len(all_metrics))
+            # Add additional location metadata (data_location already set above)
+            df_location['location_type'] = best_location.get('type', 'unknown')
+            df_location['location_confidence'] = best_location.get('confidence', 0)
 
-
-
-            location_enzymes = df_combined[df_combined['data_location'] == location]
-            LOGGER.info("  - %s: %d enzymes", location, len(location_enzymes))
+            LOGGER.info("Extraction complete: %d variants from primary location %s",
+                        len(df_location), best_location['location'])
 
-        return
-
-
-
+            return df_location
+
+        # No locations found
+        LOGGER.warning("No valid locations found for extraction")
+        return pd.DataFrame()
 
     def _has_valid_metrics(self, metrics_row: Dict[str, Any]) -> bool:
         """Check if a metrics row contains any valid performance data."""
@@ -2151,14 +2668,18 @@ TEXT FROM MANUSCRIPT:
 
         return filtered
 
-    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any]) -> Dict[str, Any]:
+    def _extract_lineage_model_info(self, lineage_reaction: Dict[str, Any], enzyme_variants: Optional[List[str]] = None) -> Dict[str, Any]:
         """Extract full model reaction info including IUPAC names for a lineage."""
         # Get substrate/product IDs from lineage-specific extraction
         substrate_ids = lineage_reaction.get('substrate_ids', [])
         product_ids = lineage_reaction.get('product_ids', [])
 
-        # Get general model reaction info for conditions
-
+        # Get general model reaction info for conditions, using lineage-specific compound IDs
+        lineage_ids = {
+            "substrate_ids": substrate_ids,
+            "product_ids": product_ids
+        }
+        general_info = self.gather_model_reaction_info(enzyme_variants, lineage_compound_ids=lineage_ids)
 
         # Override substrate/product lists with lineage-specific ones only if they contain actual compound IDs
         model_info = general_info.copy()
@@ -2304,6 +2825,9 @@ TEXT FROM MANUSCRIPT:
             LOGGER.error("No enzyme DataFrame provided - this module requires enzyme CSV input")
             return pd.DataFrame()
 
+        # Store enzyme_df for use in extract_metrics_batch
+        self.enzyme_df = enzyme_df
+
         # Check if we have campaign_id column - if so, process each campaign separately
         if 'campaign_id' in enzyme_df.columns and not self.campaign_filter:
             campaigns = enzyme_df['campaign_id'].unique()
@@ -2322,15 +2846,16 @@ TEXT FROM MANUSCRIPT:
                     si=self.si,
                     cfg=self.cfg,
                     debug_dir=self.debug_dir / campaign_id if self.debug_dir else None,
-                    campaign_filter=campaign_id
+                    campaign_filter=campaign_id,
+                    all_campaigns=campaigns.tolist()
                 )
 
                 # Run extraction for this campaign
                 campaign_df = campaign_extractor.run(enzyme_df)
 
                 if not campaign_df.empty:
-                    # Add campaign identifier
-                    campaign_df['
+                    # Add a temporary campaign identifier for merging
+                    campaign_df['_extraction_campaign'] = campaign_id
                     all_campaign_results.append(campaign_df)
                     LOGGER.info("Extracted %d reactions for campaign %s", len(campaign_df), campaign_id)
 
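The `_extraction_campaign` column is a deliberately underscore-prefixed temporary marker: it tags each campaign's metrics so the later merge can pair rows by enzyme and campaign, after which it is dropped. For example:

```python
import pandas as pd

campaign_df = pd.DataFrame({"enzyme": ["v1", "v2"], "yield": [81.0, 64.0]})
# Marker consumed (and removed) later by merge_with_lineage_data
campaign_df["_extraction_campaign"] = "campaign_A"
```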
@@ -2401,7 +2926,7 @@ TEXT FROM MANUSCRIPT:
 def merge_with_lineage_data(
     df_lineage: pd.DataFrame, df_metrics: pd.DataFrame
 ) -> pd.DataFrame:
-    """
+    """Merge lineage and metrics data ensuring one-to-one mapping per campaign."""
 
     # Handle both 'enzyme' and 'enzyme_id' column names
     if "enzyme_id" in df_lineage.columns and "enzyme" not in df_lineage.columns:
@@ -2410,7 +2935,19 @@ def merge_with_lineage_data(
     if "enzyme" not in df_lineage.columns:
         raise ValueError("Lineage CSV must have an 'enzyme' or 'enzyme_id' column.")
 
-
+    # Check if we have campaign information to match on
+    if "campaign_id" in df_lineage.columns and "_extraction_campaign" in df_metrics.columns:
+        # Match on both enzyme and campaign to ensure correct pairing
+        df_metrics_temp = df_metrics.copy()
+        df_metrics_temp['campaign_id'] = df_metrics_temp['_extraction_campaign']
+        df_metrics_temp = df_metrics_temp.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
+    else:
+        # Simple merge on enzyme only
+        if "_extraction_campaign" in df_metrics.columns:
+            df_metrics = df_metrics.drop('_extraction_campaign', axis=1)
+        merged = df_lineage.merge(df_metrics, on="enzyme", how="left")
+
     return merged
 
 ###############################################################################
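A worked example of the campaign-aware branch, showing why matching on the pair (enzyme, campaign_id) matters when the same variant name appears in two campaigns:

```python
import pandas as pd

df_lineage = pd.DataFrame({"enzyme": ["v1", "v1"],
                           "campaign_id": ["campaign_A", "campaign_B"]})
df_metrics = pd.DataFrame({"enzyme": ["v1"], "yield": [81.0],
                           "_extraction_campaign": ["campaign_A"]})

df_metrics_temp = df_metrics.copy()
df_metrics_temp["campaign_id"] = df_metrics_temp["_extraction_campaign"]
df_metrics_temp = df_metrics_temp.drop("_extraction_campaign", axis=1)
merged = df_lineage.merge(df_metrics_temp, on=["enzyme", "campaign_id"], how="left")
print(merged)
#   enzyme campaign_id  yield
# 0     v1  campaign_A   81.0
# 1     v1  campaign_B    NaN   <- no cross-campaign contamination
```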
@@ -2439,25 +2976,103 @@ def main() -> None:
     if args.verbose:
         LOGGER.setLevel(logging.DEBUG)
     cfg = Config()
-    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir)
 
-    # Load enzyme data from CSV if provided
+    # Load enzyme data from CSV if provided to detect campaign information
     enzyme_df = None
+    campaign_filter = None
+    all_campaigns = None
+
     if args.lineage_csv and args.lineage_csv.exists():
         LOGGER.info("Loading enzyme data from CSV…")
         enzyme_df = pd.read_csv(args.lineage_csv)
-
-
-
+
+        # Detect campaign information from the enzyme CSV
+        if 'campaign_id' in enzyme_df.columns:
+            all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
+            if len(all_campaigns) == 1:
+                campaign_filter = all_campaigns[0]
+                LOGGER.info("Detected single campaign: %s", campaign_filter)
+
+                extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                              campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+                df_metrics = extractor.run(enzyme_df)
+
+            elif len(all_campaigns) > 1:
+                LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
+                all_results = []
+
+                # Process each campaign separately
+                for campaign in all_campaigns:
+                    LOGGER.info("Processing campaign: %s", campaign)
+
+                    # Filter enzyme_df to this campaign
+                    campaign_df = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                    LOGGER.info("Found %d enzymes for campaign %s", len(campaign_df), campaign)
+
+                    if len(campaign_df) == 0:
+                        LOGGER.warning("No enzymes found for campaign %s, skipping", campaign)
+                        continue
+
+                    # Create extractor for this campaign
+                    extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                                  campaign_filter=campaign, all_campaigns=all_campaigns)
+
+                    # Run extraction for this campaign
+                    campaign_metrics = extractor.run(campaign_df)
+
+                    if not campaign_metrics.empty:
+                        # Merge with lineage data for this campaign
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if "enzyme_id" in campaign_lineage.columns and "enzyme" not in campaign_lineage.columns:
+                            campaign_lineage = campaign_lineage.rename(columns={"enzyme_id": "enzyme"})
+
+                        # Merge campaign metrics with lineage data
+                        campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
+
+                        # Save campaign-specific file immediately
+                        output_dir = args.output.parent
+                        base_name = args.output.stem
+                        campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                        campaign_final.to_csv(campaign_file, index=False)
+                        LOGGER.info("Saved %d rows for campaign %s -> %s", len(campaign_final), campaign, campaign_file)
+
+                        # Add the merged data (not just metrics) to final results
+                        all_results.append(campaign_final)
+                        LOGGER.info("Added %d merged results for campaign %s", len(campaign_final), campaign)
+                    else:
+                        LOGGER.warning("No results extracted for campaign %s", campaign)
+
+                        # Still save an empty campaign file with lineage data
+                        campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
+                        if not campaign_lineage.empty:
+                            output_dir = args.output.parent
+                            base_name = args.output.stem
+                            campaign_file = output_dir / f"{base_name}_{campaign}.csv"
+                            campaign_lineage.to_csv(campaign_file, index=False)
+                            LOGGER.info("Saved %d rows (lineage only) for campaign %s -> %s", len(campaign_lineage), campaign, campaign_file)
+
+                # Combine all campaign results
+                if all_results:
+                    df_metrics = pd.concat(all_results, ignore_index=True)
+                    LOGGER.info("Combined results from %d campaigns: %d total rows", len(all_results), len(df_metrics))
+                else:
+                    LOGGER.warning("No results from any campaign")
+                    df_metrics = pd.DataFrame()
+        else:
+            # No campaign information, process all enzymes together
+            extractor = ReactionExtractor(args.manuscript, args.si, cfg, debug_dir=args.debug_dir,
+                                          campaign_filter=campaign_filter, all_campaigns=all_campaigns)
+            df_metrics = extractor.run(enzyme_df)
 
-
-
-
-    else
-        df_final = df_metrics
+    # Skip final merge since campaign-specific merges already happened during processing
+    # This avoids duplicate entries when same enzyme appears in multiple campaigns
+    df_final = df_metrics
+    LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
 
     df_final.to_csv(args.output, index=False)
     LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
+
+    # Campaign-specific files are already saved during processing above
 
 if __name__ == "__main__":
     main()