debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +656 -27
- debase/enzyme_lineage_extractor.py +1077 -109
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.2.dist-info/METADATA +0 -296
- debase-0.4.2.dist-info/RECORD +0 -16
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ import time
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Union, Tuple, Dict, Any

 MODEL_NAME: str = "gemini-2.5-flash"
 MAX_CHARS: int = 150_000  # Max characters sent to LLM
@@ -142,21 +142,36 @@ def extract_text(pdf_path: str | Path | bytes) -> str:


 def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
-    """Extract figure/table captions
+    """Extract ALL figure/table captions with extensive surrounding context.

     The function scans every text line on every page and keeps lines whose first
     token matches `_CAPTION_PREFIX_RE`. This covers labels such as:
-      * Fig. 1, Figure 2A,
+      * Fig. 1, Figure 2A, Figure 2B, Figure 2C (ALL sub-captions)
       * Table S1, Table 4, Scheme 2, Chart 1B
-      * Supplementary Fig.
+      * Supplementary Fig. S5A, S5B, S5C (ALL variations)
+
+    For SI documents, includes extensive context since understanding what each
+    section contains is crucial for accurate location identification.
     """

     doc = _open_doc(pdf_path)
     captions: list[str] = []
     try:
-        for page in doc:
+        for page_num, page in enumerate(doc):
             page_dict = page.get_text("dict")
+
+            # Get all text blocks on this page for broader context
+            page_text_blocks = []
             for block in page_dict.get("blocks", []):
+                block_text = ""
+                for line in block.get("lines", []):
+                    text_line = "".join(span["text"] for span in line.get("spans", []))
+                    if text_line.strip():
+                        block_text += text_line.strip() + " "
+                if block_text.strip():
+                    page_text_blocks.append(block_text.strip())
+
+            for block_idx, block in enumerate(page_dict.get("blocks", [])):
                 # Get all lines in this block
                 block_lines = []
                 for line in block.get("lines", []):
@@ -166,21 +181,94 @@ def extract_captions(pdf_path: str | Path | bytes, max_chars: int = MAX_CHARS) -> str:
                 # Check if any line starts with a caption prefix
                 for i, line in enumerate(block_lines):
                     if _CAPTION_PREFIX_RE.match(line):
-
-
+                        context_parts = []
+
+                        # Add page context for SI documents (more critical there)
+                        context_parts.append(f"Page {page_num + 1}")
+
+                        # Add extensive context before the caption (5-7 lines for SI context)
+                        context_before = []
+
+                        # First try to get context from current block
+                        for k in range(max(0, i-7), i):
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_before.append(block_lines[k])
+
+                        # If not enough context, look at previous text blocks on the page
+                        if len(context_before) < 3 and block_idx > 0:
+                            prev_block_text = page_text_blocks[block_idx - 1] if block_idx < len(page_text_blocks) else ""
+                            if prev_block_text:
+                                # Get last few sentences from previous block
+                                sentences = prev_block_text.split('. ')
+                                context_before = sentences[-2:] + context_before if len(sentences) > 1 else [prev_block_text] + context_before
+
+                        if context_before:
+                            # Include more extensive context for better understanding
+                            context_text = " ".join(context_before[-5:])  # Last 5 lines/sentences of context
+                            context_parts.append("Context: " + context_text)
+
+                        # Extract the COMPLETE caption including all sub-parts
                         caption_parts = [line]
-
+                        j = i + 1
+
+                        # Continue collecting caption text until we hit a clear break
+                        while j < len(block_lines):
                             next_line = block_lines[j]
-
-
-
+
+                            # Stop if we hit an empty line followed by non-caption text
+                            if not next_line:
+                                # Check if the line after empty is a new caption
+                                if j + 1 < len(block_lines) and _CAPTION_PREFIX_RE.match(block_lines[j + 1]):
+                                    break
+                                # If next non-empty line is not a caption, continue collecting
+                                elif j + 1 < len(block_lines):
+                                    j += 1
+                                    continue
+                                else:
+                                    break
+
+                            # Stop if we hit a new caption
                             if _CAPTION_PREFIX_RE.match(next_line):
                                 break
+
+                            # Include this line as part of the caption
                             caption_parts.append(next_line)
+                            j += 1

-                        # Join the parts
+                        # Join the caption parts
                         full_caption = " ".join(caption_parts)
-
+                        context_parts.append("Caption: " + full_caption)
+
+                        # Add extensive context after the caption (especially important for SI)
+                        context_after = []
+
+                        # Look for descriptive text following the caption
+                        for k in range(j, min(len(block_lines), j + 10)):  # Look ahead up to 10 lines
+                            if k < len(block_lines) and block_lines[k].strip():
+                                if not _CAPTION_PREFIX_RE.match(block_lines[k]):
+                                    context_after.append(block_lines[k])
+
+                        # If not enough context, look at next text blocks
+                        if len(context_after) < 3 and block_idx + 1 < len(page_text_blocks):
+                            next_block_text = page_text_blocks[block_idx + 1]
+                            if next_block_text:
+                                # Get first few sentences from next block
+                                sentences = next_block_text.split('. ')
+                                context_after.extend(sentences[:3] if len(sentences) > 1 else [next_block_text])
+
+                        if context_after:
+                            # Include extensive following context
+                            following_text = " ".join(context_after[:7])  # First 7 lines of following context
+                            context_parts.append("Following: " + following_text)
+
+                        # For SI documents, add section context if this appears to be a section header
+                        if any(keyword in full_caption.lower() for keyword in ['supplementary', 'supporting', 'si ', 's1', 's2', 's3']):
+                            context_parts.append("SI_SECTION: This appears to be supplementary material content")
+
+                        # Combine all parts with proper separation
+                        full_caption_with_context = " | ".join(context_parts)
+                        captions.append(full_caption_with_context)
     finally:
         doc.close()

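The practical effect of the `extract_captions` rewrite is easiest to see in the string it now appends per caption. A minimal sketch of that output shape, with the field contents invented for illustration:

```python
# Shape of one entry appended to `captions` by the rewritten
# extract_captions() above; the text fields here are made up.
context_parts = [
    "Page 7",
    "Context: last few sentences preceding the caption ...",
    "Caption: Figure 2B. Lineage of evolved variants ...",
    "Following: descriptive text after the caption ...",
]
full_caption_with_context = " | ".join(context_parts)
# -> "Page 7 | Context: ... | Caption: Figure 2B. ... | Following: ..."
```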
@@ -639,17 +727,18 @@ Return a JSON object with:

 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
+{campaign_context}
 Given the following article text, list up to {max_results} *locations* (page
 numbers, figure/table IDs, or section headings) that you would review first to
 find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced).
+came from which parent and what mutations were introduced){campaign_specific}.

 Respond with a JSON array of objects, each containing:
 - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
-
+{campaign_field}
 IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")

 Order by confidence score (highest first). Tables showing complete variant lineages or
@@ -660,9 +749,9 @@ Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"}}
+  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
 ]
 """.strip()

@@ -827,7 +916,39 @@ def identify_evolution_locations(

     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-
+
+    # Add campaign context if provided
+    campaign_context = ""
+    campaign_specific = ""
+    campaign_field = ""
+    campaign_example = ""
+
+    if campaigns and len(campaigns) == 1:
+        # Single campaign - make it specific
+        camp = campaigns[0]
+        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
+        if hasattr(camp, 'notes') and camp.notes:
+            campaign_context += f"- Key identifiers: {camp.notes}\n"
+        campaign_specific = f" for the '{camp.campaign_name}' campaign"
+        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
+        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
+    elif campaigns and len(campaigns) > 1:
+        # Multiple campaigns - list them all
+        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
+        for camp in campaigns:
+            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+        campaign_specific = " for any of the identified campaigns"
+        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
+        campaign_example = ', "campaign_id": "campaign_id_here"'
+
+    prompt = _LINEAGE_LOC_PROMPT.format(
+        campaign_context=campaign_context,
+        max_results=max_results,
+        campaign_specific=campaign_specific,
+        campaign_field=campaign_field,
+        campaign_example=campaign_example
+    ) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
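For a single-campaign run, the four placeholders assemble as below; the campaign name and ID here are hypothetical, not from the package:

```python
# Hypothetical single-campaign values showing how the placeholders
# slot into _LINEAGE_LOC_PROMPT (names/IDs invented for illustration).
prompt = _LINEAGE_LOC_PROMPT.format(
    campaign_context=(
        "\nYou are looking for lineage data for a SPECIFIC campaign:\n"
        "- Campaign: thermostability_evolution\n- Description: ...\n"
    ),
    max_results=5,
    campaign_specific=" for the 'thermostability_evolution' campaign",
    campaign_field='\n- "campaign_id": "camp_1" (optional - include if this location is specific to one campaign)',
    campaign_example=', "campaign_id": "camp_1"',
) + "\n\nTEXT:\n" + combined_text
```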
@@ -1290,6 +1411,138 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

+def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text."""
+    import re
+
+    if location_type == 'table':
+        # Find ALL mentions of this table and combine them
+        location_clean = location.strip()
+
+        # Different ways the table might be referenced
+        search_patterns = [
+            location_clean,  # Exact match
+            location_clean.replace("Supplementary ", "Supp. "),  # Common abbreviation
+            location_clean.replace("Supplementary ", "S"),  # E.g., "Table S3"
+            location_clean.replace("Supplementary Table ", "Table S"),  # Another common format
+        ]
+
+        # Collect all occurrences
+        all_occurrences = []
+        seen_positions = set()
+
+        for search_term in search_patterns:
+            pattern = re.compile(re.escape(search_term), re.IGNORECASE)
+            for match in pattern.finditer(full_text):
+                # Avoid duplicates from overlapping patterns
+                if match.start() in seen_positions:
+                    continue
+                seen_positions.add(match.start())
+
+                # Extract generous context around each mention
+                start = max(0, match.start() - 1000)
+                end = min(len(full_text), match.end() + 10000)
+                context = full_text[start:end]
+
+                all_occurrences.append({
+                    'position': match.start(),
+                    'context': context,
+                    'match': match.group()
+                })
+
+        if not all_occurrences:
+            log.warning(f"No occurrences of table '{location_clean}' found in text")
+            return None
+
+        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
+
+        # Combine all occurrences into one text for Gemini to analyze
+        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
+
+        for i, occurrence in enumerate(all_occurrences, 1):
+            combined_text += f"--- Occurrence {i} at position {occurrence['position']} ---\n"
+            combined_text += occurrence['context']
+            combined_text += "\n\n"
+
+        # Limit total length to avoid overwhelming the model
+        if len(combined_text) > 50000:
+            combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+
+        return combined_text
+
+    elif location_type == 'figure':
+        # For figures, we mainly want the caption and any text description
+        location_clean = location.strip()
+        patterns = [
+            rf'({re.escape(location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})',
+            rf'(Figure\s+S?\d+[^\n]*{re.escape(location_clean.split()[-1] if location_clean.split() else location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                # For figures, include surrounding context as the data might be described nearby
+                start = max(0, match.start() - 1000)
+                end = min(match.end() + 2000, len(full_text))
+                return full_text[start:end]
+
+    elif location_type == 'section':
+        # Look for section heading
+        location_clean = location.strip()
+        patterns = [
+            # Section with number
+            rf'((?:^|\n)\d+\.?\s*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]).*\n){{0,500}})',
+            # Section without number
+            rf'((?:^|\n){re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]|\n[A-Z]{{2,}}).*\n){{0,500}})',
+            # More flexible section matching
+            rf'((?:^|\n)[^\n]*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+|\n[A-Z]{{2,}}).*\n){{0,500}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                return match.group(1)
+
+    elif location_type == 'text':
+        # Try to find the location as a page marker or general text
+        if location.isdigit():
+            # Page number - look for page markers
+            page_num = int(location)
+            # Look for page breaks or page numbers
+            patterns = [
+                rf'(?:^|\n)\s*-?\s*{page_num}\s*-?\s*\n((?:.*\n){{0,300}})',
+                rf'(?:page|p\.?)\s*{page_num}[^\n]*\n((?:.*\n){{0,300}})',
+                rf'\n{page_num}\n((?:.*\n){{0,300}})'
+            ]
+            for pattern in patterns:
+                match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
+                if match:
+                    start = match.start()
+                    end = min(start + 15000, len(full_text))
+                    return full_text[start:end]
+
+        # Fallback: try fuzzy search for the location string
+        location_words = location.split()
+        if len(location_words) >= 2:
+            # Try to find at least the first two words together
+            search_pattern = rf'{re.escape(location_words[0])}\s+{re.escape(location_words[1])}'
+            match = re.search(search_pattern, full_text, re.IGNORECASE)
+            if match:
+                start = max(0, match.start() - 500)
+                end = min(match.start() + 8000, len(full_text))
+                return full_text[start:end]
+
+    # Last resort: find any occurrence of the location string
+    idx = full_text.lower().find(location.lower())
+    if idx != -1:
+        start = max(0, idx - 500)
+        end = min(idx + 8000, len(full_text))
+        return full_text[start:end]
+
+    log.warning(f"Could not find location '{location}' of type '{location_type}' in text")
+    return None
+
+
 def get_lineage(
     caption_text: str,
     full_text: str,
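Usage of the new helper is straightforward; a sketch, assuming `full_text` already holds the flattened manuscript text:

```python
# Sketch: pull the combined context around every "Table S1" mention.
section = _extract_location_text(full_text, "Table S1", "table")
if section:
    print(section[:200])  # begins "=== All occurrences of Table S1 ==="
```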
@@ -1328,63 +1581,220 @@ def get_lineage(
         campaigns = [default_campaign]
         log.info(f"Created default campaign: {default_campaign.campaign_name}")

-    # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
-
     all_variants = []

     if campaigns:
-
-
-
-
+        log.info("Using campaign-aware location identification")
+
+        # Process each campaign separately
+        for campaign in campaigns:
+            log.info(f"\nProcessing campaign: {campaign.campaign_id} - {campaign.campaign_name}")
+
+            # Use identify_evolution_locations with campaign context
+            locations = identify_evolution_locations(
+                caption_text,
+                model,
+                max_results=5,
+                debug_dir=debug_dir,
+                campaigns=[campaign],  # Pass single campaign for focused search
+                pdf_paths=pdf_paths
+            )
+
+            if not locations:
+                log.warning(f"No locations found for campaign {campaign.campaign_id}, trying full text extraction")
+                # Fall back to full text extraction
+                campaign_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+                all_variants.extend(campaign_variants)
+                continue
+
+            log.info(f"Found {len(locations)} potential locations for campaign {campaign.campaign_id}")
+            for loc in locations:
+                log.info(f"  - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
+
+            # Try to extract from the best location
+            extracted_variants = []
+            for location in locations:
+                if extracted_variants:
+                    break  # Already got variants
+
+                location_str = location.get('location', '')
+                location_type = location.get('type', '')
+                confidence = location.get('confidence', 0)
+
+                # Try figure extraction for high-confidence figures
+                if location_type == 'figure' and confidence >= 70 and pdf_paths:
+                    log.info(f"Attempting to extract figure: {location_str}")
+
+                    figure_bytes = None
+                    for pdf_path in pdf_paths:
+                        figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+                        if figure_bytes:
+                            log.info(f"Successfully extracted figure from {pdf_path.name}")
+                            break
+
+                    if figure_bytes:
+                        # Save figure if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            debug_path.mkdir(parents=True, exist_ok=True)
+                            figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                            _dump(figure_bytes, figure_file)
+                            log.info(f"Saved figure to: {figure_file}")
+
+                        # Extract lineage from figure
+                        variants = extract_lineage_from_figure(
+                            figure_bytes, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from figure")
+                            extracted_variants = variants
+                            continue
+
+                # Try table/text extraction
+                if location_type in ['table', 'text', 'section'] and not extracted_variants:
+                    log.info(f"Attempting text extraction for {location_type}: {location_str}")
+
+                    # Extract the specific section/table from full text
+                    section_text = _extract_location_text(full_text, location_str, location_type)
+                    if section_text:
+                        log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
+                        # Save extracted section if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            section_file = debug_path / f"extracted_{location_type}_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
+                            _dump(f"=== EXTRACTED {location_type.upper()} ===\nLocation: {location_str}\nLength: {len(section_text)} chars\n{'='*80}\n\n{section_text}", section_file)
+
+                        variants = extract_complete_lineage(
+                            section_text, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign,
+                            pdf_paths=pdf_paths
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from {location_type}")
+                            extracted_variants = variants
+                    else:
+                        log.warning(f"Could not extract text from {location_type}: {location_str}")
+
+            # If no variants extracted from specific locations, try full text
+            if not extracted_variants:
+                log.warning(f"Could not extract from specific locations, trying full text for campaign {campaign.campaign_id}")
+                extracted_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+
+            all_variants.extend(extracted_variants)
+
+        return all_variants, campaigns
+
+    # Original fallback code for when no campaigns are identified
+    log.info("Processing campaigns with direct caption and TOC analysis (skipping global location finding)")
+
+    # Prepare all captions and TOC with context for campaign-specific selection
+    caption_entries = []
+
+    # Add table of contents entries if available
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            try:
+                import fitz  # PyMuPDF
+                doc = fitz.open(pdf_path)
+                toc_text = ""
+                for page_num in range(min(5, doc.page_count)):  # First 5 pages
+                    page = doc[page_num]  # Correct PyMuPDF syntax
+                    page_text = page.get_text()
+                    if any(keyword in page_text.lower() for keyword in ['contents', 'table of contents', 'overview']):
+                        toc_text += f"\n--- Page {page_num + 1} TOC ---\n{page_text}\n"
+                doc.close()
+                if toc_text:
+                    toc_sections.append(toc_text)
+            except Exception as e:
+                log.warning(f"Failed to extract TOC from {pdf_path}: {e}")
+
+        if toc_sections:
+            caption_entries.append({
+                'type': 'table_of_contents',
+                'location': 'Table of Contents',
+                'context': '\n'.join(toc_sections)[:1000] + "..."
+            })
+
+    # Parse figure and table captions from caption_text
+    # Split by common caption patterns
+    caption_patterns = [
+        r'(?:^|\n)(?:Figure|Fig\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Table|Tab\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Scheme|Sch\.?)\s*\d+[:\.]'
+    ]
+
+    import re
+    for pattern in caption_patterns:
+        matches = list(re.finditer(pattern, caption_text, re.MULTILINE | re.IGNORECASE))
+        for i, match in enumerate(matches):
+            start_pos = match.start()
+            # Find the end of this caption (start of next caption or end of text)
+            if i + 1 < len(matches):
+                end_pos = matches[i + 1].start()
+            else:
+                end_pos = min(start_pos + 2000, len(caption_text))  # Max 2000 chars per caption
+
+            caption_content = caption_text[start_pos:end_pos].strip()
+            if len(caption_content) > 20:  # Skip very short captions
+                # Extract context from full text around this caption
+                context_start = max(0, full_text.find(caption_content[:100]) - 500)
+                context_end = min(len(full_text), context_start + 2000)
+                context = full_text[context_start:context_end]
+
+                caption_entries.append({
+                    'type': 'figure' if 'fig' in pattern.lower() else 'table' if 'tab' in pattern.lower() else 'scheme',
+                    'location': caption_content.split('\n')[0][:100] + "..." if len(caption_content.split('\n')[0]) > 100 else caption_content.split('\n')[0],
+                    'context': context
+                })
+
+    log.info(f"Prepared {len(caption_entries)} caption/TOC entries for campaign-specific analysis")
+
+    # If no caption entries found, fall back to full text extraction
+    if not caption_entries:
+        log.info("No caption entries found, extracting from full text with campaign context")
         for campaign in campaigns:
             log.info(f"Processing campaign: {campaign.campaign_id}")
-            campaign_variants = extract_complete_lineage(
-                full_text, model,
-                debug_dir=debug_dir,
-
+            campaign_variants = extract_complete_lineage(
+                full_text, model,
+                debug_dir=debug_dir,
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
         return all_variants, campaigns
-
-    #
-    location_summary = []
-    for loc in locations[:5]:
-        if isinstance(loc, dict):
-            location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
-        else:
-            location_summary.append(str(loc))
-    log.info("Gemini identified %d potential lineage locations: %s",
-             len(locations), ", ".join(location_summary))
-
-    # Extract context around each location for better decision making
-    locations_with_context = []
-    for loc in locations:
-        location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-        # Extract 1000 chars of context around the location
-        context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
-        locations_with_context.append({
-            'location': loc,
-            'context': context_text  # Full extracted context
-        })
-
-    # For each campaign, ask Gemini to select the best location
+
+    # For each campaign, ask Gemini to select the best location from captions/TOC
     for campaign in campaigns:
         log.info(f"Processing campaign: {campaign.campaign_id}")

-        # Build locations context string
+        # Build locations context string from caption entries
         locations_str = ""
-        for i,
-
-
-
-            location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
-            confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
-            reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+        for i, entry in enumerate(caption_entries):
+            location_str = entry['location']
+            location_type = entry['type']
+            context = entry['context']

-            locations_str += f"\n{i+1}. {location_str} (Type: {location_type}
-            locations_str += f"   Reason: {reason}\n"
+            locations_str += f"\n{i+1}. {location_str} (Type: {location_type})\n"
             locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"

         # Ask Gemini to select best location for this campaign
@@ -1426,26 +1836,39 @@ def get_lineage(

             log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")

-            # Find the actual
-
-
-            if
-
+            # Find the actual caption entry
+            selected_entry = None
+            for entry in caption_entries:
+                if entry['location'] == selected_location:
+                    selected_entry = entry
                     break

-            if not
-                log.warning(f"Could not find selected location '{selected_location}' in
-                # Fall back to
-
-
-
+            if not selected_entry:
+                log.warning(f"Could not find selected location '{selected_location}' in caption entries")
+                # Fall back to first entry
+                selected_entry = caption_entries[0] if caption_entries else None
+
+            # Convert caption entry to location format for compatibility
+            if selected_entry:
+                primary_location = {
+                    'location': selected_entry['location'],
+                    'type': selected_entry['type'],
+                    'confidence': 0.8,  # Default confidence for caption-based selection
+                    'reason': f"Selected from {selected_entry['type']} captions"
+                }

         except Exception as e:
             log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
-            # Fall back to
-
-
-
+            # Fall back to first caption entry
+            if caption_entries:
+                primary_location = {
+                    'location': caption_entries[0]['location'],
+                    'type': caption_entries[0]['type'],
+                    'confidence': 0.5,  # Lower confidence for fallback
+                    'reason': f"Fallback to first {caption_entries[0]['type']} caption"
+                }
+            else:
+                primary_location = None

         if not primary_location:
             log.warning(f"No location found for campaign {campaign.campaign_id}")
@@ -1587,6 +2010,97 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None
         return []

 # --- 7.2 Page-based extraction helper ---------------------------------------
+def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
+    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        prompt: The prompt to send to Gemini
+        model: The Gemini model instance
+        context: Additional context for logging (e.g., "validation" or "extraction")
+
+    Returns:
+        The validated sequence or None if no consensus
+    """
+    sequences = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            response = model.generate_content(prompt)
+            result = _extract_text(response).strip()
+
+            # Parse the result to extract just the sequence
+            if result == "VALID":
+                sequences.append("VALID")
+            elif result == "UNCERTAIN":
+                sequences.append("UNCERTAIN")
+            elif result.startswith("M") and len(result) > 50:
+                # Clean the sequence
+                clean_seq = result.upper().replace(" ", "").replace("\n", "")
+                if all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in clean_seq):
+                    sequences.append(clean_seq)
+                else:
+                    sequences.append("INVALID")
+            else:
+                sequences.append("INVALID")
+
+            log.info(f"Gemini {context} attempt {attempt + 1}: {len(result) if result.startswith('M') else result}")
+
+        except Exception as e:
+            log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
+            sequences.append("ERROR")
+
+        # Check for early consensus after 2 attempts
+        if len(sequences) == 2:
+            # Clean sequences before comparison
+            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
+            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
+
+            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus reached after 2 attempts")
+                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
+            else:
+                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
+
+    # After all attempts, find consensus
+    valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
+
+    if not valid_sequences:
+        log.error(f"All {max_attempts} {context} attempts failed")
+        return None
+
+    # Find any matching pair
+    for i in range(len(sequences)):
+        for j in range(i + 1, len(sequences)):
+            # Clean sequences before comparison
+            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
+            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
+
+            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
+                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
+
+    # If no exact match, use adaptive validation
+    # Count occurrences of each valid sequence
+    sequence_counts = {}
+    for seq in valid_sequences:
+        if seq not in ["VALID", "UNCERTAIN"]:
+            # Clean sequence before counting
+            seq_clean = seq.replace(" ", "").replace("\n", "")
+            sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
+
+    # Return the most common sequence if it appears at least twice
+    if sequence_counts:
+        most_common = max(sequence_counts.items(), key=lambda x: x[1])
+        if most_common[1] >= 2:
+            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
+            return most_common[0]
+
+    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    return None
+
+
 def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
     """Validate and potentially correct a sequence using Gemini by checking against known mutations."""

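The consensus rule in the helper above, reduced to its core (toy replies, not real Gemini output):

```python
# Toy illustration of the adaptive consensus used above: clean each
# usable reply, then accept any answer seen at least twice.
attempts = ["MKTLLV", "MKT LLV\n", "INVALID"]  # invented replies
cleaned = [a.replace(" ", "").replace("\n", "")
           for a in attempts if a not in ("INVALID", "ERROR")]
counts = {}
for seq in cleaned:
    counts[seq] = counts.get(seq, 0) + 1
best = max(counts.items(), key=lambda kv: kv[1]) if counts else None
consensus = best[0] if best and best[1] >= 2 else None  # "MKTLLV"
```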
@@ -1616,7 +2130,7 @@ def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
     if not local_issues:
         return None  # No obvious issues found

-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
+    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")

     prompt = f"""
 You are validating a protein sequence that was extracted from a scientific paper.
@@ -1641,26 +2155,14 @@ Return ONLY the corrected sequence if changes are needed, or "VALID" if no chang

 If you cannot determine the correct sequence, return "UNCERTAIN".
 """

-
-
-
-
-
-
-
-
-            return None
-        elif result.startswith("M") and len(result) > 50:
-            # Gemini returned a corrected sequence
-            log.info(f"Gemini suggested sequence correction (length {len(result)})")
-            return result
-        else:
-            log.warning(f"Unexpected validation response: {result[:100]}...")
-            return None
-
-    except Exception as e:
-        log.warning(f"Failed to validate sequence: {e}")
-        return None
+    # Use triple validation
+    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
+
+    if result == "VALID" or result is None:
+        return None  # No changes needed
+    else:
+        log.info(f"Gemini suggested sequence correction (length {len(result)})")
+        return result


 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
@@ -1827,10 +2329,18 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 - Extract the variant_id exactly as written where the sequence appears
 - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations

+SEQUENCE EXTRACTION RULES:
+- Copy sequences EXACTLY as they appear in the text
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids
+- Preserve the exact length and character sequence
+- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
+- Double-check that consecutive identical amino acids are copied correctly
+
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq - amino-acid sequence (uppercase), or null
-  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
+  * aa_seq - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
+  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT

 Respond ONLY with **minified JSON** that matches the schema below.
 NO markdown, no code fences, no commentary.
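The formatting rule the prompt insists on, in miniature (toy sequence, not from any paper):

```python
# Strip whitespace only, never residues: the "AAA" run must survive.
raw = "MKT AAA\nLLV"   # toy sequence for illustration
clean = raw.replace(" ", "").replace("\n", "").upper()
assert clean == "MKTAAALLV"
```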
@@ -1846,8 +2356,258 @@ TEXT (may be truncated):
 ```
 """.strip()

-def
-    """
+def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
+    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        model: The Gemini model instance
+        prompt: The prompt to send to Gemini
+        schema_hint: The JSON schema hint
+        debug_dir: Optional debug directory
+
+    Returns:
+        The validated sequence JSON data or None if no consensus
+    """
+    responses = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
+            resp = model.generate_content(prompt)
+            raw = _extract_text(resp).strip()
+
+            # Save debug info
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+                response_file = debug_path / f"sequences_attempt_{attempt + 1}_{int(time.time())}.txt"
+                with open(response_file, 'w') as f:
+                    f.write(f"=== SEQUENCE EXTRACTION ATTEMPT {attempt + 1} ===\n")
+                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                    f.write(f"Length: {len(raw)} characters\n")
+                    f.write("="*80 + "\n\n")
+                    f.write(raw)
+
+            # Parse JSON response (similar to generate_json_with_retry logic)
+            fence_re = re.compile(r"```json|```", re.I)
+            if raw.startswith("```"):
+                raw = fence_re.sub("", raw).strip()
+
+            # Try to parse as JSON
+            try:
+                parsed = json.loads(raw)
+            except json.JSONDecodeError:
+                # Look for JSON array or object in the response
+                json_start = -1
+                json_end = -1
+                bracket_stack = []
+                in_string = False
+                escape_next = False
+
+                for i, char in enumerate(raw):
+                    if escape_next:
+                        escape_next = False
+                        continue
+
+                    if char == '\\':
+                        escape_next = True
+                        continue
+
+                    if char == '"' and not escape_next:
+                        in_string = not in_string
+                        continue
+
+                    if in_string:
+                        continue
+
+                    if char in '[{':
+                        if json_start == -1:
+                            json_start = i
+                        bracket_stack.append(char)
+                    elif char in ']}':
+                        if bracket_stack:
+                            opening = bracket_stack.pop()
+                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                                if not bracket_stack:  # Found complete JSON
+                                    json_end = i + 1
+                                    break
+
+                if json_start >= 0 and json_end > json_start:
+                    json_str = raw[json_start:json_end]
+                    parsed = json.loads(json_str)
+                else:
+                    if '[]' in raw:
+                        parsed = []
+                    else:
+                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
+
+            # Store both the original and normalized response
+            normalized_response = _normalize_sequence_response(parsed)
+            responses.append((parsed, normalized_response))
+
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+
+        except Exception as e:
+            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
+            responses.append(None)
+
+        # Check for early consensus after 2 attempts
+        if len(responses) == 2:
+            if (responses[0] and responses[1] and
+                _sequences_match(responses[0][1], responses[1][1])):
+                log.info("Sequence extraction consensus reached after 2 attempts")
+                return responses[0][0]  # Return original parsed data
+            else:
+                log.info("Sequence extraction mismatch after 2 attempts - trying third")
+
+    # After all attempts, use adaptive validation
+    valid_responses = [r for r in responses if r is not None]
+
+    if not valid_responses:
+        log.error(f"All {max_attempts} sequence extraction attempts failed")
+        return None
+
+    # First, try to find exact consensus (any matching pair)
+    for i in range(len(valid_responses)):
+        for j in range(i + 1, len(valid_responses)):
+            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
+                log.info(f"Sequence extraction consensus found: attempts with matching content")
+                return valid_responses[i][0]  # Return original parsed data
+
+    # If no exact consensus, use adaptive validation
+    log.info("No exact consensus found, applying adaptive validation...")
+
+    # Find sequences that appear consistently across multiple attempts
+    consistent_sequences = _find_consistent_sequences(valid_responses)
+
+    if consistent_sequences:
+        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
+        return consistent_sequences
+
+    # If still no consensus, use the attempt with the most sequences
+    best_response = max(valid_responses,
+                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
+
+    if best_response and len(best_response[1]) > 0:
+        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
+        return best_response[0]
+
+    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
+    return None
+
+
+def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
+    """Find sequences that appear consistently across multiple extraction attempts.
+
+    Args:
+        valid_responses: List of (original_data, normalized_data) tuples
+
+    Returns:
+        List of consistent sequences with confidence scores, or None if none found
+    """
+    if not valid_responses:
+        return None
+
+    # Count how many times each sequence appears
+    sequence_counts = {}
+    sequence_full_data = {}
+
+    for original, normalized in valid_responses:
+        if not isinstance(normalized, list):
+            continue
+
+        for seq in normalized:
+            variant_id = seq.get("variant_id", "")
+            aa_seq = seq.get("aa_seq", "")
+            # Clean sequence before using in key
+            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
+
+            # Create a unique key for this sequence
+            key = f"{variant_id}|{aa_seq_clean}"
+
+            if key not in sequence_counts:
+                sequence_counts[key] = 0
+                sequence_full_data[key] = []
+
+            sequence_counts[key] += 1
+
+            # Find the full data for this sequence from the original response
+            if isinstance(original, list):
+                for orig_seq in original:
+                    if (orig_seq.get("variant_id") == variant_id and
+                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
+                        sequence_full_data[key].append(orig_seq)
+                        break
+
+    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
+    min_appearances = max(2, len(valid_responses) // 2)
+    consistent_sequences = []
+
+    for key, count in sequence_counts.items():
+        if count >= min_appearances:
+            # Use the first occurrence of the full data
+            if sequence_full_data[key]:
+                seq_data = sequence_full_data[key][0].copy()
+                # Add confidence based on how many times it appeared
+                seq_data["confidence"] = count / len(valid_responses)
+                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
+                consistent_sequences.append(seq_data)
+
+    return consistent_sequences if consistent_sequences else None
+
+
+def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
+    """Normalize sequence response for comparison."""
+    if not isinstance(data, list):
+        return []
+
+    normalized = []
+    for item in data:
+        if isinstance(item, dict):
+            # Extract key fields for comparison
+            normalized_item = {
+                "variant_id": item.get("variant_id", ""),
+                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
+                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
+                "confidence": item.get("confidence", 0.0)
+            }
+            normalized.append(normalized_item)
+
+    # Sort by variant_id for consistent comparison
+    return sorted(normalized, key=lambda x: x["variant_id"])
+
+
+def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
+    """Check if two sequence response lists match on key fields."""
+    if len(seq1) != len(seq2):
+        return False
+
+    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
+        # Compare variant IDs
+        if s1.get("variant_id") != s2.get("variant_id"):
+            return False
+
+        # Compare amino acid sequences (most critical)
+        aa1 = s1.get("aa_seq", "")
+        aa2 = s2.get("aa_seq", "")
+        if aa1 and aa2 and aa1 != aa2:
+            return False
+        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
+            return False
+
+        # Compare DNA sequences if present
+        dna1 = s1.get("dna_seq", "")
+        dna2 = s2.get("dna_seq", "")
+        if dna1 and dna2 and dna1 != dna2:
+            return False
+
+    return True
+
+
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
+    """Prompt Gemini and convert its JSON reply into SequenceBlock objects with triple validation."""
     base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
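How the two comparison helpers interact, as a sketch (variant ID and sequences invented):

```python
# Sketch: whitespace and case differences are normalized away before
# two extraction attempts are compared (toy data).
a = _normalize_sequence_response([{"variant_id": "II", "aa_seq": "mkt llv"}])
b = _normalize_sequence_response([{"variant_id": "II", "aa_seq": "MKTLLV"}])
assert _sequences_match(a, b)
```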
@@ -1864,8 +2624,50 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt

-
-
+    # Add mutation validation context if we have lineage variants with mutations
+    if lineage_variants:
+        mutation_context = _build_mutation_validation_context(lineage_variants)
+        if mutation_context:
+            prompt = f"""{prompt}
+
+CRITICAL MUTATION VALIDATION:
+{mutation_context}
+
+IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
+For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
+"""
+
+    # Save the complete prompt for debugging
+    if debug_dir:
+        debug_path = Path(debug_dir)
+        debug_path.mkdir(parents=True, exist_ok=True)
+        prompt_file = debug_path / f"sequence_extraction_prompt_{int(time.time())}.txt"
+        with open(prompt_file, 'w') as f:
+            f.write(f"=== SEQUENCE EXTRACTION PROMPT ===\n")
+            f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"Text length: {len(text)} characters\n")
+            f.write(f"Truncated to: {len(text[:MAX_CHARS])} characters\n")
+            f.write(f"Total prompt length: {len(prompt)} characters\n")
+            f.write("="*80 + "\n\n")
+            f.write(prompt)
+        log.info(f"Saved sequence extraction prompt to {prompt_file}")
+
+    # Use triple validation for sequence extraction
+    log.info("Extracting sequences with triple validation to ensure accuracy")
+    data = _extract_sequences_with_triple_validation(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir)
+
+    if not data:
+        log.warning("Failed to get consistent sequence extraction after triple validation")
+        return []
+
+    extracted_sequences = _parse_sequences(data)
+
+    # Post-process: validate sequences against mutations if we have lineage info
+    if lineage_variants:
+        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
+        return validated_sequences
+
+    return extracted_sequences

 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
 _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
@@ -1916,6 +2718,167 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         )
     return blocks
 
+def _build_mutation_validation_context(lineage_variants: List[Variant]) -> str:
+    """Build mutation context for sequence validation."""
+    mutation_info = []
+
+    for variant in lineage_variants:
+        if variant.mutations and variant.parent_id:
+            mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+            mutation_info.append(f"Variant '{variant.variant_id}' (parent: '{variant.parent_id}') has mutations: {mutations_str}")
+
+    if not mutation_info:
+        return ""
+
+    context = "Known mutation relationships:\n" + "\n".join(mutation_info[:10])  # Limit to first 10 for context
+    if len(mutation_info) > 10:
+        context += f"\n... and {len(mutation_info) - 10} more variants with mutations"
+
+    return context
+
+def _validate_sequences_against_mutations(sequences: List[SequenceBlock], lineage_variants: List[Variant], model, debug_dir: str | Path | None = None) -> List[SequenceBlock]:
+    """Validate extracted sequences against known mutations and fix inconsistencies."""
+    # Create lookups for easier access
+    seq_lookup = {seq.variant_id: seq for seq in sequences}
+    variant_lookup = {var.variant_id: var for var in lineage_variants}
+
+    validation_issues = []
+    corrected_sequences = []
+
+    for seq in sequences:
+        variant = variant_lookup.get(seq.variant_id)
+        if not variant or not variant.parent_id or not variant.mutations or not seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        parent_seq = seq_lookup.get(variant.parent_id)
+        if not parent_seq or not parent_seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        # Check if mutations are consistent
+        issues = _check_mutation_consistency(seq.aa_seq, parent_seq.aa_seq, variant.mutations, seq.variant_id, variant.parent_id)
+
+        if issues:
+            validation_issues.extend(issues)
+            log.warning(f"Sequence validation issues for {seq.variant_id}: {'; '.join(issues)}")
+
+            # Try to get corrected sequence from Gemini
+            corrected_seq = _get_corrected_sequence_from_gemini(seq, parent_seq, variant, issues, model, debug_dir)
+            if corrected_seq:
+                corrected_sequences.append(corrected_seq)
+                log.info(f"Corrected sequence for {seq.variant_id} using Gemini validation")
+            else:
+                corrected_sequences.append(seq)  # Keep original if correction fails
+        else:
+            corrected_sequences.append(seq)
+
+    if validation_issues:
+        log.warning(f"Found {len(validation_issues)} sequence validation issues across {len([s for s in sequences if s.variant_id in [v.variant_id for v in lineage_variants if v.mutations]])} variants with mutations")
+
+    return corrected_sequences
+
+def _check_mutation_consistency(child_seq: str, parent_seq: str, mutations, child_id: str, parent_id: str) -> List[str]:
+    """Check if mutations are consistent between parent and child sequences."""
+    import re
+
+    issues = []
+
+    # Parse mutations (handle both string and list formats)
+    if isinstance(mutations, list):
+        mutation_strs = mutations
+    else:
+        mutation_strs = [m.strip() for m in str(mutations).split(',') if m.strip()]
+
+    for mut_str in mutation_strs:
+        # Parse mutation like "A100V"
+        match = re.match(r'^([A-Z])(\d+)([A-Z])$', mut_str.strip())
+        if not match:
+            continue  # Skip non-standard mutation formats
+
+        orig_aa, pos_str, new_aa = match.groups()
+        pos = int(pos_str) - 1  # Convert to 0-based indexing
+
+        # Check bounds
+        if pos >= len(parent_seq) or pos >= len(child_seq):
+            issues.append(f"Mutation {mut_str} position out of bounds")
+            continue
+
+        # Check parent sequence has expected original amino acid
+        if parent_seq[pos] != orig_aa:
+            issues.append(f"Mutation {mut_str}: parent {parent_id} has {parent_seq[pos]} at position {pos+1}, expected {orig_aa}")
+
+        # Check child sequence has expected new amino acid
+        if child_seq[pos] != new_aa:
+            issues.append(f"Mutation {mut_str}: child {child_id} has {child_seq[pos]} at position {pos+1}, expected {new_aa}")
+
+    return issues
+
+def _get_corrected_sequence_from_gemini(seq: SequenceBlock, parent_seq: SequenceBlock, variant: Variant, issues: List[str], model, debug_dir: str | Path | None = None) -> SequenceBlock | None:
+    """Use Gemini to get a corrected sequence based on mutation validation issues."""
+    if not model:
+        return None
+
+    mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+    issues_str = "; ".join(issues)
+
+    prompt = f"""You extracted a sequence for variant "{seq.variant_id}" but there are mutation validation issues:
+
+ISSUES: {issues_str}
+
+PARENT SEQUENCE ({variant.parent_id}):
+{parent_seq.aa_seq}
+
+EXTRACTED SEQUENCE ({seq.variant_id}):
+{seq.aa_seq}
+
+EXPECTED MUTATIONS: {mutations_str}
+
+Based on the parent sequence and the expected mutations, provide the CORRECT sequence for {seq.variant_id}.
+Apply each mutation to the parent sequence in order.
+
+For example, if parent has "A" at position 100 and mutation is "A100V", then child should have "V" at position 100.
+
+IMPORTANT SEQUENCE RULES:
+- Copy the sequence EXACTLY - do not add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Preserve the exact length of the sequence
+- Only change the specific positions indicated by the mutations
+- Double-check that consecutive identical amino acids are copied correctly
+
+Return ONLY the corrected amino acid sequence (no explanation, no formatting).
+If you cannot determine the correct sequence, return "UNCERTAIN".
+"""
+
+    try:
+        if debug_dir:
+            import time
+            timestamp = int(time.time())
+            prompt_file = Path(debug_dir) / f"sequence_validation_{seq.variant_id}_{timestamp}.txt"
+            _dump(prompt, prompt_file)
+
+        # Use triple validation for sequence correction
+        log.info(f"Correcting sequence for {seq.variant_id} with triple validation")
+        corrected_seq = _extract_plain_sequence_with_triple_validation(prompt, model, f"correction for {seq.variant_id}")
+
+        if debug_dir and corrected_seq:
+            response_file = Path(debug_dir) / f"sequence_validation_response_{seq.variant_id}_{timestamp}.txt"
+            _dump(corrected_seq, response_file)
+
+        if corrected_seq and corrected_seq not in ["UNCERTAIN", "VALID"] and _clean_seq(corrected_seq, _VALID_AA):
+            return SequenceBlock(
+                variant_id=seq.variant_id,
+                aa_seq=corrected_seq,
+                dna_seq=seq.dna_seq,
+                confidence=0.8,  # Lower confidence for corrected sequences
+                truncated=seq.truncated
+            )
+
+    except Exception as e:
+        log.warning(f"Failed to get corrected sequence for {seq.variant_id}: {e}")
+
+    return None
+
 # --- 7.5 Convenience wrapper -------------------------------------------------
 def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
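The mutation bookkeeping above follows the standard point-mutation notation: "A100V" asserts that the parent carries A at 1-based position 100 and the variant carries V there, which is exactly what _check_mutation_consistency verifies on both sequences, and what the Gemini correction prompt asks the model to apply. A minimal standalone sketch of that convention, useful for sanity-checking the validator by hand (the apply_mutation helper is illustrative only, not part of debase):

import re

def apply_mutation(parent: str, mut: str) -> str:
    """Apply a point mutation like 'A4V' to a parent sequence (1-based position)."""
    orig_aa, pos_str, new_aa = re.match(r'^([A-Z])(\d+)([A-Z])$', mut).groups()
    i = int(pos_str) - 1  # same 0-based conversion as _check_mutation_consistency
    assert parent[i] == orig_aa, f"parent has {parent[i]} at {i + 1}, expected {orig_aa}"
    return parent[:i] + new_aa + parent[i + 1:]

parent = "MKTAYIAKQR"
child = apply_mutation(parent, "A4V")  # -> "MKTVYIAKQR"
# _check_mutation_consistency(child, parent, ["A4V"], "child", "parent") -> [] (consistent)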
@@ -1973,6 +2936,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
 
         # Fallback to text search if page extraction didn't work
         if not focused_text:
+            log.info("Page extraction did not return text, falling back to text search")
             focused_text = _extract_text_at_locations(
                 text, [best_location],
                 context_chars=max(min_length, 30000),
@@ -1982,6 +2946,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         if focused_text and len(focused_text) < len(text):
             log.info("Reduced text from %d to %d chars using validated location",
                      len(text), len(focused_text))
+        else:
+            log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
+                        len(focused_text) if focused_text else 0, len(text))
         # Build lineage context if available
         lineage_context = None
         if lineage_variants:
@@ -1993,7 +2960,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                 variant_info.append(info)
             lineage_context = "\n".join(variant_info)
 
-        return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+        return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
     else:
         log.warning("Location validation failed or returned invalid location: %s",
                     validation.get("reason", "Unknown"))
@@ -2011,7 +2978,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             variant_info.append(info)
         lineage_context = "\n".join(variant_info)
 
-    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
 
 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2077,6 +3044,7 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}
 
+
 def extract_enzyme_info_with_gemini(
     text: str,
     variants: List[Variant],
@@ -2146,7 +3114,7 @@ If you cannot determine certain fields, set them to null.
         # Validate it looks like a protein sequence
         if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
             # Sanity check the sequence against known mutations
-            validated_seq = _validate_sequence_against_mutations(seq, variants,
+            validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
             if validated_seq:
                 seq = validated_seq
                 log.info(f"Sequence validated and potentially corrected by Gemini")
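The alphabet-and-length screen in that hunk mirrors the module's _VALID_AA set (the 20 canonical residues plus "*" for stop codons). A small illustration of what such a filter accepts and rejects; the looks_like_protein helper below is a hypothetical stand-in, not a debase function:

VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # assumption: same alphabet as _VALID_AA

def looks_like_protein(seq: str, min_len: int = 50) -> bool:
    return len(seq) > min_len and all(c in VALID_AA for c in seq)

print(looks_like_protein("MKTAYIAKQR" * 6))   # True:  60 residues, all canonical
print(looks_like_protein("MKTAYIAKQRX" * 6))  # False: 'X' is outside the alphabet
print(looks_like_protein("MKTAYIAKQR"))       # False: too short to be a full enzyme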
@@ -2714,7 +3682,7 @@ def run_pipeline(
 
     # 1. Prepare raw text ------------------------------------------------------
     # Always load both caption text (for identification) and full text (for extraction)
-    pdf_paths = [p for p in (
+    pdf_paths = [p for p in (manuscript, si_path) if p]
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)
 
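The rewritten comprehension is what lets run_pipeline cope with a missing SI document: falsy entries are dropped before the *pdf_paths unpacking, so the concat helpers only ever receive real paths. A quick sketch of the effect, with hypothetical inputs:

manuscript, si_path = "paper.pdf", None              # assumption: no SI supplied
pdf_paths = [p for p in (manuscript, si_path) if p]
assert pdf_paths == ["paper.pdf"]                    # None is filtered out before unpacking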