debase 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +977 -97
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
@@ -30,7 +30,7 @@ import time
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Union, Tuple, Dict, Any
 
 MODEL_NAME: str = "gemini-2.5-flash"
 MAX_CHARS: int = 150_000  # Max characters sent to LLM
@@ -727,17 +727,18 @@ Return a JSON object with:
 
 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
+{campaign_context}
 Given the following article text, list up to {max_results} *locations* (page
 numbers, figure/table IDs, or section headings) that you would review first to
 find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced).
+came from which parent and what mutations were introduced){campaign_specific}.
 
 Respond with a JSON array of objects, each containing:
 - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
-
+{campaign_field}
 IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
@@ -748,9 +749,9 @@ Don't include oligonucleotide results or result from only one round.
 
 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"}}
+  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
 ]
 """.strip()
 
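Note for reviewers of this hunk: the doubled braces in the example output are `str.format` escapes, so only the named placeholders (`{campaign_example}` and friends) are substituted when the prompt is built. A minimal, self-contained sketch of how one example line renders (the campaign id value is made up):

```python
# Doubled braces survive .format() as literal JSON braces; only the named
# placeholder is substituted. The campaign id here is hypothetical.
line = '{{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},'
print(line.format(campaign_example=', "campaign_id": "camp1"'))
# {"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table", "campaign_id": "camp1"},
```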
@@ -915,7 +916,39 @@ def identify_evolution_locations(
 
     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-
+
+    # Add campaign context if provided
+    campaign_context = ""
+    campaign_specific = ""
+    campaign_field = ""
+    campaign_example = ""
+
+    if campaigns and len(campaigns) == 1:
+        # Single campaign - make it specific
+        camp = campaigns[0]
+        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
+        if hasattr(camp, 'notes') and camp.notes:
+            campaign_context += f"- Key identifiers: {camp.notes}\n"
+        campaign_specific = f" for the '{camp.campaign_name}' campaign"
+        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
+        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
+    elif campaigns and len(campaigns) > 1:
+        # Multiple campaigns - list them all
+        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
+        for camp in campaigns:
+            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+        campaign_specific = " for any of the identified campaigns"
+        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
+        campaign_example = ', "campaign_id": "campaign_id_here"'
+
+    prompt = _LINEAGE_LOC_PROMPT.format(
+        campaign_context=campaign_context,
+        max_results=max_results,
+        campaign_specific=campaign_specific,
+        campaign_field=campaign_field,
+        campaign_example=campaign_example
+    ) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
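To see what the new placeholders expand to, here is a minimal sketch of the single-campaign branch in isolation; the `Campaign` dataclass below is a hypothetical stand-in with only the fields this code reads, not the package's actual class:

```python
from dataclasses import dataclass

@dataclass
class Campaign:  # hypothetical stand-in; the real class lives in enzyme_lineage_extractor.py
    campaign_id: str
    campaign_name: str
    description: str
    notes: str = ""

camp = Campaign("camp1", "lactamase evolution", "thermostability campaign", "variants labeled R1-R8")
campaign_context = (
    "\nYou are looking for lineage data for a SPECIFIC campaign:\n"
    f"- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
)
if camp.notes:
    campaign_context += f"- Key identifiers: {camp.notes}\n"
campaign_specific = f" for the '{camp.campaign_name}' campaign"
campaign_example = f', "campaign_id": "{camp.campaign_id}"'
print(campaign_context, campaign_specific, campaign_example, sep="\n")
```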
@@ -1378,6 +1411,138 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
 
 # ---- 6.4 Public API -------------------------------------------------------
 
+def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text."""
+    import re
+
+    if location_type == 'table':
+        # Find ALL mentions of this table and combine them
+        location_clean = location.strip()
+
+        # Different ways the table might be referenced
+        search_patterns = [
+            location_clean,  # Exact match
+            location_clean.replace("Supplementary ", "Supp. "),  # Common abbreviation
+            location_clean.replace("Supplementary ", "S"),  # E.g., "Table S3"
+            location_clean.replace("Supplementary Table ", "Table S"),  # Another common format
+        ]
+
+        # Collect all occurrences
+        all_occurrences = []
+        seen_positions = set()
+
+        for search_term in search_patterns:
+            pattern = re.compile(re.escape(search_term), re.IGNORECASE)
+            for match in pattern.finditer(full_text):
+                # Avoid duplicates from overlapping patterns
+                if match.start() in seen_positions:
+                    continue
+                seen_positions.add(match.start())
+
+                # Extract generous context around each mention
+                start = max(0, match.start() - 1000)
+                end = min(len(full_text), match.end() + 10000)
+                context = full_text[start:end]
+
+                all_occurrences.append({
+                    'position': match.start(),
+                    'context': context,
+                    'match': match.group()
+                })
+
+        if not all_occurrences:
+            log.warning(f"No occurrences of table '{location_clean}' found in text")
+            return None
+
+        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
+
+        # Combine all occurrences into one text for Gemini to analyze
+        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
+
+        for i, occurrence in enumerate(all_occurrences, 1):
+            combined_text += f"--- Occurrence {i} at position {occurrence['position']} ---\n"
+            combined_text += occurrence['context']
+            combined_text += "\n\n"
+
+        # Limit total length to avoid overwhelming the model
+        if len(combined_text) > 50000:
+            combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+
+        return combined_text
+
+    elif location_type == 'figure':
+        # For figures, we mainly want the caption and any text description
+        location_clean = location.strip()
+        patterns = [
+            rf'({re.escape(location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})',
+            rf'(Figure\s+S?\d+[^\n]*{re.escape(location_clean.split()[-1] if location_clean.split() else location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                # For figures, include surrounding context as the data might be described nearby
+                start = max(0, match.start() - 1000)
+                end = min(match.end() + 2000, len(full_text))
+                return full_text[start:end]
+
+    elif location_type == 'section':
+        # Look for section heading
+        location_clean = location.strip()
+        patterns = [
+            # Section with number
+            rf'((?:^|\n)\d+\.?\s*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]).*\n){{0,500}})',
+            # Section without number
+            rf'((?:^|\n){re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]|\n[A-Z]{{2,}}).*\n){{0,500}})',
+            # More flexible section matching
+            rf'((?:^|\n)[^\n]*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+|\n[A-Z]{{2,}}).*\n){{0,500}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                return match.group(1)
+
+    elif location_type == 'text':
+        # Try to find the location as a page marker or general text
+        if location.isdigit():
+            # Page number - look for page markers
+            page_num = int(location)
+            # Look for page breaks or page numbers
+            patterns = [
+                rf'(?:^|\n)\s*-?\s*{page_num}\s*-?\s*\n((?:.*\n){{0,300}})',
+                rf'(?:page|p\.?)\s*{page_num}[^\n]*\n((?:.*\n){{0,300}})',
+                rf'\n{page_num}\n((?:.*\n){{0,300}})'
+            ]
+            for pattern in patterns:
+                match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
+                if match:
+                    start = match.start()
+                    end = min(start + 15000, len(full_text))
+                    return full_text[start:end]
+
+    # Fallback: try fuzzy search for the location string
+    location_words = location.split()
+    if len(location_words) >= 2:
+        # Try to find at least the first two words together
+        search_pattern = rf'{re.escape(location_words[0])}\s+{re.escape(location_words[1])}'
+        match = re.search(search_pattern, full_text, re.IGNORECASE)
+        if match:
+            start = max(0, match.start() - 500)
+            end = min(match.start() + 8000, len(full_text))
+            return full_text[start:end]
+
+    # Last resort: find any occurrence of the location string
+    idx = full_text.lower().find(location.lower())
+    if idx != -1:
+        start = max(0, idx - 500)
+        end = min(idx + 8000, len(full_text))
+        return full_text[start:end]
+
+    log.warning(f"Could not find location '{location}' of type '{location_type}' in text")
+    return None
+
+
 def get_lineage(
     caption_text: str,
     full_text: str,
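The core of the new `'table'` branch is collect-and-concatenate rather than first-match: every mention of the table is kept with generous context and handed to the model as one capped blob. A toy illustration of that gathering step (tiny context windows keep the demo readable; the real code uses -1000/+10000 characters):

```python
import re

full_text = (
    "Methods...\nTable S1 lists all variants.\n...many pages...\n"
    "As shown in Table S1, variant B was derived from variant A.\n"
)
pattern = re.compile(re.escape("Table S1"), re.IGNORECASE)
chunks = []
for m in pattern.finditer(full_text):
    # Python slices clamp automatically, so the window never overruns the text.
    chunks.append(full_text[max(0, m.start() - 15):m.end() + 40])
combined = "=== All occurrences of Table S1 ===\n\n" + "\n\n".join(chunks)
print(combined)  # later capped at 50,000 chars before being sent to the model
```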
@@ -1416,63 +1581,220 @@ def get_lineage(
         campaigns = [default_campaign]
         log.info(f"Created default campaign: {default_campaign.campaign_name}")
 
-    # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
-
     all_variants = []
 
     if campaigns:
-
-
-
-
+        log.info("Using campaign-aware location identification")
+
+        # Process each campaign separately
+        for campaign in campaigns:
+            log.info(f"\nProcessing campaign: {campaign.campaign_id} - {campaign.campaign_name}")
+
+            # Use identify_evolution_locations with campaign context
+            locations = identify_evolution_locations(
+                caption_text,
+                model,
+                max_results=5,
+                debug_dir=debug_dir,
+                campaigns=[campaign],  # Pass single campaign for focused search
+                pdf_paths=pdf_paths
+            )
+
+            if not locations:
+                log.warning(f"No locations found for campaign {campaign.campaign_id}, trying full text extraction")
+                # Fall back to full text extraction
+                campaign_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+                all_variants.extend(campaign_variants)
+                continue
+
+            log.info(f"Found {len(locations)} potential locations for campaign {campaign.campaign_id}")
+            for loc in locations:
+                log.info(f"  - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
+
+            # Try to extract from the best location
+            extracted_variants = []
+            for location in locations:
+                if extracted_variants:
+                    break  # Already got variants
+
+                location_str = location.get('location', '')
+                location_type = location.get('type', '')
+                confidence = location.get('confidence', 0)
+
+                # Try figure extraction for high-confidence figures
+                if location_type == 'figure' and confidence >= 70 and pdf_paths:
+                    log.info(f"Attempting to extract figure: {location_str}")
+
+                    figure_bytes = None
+                    for pdf_path in pdf_paths:
+                        figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+                        if figure_bytes:
+                            log.info(f"Successfully extracted figure from {pdf_path.name}")
+                            break
+
+                    if figure_bytes:
+                        # Save figure if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            debug_path.mkdir(parents=True, exist_ok=True)
+                            figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                            _dump(figure_bytes, figure_file)
+                            log.info(f"Saved figure to: {figure_file}")
+
+                        # Extract lineage from figure
+                        variants = extract_lineage_from_figure(
+                            figure_bytes, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from figure")
+                            extracted_variants = variants
+                            continue
+
+                # Try table/text extraction
+                if location_type in ['table', 'text', 'section'] and not extracted_variants:
+                    log.info(f"Attempting text extraction for {location_type}: {location_str}")
+
+                    # Extract the specific section/table from full text
+                    section_text = _extract_location_text(full_text, location_str, location_type)
+                    if section_text:
+                        log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
+                        # Save extracted section if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            section_file = debug_path / f"extracted_{location_type}_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
+                            _dump(f"=== EXTRACTED {location_type.upper()} ===\nLocation: {location_str}\nLength: {len(section_text)} chars\n{'='*80}\n\n{section_text}", section_file)
+
+                        variants = extract_complete_lineage(
+                            section_text, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign,
+                            pdf_paths=pdf_paths
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from {location_type}")
+                            extracted_variants = variants
+                    else:
+                        log.warning(f"Could not extract text from {location_type}: {location_str}")
+
+            # If no variants extracted from specific locations, try full text
+            if not extracted_variants:
+                log.warning(f"Could not extract from specific locations, trying full text for campaign {campaign.campaign_id}")
+                extracted_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+
+            all_variants.extend(extracted_variants)
+
+        return all_variants, campaigns
+
+    # Original fallback code for when no campaigns are identified
+    log.info("Processing campaigns with direct caption and TOC analysis (skipping global location finding)")
+
+    # Prepare all captions and TOC with context for campaign-specific selection
+    caption_entries = []
+
+    # Add table of contents entries if available
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            try:
+                import fitz  # PyMuPDF
+                doc = fitz.open(pdf_path)
+                toc_text = ""
+                for page_num in range(min(5, doc.page_count)):  # First 5 pages
+                    page = doc[page_num]  # Correct PyMuPDF syntax
+                    page_text = page.get_text()
+                    if any(keyword in page_text.lower() for keyword in ['contents', 'table of contents', 'overview']):
+                        toc_text += f"\n--- Page {page_num + 1} TOC ---\n{page_text}\n"
+                doc.close()
+                if toc_text:
+                    toc_sections.append(toc_text)
+            except Exception as e:
+                log.warning(f"Failed to extract TOC from {pdf_path}: {e}")
+
+        if toc_sections:
+            caption_entries.append({
+                'type': 'table_of_contents',
+                'location': 'Table of Contents',
+                'context': '\n'.join(toc_sections)[:1000] + "..."
+            })
+
+    # Parse figure and table captions from caption_text
+    # Split by common caption patterns
+    caption_patterns = [
+        r'(?:^|\n)(?:Figure|Fig\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Table|Tab\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Scheme|Sch\.?)\s*\d+[:\.]'
+    ]
+
+    import re
+    for pattern in caption_patterns:
+        matches = list(re.finditer(pattern, caption_text, re.MULTILINE | re.IGNORECASE))
+        for i, match in enumerate(matches):
+            start_pos = match.start()
+            # Find the end of this caption (start of next caption or end of text)
+            if i + 1 < len(matches):
+                end_pos = matches[i + 1].start()
+            else:
+                end_pos = min(start_pos + 2000, len(caption_text))  # Max 2000 chars per caption
+
+            caption_content = caption_text[start_pos:end_pos].strip()
+            if len(caption_content) > 20:  # Skip very short captions
+                # Extract context from full text around this caption
+                context_start = max(0, full_text.find(caption_content[:100]) - 500)
+                context_end = min(len(full_text), context_start + 2000)
+                context = full_text[context_start:context_end]
+
+                caption_entries.append({
+                    'type': 'figure' if 'fig' in pattern.lower() else 'table' if 'tab' in pattern.lower() else 'scheme',
+                    'location': caption_content.split('\n')[0][:100] + "..." if len(caption_content.split('\n')[0]) > 100 else caption_content.split('\n')[0],
+                    'context': context
+                })
+
+    log.info(f"Prepared {len(caption_entries)} caption/TOC entries for campaign-specific analysis")
+
+    # If no caption entries found, fall back to full text extraction
+    if not caption_entries:
+        log.info("No caption entries found, extracting from full text with campaign context")
         for campaign in campaigns:
             log.info(f"Processing campaign: {campaign.campaign_id}")
-            campaign_variants =
-                full_text, model,
-                debug_dir=debug_dir,
-
+            campaign_variants = extract_complete_lineage(
+                full_text, model,
+                debug_dir=debug_dir,
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
         return all_variants, campaigns
-
-    #
-    location_summary = []
-    for loc in locations[:5]:
-        if isinstance(loc, dict):
-            location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
-        else:
-            location_summary.append(str(loc))
-    log.info("Gemini identified %d potential lineage locations: %s",
-             len(locations), ", ".join(location_summary))
-
-    # Extract context around each location for better decision making
-    locations_with_context = []
-    for loc in locations:
-        location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-        # Extract 1000 chars of context around the location
-        context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
-        locations_with_context.append({
-            'location': loc,
-            'context': context_text  # Full extracted context
-        })
-
-    # For each campaign, ask Gemini to select the best location
+
+    # For each campaign, ask Gemini to select the best location from captions/TOC
     for campaign in campaigns:
         log.info(f"Processing campaign: {campaign.campaign_id}")
 
-        # Build locations context string
+        # Build locations context string from caption entries
        locations_str = ""
-        for i,
-
-
-
-            location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
-            confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
-            reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+        for i, entry in enumerate(caption_entries):
+            location_str = entry['location']
+            location_type = entry['type']
+            context = entry['context']
 
-            locations_str += f"\n{i+1}. {location_str} (Type: {location_type}
-            locations_str += f"   Reason: {reason}\n"
+            locations_str += f"\n{i+1}. {location_str} (Type: {location_type})\n"
            locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"
 
        # Ask Gemini to select best location for this campaign
@@ -1514,26 +1836,39 @@ def get_lineage(
 
             log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")
 
-            # Find the actual
-
-
-            if
-
+            # Find the actual caption entry
+            selected_entry = None
+            for entry in caption_entries:
+                if entry['location'] == selected_location:
+                    selected_entry = entry
                     break
 
-            if not
-                log.warning(f"Could not find selected location '{selected_location}' in
-                # Fall back to
-
-
-
+            if not selected_entry:
+                log.warning(f"Could not find selected location '{selected_location}' in caption entries")
+                # Fall back to first entry
+                selected_entry = caption_entries[0] if caption_entries else None
+
+            # Convert caption entry to location format for compatibility
+            if selected_entry:
+                primary_location = {
+                    'location': selected_entry['location'],
+                    'type': selected_entry['type'],
+                    'confidence': 0.8,  # Default confidence for caption-based selection
+                    'reason': f"Selected from {selected_entry['type']} captions"
+                }
 
         except Exception as e:
             log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
-            # Fall back to
-
-
-
+            # Fall back to first caption entry
+            if caption_entries:
+                primary_location = {
+                    'location': caption_entries[0]['location'],
+                    'type': caption_entries[0]['type'],
+                    'confidence': 0.5,  # Lower confidence for fallback
+                    'reason': f"Fallback to first {caption_entries[0]['type']} caption"
+                }
+            else:
+                primary_location = None
 
         if not primary_location:
             log.warning(f"No location found for campaign {campaign.campaign_id}")
@@ -1675,6 +2010,97 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
     return []
 
 # --- 7.2 Page-based extraction helper ---------------------------------------
+def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
+    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        prompt: The prompt to send to Gemini
+        model: The Gemini model instance
+        context: Additional context for logging (e.g., "validation" or "extraction")
+
+    Returns:
+        The validated sequence or None if no consensus
+    """
+    sequences = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            response = model.generate_content(prompt)
+            result = _extract_text(response).strip()
+
+            # Parse the result to extract just the sequence
+            if result == "VALID":
+                sequences.append("VALID")
+            elif result == "UNCERTAIN":
+                sequences.append("UNCERTAIN")
+            elif result.startswith("M") and len(result) > 50:
+                # Clean the sequence
+                clean_seq = result.upper().replace(" ", "").replace("\n", "")
+                if all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in clean_seq):
+                    sequences.append(clean_seq)
+                else:
+                    sequences.append("INVALID")
+            else:
+                sequences.append("INVALID")
+
+            log.info(f"Gemini {context} attempt {attempt + 1}: {len(result) if result.startswith('M') else result}")
+
+        except Exception as e:
+            log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
+            sequences.append("ERROR")
+
+        # Check for early consensus after 2 attempts
+        if len(sequences) == 2:
+            # Clean sequences before comparison
+            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
+            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
+
+            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus reached after 2 attempts")
+                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
+            else:
+                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
+
+    # After all attempts, find consensus
+    valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
+
+    if not valid_sequences:
+        log.error(f"All {max_attempts} {context} attempts failed")
+        return None
+
+    # Find any matching pair
+    for i in range(len(sequences)):
+        for j in range(i + 1, len(sequences)):
+            # Clean sequences before comparison
+            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
+            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
+
+            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
+                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
+
+    # If no exact match, use adaptive validation
+    # Count occurrences of each valid sequence
+    sequence_counts = {}
+    for seq in valid_sequences:
+        if seq not in ["VALID", "UNCERTAIN"]:
+            # Clean sequence before counting
+            seq_clean = seq.replace(" ", "").replace("\n", "")
+            sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
+
+    # Return the most common sequence if it appears at least twice
+    if sequence_counts:
+        most_common = max(sequence_counts.items(), key=lambda x: x[1])
+        if most_common[1] >= 2:
+            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
+            return most_common[0]
+
+    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    return None
+
+
 def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
     """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
 
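The consensus rule is easier to audit in isolation: two matching usable attempts end the loop early, and otherwise any answer seen at least twice wins. A self-contained sketch of that same logic:

```python
def consensus(attempts: list[str]) -> str | None:
    sentinels = ("INVALID", "ERROR", "VALID", "UNCERTAIN")
    # Early exit: first two attempts agree and are usable.
    if len(attempts) >= 2 and attempts[0] == attempts[1] and attempts[0] not in ("INVALID", "ERROR"):
        return attempts[0] if attempts[0] not in sentinels else None
    # Otherwise vote: the most frequent real sequence wins if seen twice.
    counts: dict[str, int] = {}
    for a in attempts:
        if a not in sentinels:
            counts[a] = counts.get(a, 0) + 1
    best = max(counts.items(), key=lambda kv: kv[1], default=("", 0))
    return best[0] if best[1] >= 2 else None

print(consensus(["MKTLLV", "MKTLLV"]))                      # early consensus
print(consensus(["MKTLLX", "ERROR", "MKTLLV", "MKTLLV"]))   # adaptive majority: MKTLLV
```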
@@ -1704,7 +2130,7 @@ def _validate_sequence_against_mutations(sequence: str, variants: List[Variant],
     if not local_issues:
         return None  # No obvious issues found
 
-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
+    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
 
     prompt = f"""
 You are validating a protein sequence that was extracted from a scientific paper.
@@ -1729,26 +2155,14 @@ Return ONLY the corrected sequence if changes are needed, or "VALID" if no chang
 If you cannot determine the correct sequence, return "UNCERTAIN".
 """
 
-
-
-
-
-
-
-
-
-            return None
-        elif result.startswith("M") and len(result) > 50:
-            # Gemini returned a corrected sequence
-            log.info(f"Gemini suggested sequence correction (length {len(result)})")
-            return result
-        else:
-            log.warning(f"Unexpected validation response: {result[:100]}...")
-            return None
-
-    except Exception as e:
-        log.warning(f"Failed to validate sequence: {e}")
-        return None
+    # Use triple validation
+    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
+
+    if result == "VALID" or result is None:
+        return None  # No changes needed
+    else:
+        log.info(f"Gemini suggested sequence correction (length {len(result)})")
+        return result
 
 
 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
@@ -1915,10 +2329,18 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 - Extract the variant_id exactly as written where the sequence appears
 - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
 
+SEQUENCE EXTRACTION RULES:
+- Copy sequences EXACTLY as they appear in the text
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids
+- Preserve the exact length and character sequence
+- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
+- Double-check that consecutive identical amino acids are copied correctly
+
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq - amino-acid sequence (uppercase), or null
-  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
+  * aa_seq - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
+  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
 
 Respond ONLY with **minified JSON** that matches the schema below.
 NO markdown, no code fences, no commentary.
@@ -1934,8 +2356,258 @@ TEXT (may be truncated):
 ```
 """.strip()
 
-def
-"""
+def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
+    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        model: The Gemini model instance
+        prompt: The prompt to send to Gemini
+        schema_hint: The JSON schema hint
+        debug_dir: Optional debug directory
+
+    Returns:
+        The validated sequence JSON data or None if no consensus
+    """
+    responses = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
+            resp = model.generate_content(prompt)
+            raw = _extract_text(resp).strip()
+
+            # Save debug info
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+                response_file = debug_path / f"sequences_attempt_{attempt + 1}_{int(time.time())}.txt"
+                with open(response_file, 'w') as f:
+                    f.write(f"=== SEQUENCE EXTRACTION ATTEMPT {attempt + 1} ===\n")
+                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                    f.write(f"Length: {len(raw)} characters\n")
+                    f.write("="*80 + "\n\n")
+                    f.write(raw)
+
+            # Parse JSON response (similar to generate_json_with_retry logic)
+            fence_re = re.compile(r"```json|```", re.I)
+            if raw.startswith("```"):
+                raw = fence_re.sub("", raw).strip()
+
+            # Try to parse as JSON
+            try:
+                parsed = json.loads(raw)
+            except json.JSONDecodeError:
+                # Look for JSON array or object in the response
+                json_start = -1
+                json_end = -1
+                bracket_stack = []
+                in_string = False
+                escape_next = False
+
+                for i, char in enumerate(raw):
+                    if escape_next:
+                        escape_next = False
+                        continue
+
+                    if char == '\\':
+                        escape_next = True
+                        continue
+
+                    if char == '"' and not escape_next:
+                        in_string = not in_string
+                        continue
+
+                    if in_string:
+                        continue
+
+                    if char in '[{':
+                        if json_start == -1:
+                            json_start = i
+                        bracket_stack.append(char)
+                    elif char in ']}':
+                        if bracket_stack:
+                            opening = bracket_stack.pop()
+                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                                if not bracket_stack:  # Found complete JSON
+                                    json_end = i + 1
+                                    break
+
+                if json_start >= 0 and json_end > json_start:
+                    json_str = raw[json_start:json_end]
+                    parsed = json.loads(json_str)
+                else:
+                    if '[]' in raw:
+                        parsed = []
+                    else:
+                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
+
+            # Store both the original and normalized response
+            normalized_response = _normalize_sequence_response(parsed)
+            responses.append((parsed, normalized_response))
+
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+
+        except Exception as e:
+            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
+            responses.append(None)
+
+        # Check for early consensus after 2 attempts
+        if len(responses) == 2:
+            if (responses[0] and responses[1] and
+                    _sequences_match(responses[0][1], responses[1][1])):
+                log.info("Sequence extraction consensus reached after 2 attempts")
+                return responses[0][0]  # Return original parsed data
+            else:
+                log.info("Sequence extraction mismatch after 2 attempts - trying third")
+
+    # After all attempts, use adaptive validation
+    valid_responses = [r for r in responses if r is not None]
+
+    if not valid_responses:
+        log.error(f"All {max_attempts} sequence extraction attempts failed")
+        return None
+
+    # First, try to find exact consensus (any matching pair)
+    for i in range(len(valid_responses)):
+        for j in range(i + 1, len(valid_responses)):
+            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
+                log.info(f"Sequence extraction consensus found: attempts with matching content")
+                return valid_responses[i][0]  # Return original parsed data
+
+    # If no exact consensus, use adaptive validation
+    log.info("No exact consensus found, applying adaptive validation...")
+
+    # Find sequences that appear consistently across multiple attempts
+    consistent_sequences = _find_consistent_sequences(valid_responses)
+
+    if consistent_sequences:
+        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
+        return consistent_sequences
+
+    # If still no consensus, use the attempt with the most sequences
+    best_response = max(valid_responses,
+                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
+
+    if best_response and len(best_response[1]) > 0:
+        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
+        return best_response[0]
+
+    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
+    return None
+
+
+def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
+    """Find sequences that appear consistently across multiple extraction attempts.
+
+    Args:
+        valid_responses: List of (original_data, normalized_data) tuples
+
+    Returns:
+        List of consistent sequences with confidence scores, or None if none found
+    """
+    if not valid_responses:
+        return None
+
+    # Count how many times each sequence appears
+    sequence_counts = {}
+    sequence_full_data = {}
+
+    for original, normalized in valid_responses:
+        if not isinstance(normalized, list):
+            continue
+
+        for seq in normalized:
+            variant_id = seq.get("variant_id", "")
+            aa_seq = seq.get("aa_seq", "")
+            # Clean sequence before using in key
+            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
+
+            # Create a unique key for this sequence
+            key = f"{variant_id}|{aa_seq_clean}"
+
+            if key not in sequence_counts:
+                sequence_counts[key] = 0
+                sequence_full_data[key] = []
+
+            sequence_counts[key] += 1
+
+            # Find the full data for this sequence from the original response
+            if isinstance(original, list):
+                for orig_seq in original:
+                    if (orig_seq.get("variant_id") == variant_id and
+                            orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
+                        sequence_full_data[key].append(orig_seq)
+                        break
+
+    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
+    min_appearances = max(2, len(valid_responses) // 2)
+    consistent_sequences = []
+
+    for key, count in sequence_counts.items():
+        if count >= min_appearances:
+            # Use the first occurrence of the full data
+            if sequence_full_data[key]:
+                seq_data = sequence_full_data[key][0].copy()
+                # Add confidence based on how many times it appeared
+                seq_data["confidence"] = count / len(valid_responses)
+                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
+                consistent_sequences.append(seq_data)
+
+    return consistent_sequences if consistent_sequences else None
+
+
+def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
+    """Normalize sequence response for comparison."""
+    if not isinstance(data, list):
+        return []
+
+    normalized = []
+    for item in data:
+        if isinstance(item, dict):
+            # Extract key fields for comparison
+            normalized_item = {
+                "variant_id": item.get("variant_id", ""),
+                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
+                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
+                "confidence": item.get("confidence", 0.0)
+            }
+            normalized.append(normalized_item)
+
+    # Sort by variant_id for consistent comparison
+    return sorted(normalized, key=lambda x: x["variant_id"])
+
+
+def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
+    """Check if two sequence response lists match on key fields."""
+    if len(seq1) != len(seq2):
+        return False
+
+    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
+        # Compare variant IDs
+        if s1.get("variant_id") != s2.get("variant_id"):
+            return False
+
+        # Compare amino acid sequences (most critical)
+        aa1 = s1.get("aa_seq", "")
+        aa2 = s2.get("aa_seq", "")
+        if aa1 and aa2 and aa1 != aa2:
+            return False
+        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
+            return False
+
+        # Compare DNA sequences if present
+        dna1 = s1.get("dna_seq", "")
+        dna2 = s2.get("dna_seq", "")
+        if dna1 and dna2 and dna1 != dna2:
+            return False
+
+    return True
+
+
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
+    """Prompt Gemini and convert its JSON reply into SequenceBlock objects with triple validation."""
     base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
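Whether two attempts "match" is decided on normalized copies: whitespace stripped, case folded, entries sorted by `variant_id`, and only then compared field-by-field. A toy run of that normalization idea:

```python
def normalize(items: list[dict]) -> list[dict]:
    out = []
    for it in items:
        out.append({
            "variant_id": it.get("variant_id", ""),
            # Strip formatting and fold case so cosmetic differences don't count.
            "aa_seq": (it.get("aa_seq") or "").replace(" ", "").replace("\n", "").upper(),
        })
    return sorted(out, key=lambda x: x["variant_id"])

a = [{"variant_id": "II", "aa_seq": "mkt llv"}, {"variant_id": "I", "aa_seq": "MKTLLA"}]
b = [{"variant_id": "I", "aa_seq": "MKTLLA"}, {"variant_id": "II", "aa_seq": "MKTLLV"}]
print(normalize(a) == normalize(b))  # True: ordering and spacing differences vanish
```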
@@ -1952,8 +2624,50 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt
 
-
-
+    # Add mutation validation context if we have lineage variants with mutations
+    if lineage_variants:
+        mutation_context = _build_mutation_validation_context(lineage_variants)
+        if mutation_context:
+            prompt = f"""{prompt}
+
+CRITICAL MUTATION VALIDATION:
+{mutation_context}
+
+IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
+For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
+"""
+
+    # Save the complete prompt for debugging
+    if debug_dir:
+        debug_path = Path(debug_dir)
+        debug_path.mkdir(parents=True, exist_ok=True)
+        prompt_file = debug_path / f"sequence_extraction_prompt_{int(time.time())}.txt"
+        with open(prompt_file, 'w') as f:
+            f.write(f"=== SEQUENCE EXTRACTION PROMPT ===\n")
+            f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"Text length: {len(text)} characters\n")
+            f.write(f"Truncated to: {len(text[:MAX_CHARS])} characters\n")
+            f.write(f"Total prompt length: {len(prompt)} characters\n")
+            f.write("="*80 + "\n\n")
+            f.write(prompt)
+        log.info(f"Saved sequence extraction prompt to {prompt_file}")
+
+    # Use triple validation for sequence extraction
+    log.info("Extracting sequences with triple validation to ensure accuracy")
+    data = _extract_sequences_with_triple_validation(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir)
+
+    if not data:
+        log.warning("Failed to get consistent sequence extraction after triple validation")
+        return []
+
+    extracted_sequences = _parse_sequences(data)
+
+    # Post-process: validate sequences against mutations if we have lineage info
+    if lineage_variants:
+        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
+        return validated_sequences
+
+    return extracted_sequences
 
 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
 _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
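The mutation context appended to the prompt above is just a short listing of parent/child mutation claims. A hedged sketch of that builder on a stand-in `Variant` (only the fields the builder reads; the real dataclass lives in the package):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Variant:  # hypothetical stand-in for the package's Variant dataclass
    variant_id: str
    parent_id: str = ""
    mutations: List[str] = field(default_factory=list)

variants = [Variant("III", parent_id="II", mutations=["A100V", "S45T"])]
lines = [
    f"Variant '{v.variant_id}' (parent: '{v.parent_id}') has mutations: {'; '.join(v.mutations)}"
    for v in variants if v.mutations and v.parent_id
]
print("Known mutation relationships:\n" + "\n".join(lines))
```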
@@ -2004,6 +2718,167 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         )
     return blocks
 
+def _build_mutation_validation_context(lineage_variants: List[Variant]) -> str:
+    """Build mutation context for sequence validation."""
+    mutation_info = []
+
+    for variant in lineage_variants:
+        if variant.mutations and variant.parent_id:
+            mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+            mutation_info.append(f"Variant '{variant.variant_id}' (parent: '{variant.parent_id}') has mutations: {mutations_str}")
+
+    if not mutation_info:
+        return ""
+
+    context = "Known mutation relationships:\n" + "\n".join(mutation_info[:10])  # Limit to first 10 for context
+    if len(mutation_info) > 10:
+        context += f"\n... and {len(mutation_info) - 10} more variants with mutations"
+
+    return context
+
+def _validate_sequences_against_mutations(sequences: List[SequenceBlock], lineage_variants: List[Variant], model, debug_dir: str | Path | None = None) -> List[SequenceBlock]:
+    """Validate extracted sequences against known mutations and fix inconsistencies."""
+    # Create lookups for easier access
+    seq_lookup = {seq.variant_id: seq for seq in sequences}
+    variant_lookup = {var.variant_id: var for var in lineage_variants}
+
+    validation_issues = []
+    corrected_sequences = []
+
+    for seq in sequences:
+        variant = variant_lookup.get(seq.variant_id)
+        if not variant or not variant.parent_id or not variant.mutations or not seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        parent_seq = seq_lookup.get(variant.parent_id)
+        if not parent_seq or not parent_seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        # Check if mutations are consistent
+        issues = _check_mutation_consistency(seq.aa_seq, parent_seq.aa_seq, variant.mutations, seq.variant_id, variant.parent_id)
+
+        if issues:
+            validation_issues.extend(issues)
+            log.warning(f"Sequence validation issues for {seq.variant_id}: {'; '.join(issues)}")
+
+            # Try to get corrected sequence from Gemini
+            corrected_seq = _get_corrected_sequence_from_gemini(seq, parent_seq, variant, issues, model, debug_dir)
+            if corrected_seq:
+                corrected_sequences.append(corrected_seq)
+                log.info(f"Corrected sequence for {seq.variant_id} using Gemini validation")
+            else:
+                corrected_sequences.append(seq)  # Keep original if correction fails
+        else:
+            corrected_sequences.append(seq)
+
+    if validation_issues:
+        log.warning(f"Found {len(validation_issues)} sequence validation issues across {len([s for s in sequences if s.variant_id in [v.variant_id for v in lineage_variants if v.mutations]])} variants with mutations")
+
+    return corrected_sequences
+
+def _check_mutation_consistency(child_seq: str, parent_seq: str, mutations, child_id: str, parent_id: str) -> List[str]:
+    """Check if mutations are consistent between parent and child sequences."""
+    import re
+
+    issues = []
+
+    # Parse mutations (handle both string and list formats)
+    if isinstance(mutations, list):
+        mutation_strs = mutations
+    else:
+        mutation_strs = [m.strip() for m in str(mutations).split(',') if m.strip()]
+
+    for mut_str in mutation_strs:
+        # Parse mutation like "A100V"
+        match = re.match(r'^([A-Z])(\d+)([A-Z])$', mut_str.strip())
+        if not match:
+            continue  # Skip non-standard mutation formats
+
+        orig_aa, pos_str, new_aa = match.groups()
+        pos = int(pos_str) - 1  # Convert to 0-based indexing
+
+        # Check bounds
+        if pos >= len(parent_seq) or pos >= len(child_seq):
+            issues.append(f"Mutation {mut_str} position out of bounds")
+            continue
+
+        # Check parent sequence has expected original amino acid
+        if parent_seq[pos] != orig_aa:
+            issues.append(f"Mutation {mut_str}: parent {parent_id} has {parent_seq[pos]} at position {pos+1}, expected {orig_aa}")
+
+        # Check child sequence has expected new amino acid
+        if child_seq[pos] != new_aa:
+            issues.append(f"Mutation {mut_str}: child {child_id} has {child_seq[pos]} at position {pos+1}, expected {new_aa}")
+
+    return issues
+
+def _get_corrected_sequence_from_gemini(seq: SequenceBlock, parent_seq: SequenceBlock, variant: Variant, issues: List[str], model, debug_dir: str | Path | None = None) -> SequenceBlock | None:
+    """Use Gemini to get a corrected sequence based on mutation validation issues."""
+    if not model:
+        return None
+
+    mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+    issues_str = "; ".join(issues)
+
+    prompt = f"""You extracted a sequence for variant "{seq.variant_id}" but there are mutation validation issues:
+
+ISSUES: {issues_str}
+
+PARENT SEQUENCE ({variant.parent_id}):
+{parent_seq.aa_seq}
+
+EXTRACTED SEQUENCE ({seq.variant_id}):
+{seq.aa_seq}
+
+EXPECTED MUTATIONS: {mutations_str}
+
+Based on the parent sequence and the expected mutations, provide the CORRECT sequence for {seq.variant_id}.
+Apply each mutation to the parent sequence in order.
+
+For example, if parent has "A" at position 100 and mutation is "A100V", then child should have "V" at position 100.
+
+IMPORTANT SEQUENCE RULES:
+- Copy the sequence EXACTLY - do not add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Preserve the exact length of the sequence
+- Only change the specific positions indicated by the mutations
+- Double-check that consecutive identical amino acids are copied correctly
+
+Return ONLY the corrected amino acid sequence (no explanation, no formatting).
+If you cannot determine the correct sequence, return "UNCERTAIN".
+"""
+
+    try:
+        if debug_dir:
+            import time
+            timestamp = int(time.time())
+            prompt_file = Path(debug_dir) / f"sequence_validation_{seq.variant_id}_{timestamp}.txt"
+            _dump(prompt, prompt_file)
+
+        # Use triple validation for sequence correction
+        log.info(f"Correcting sequence for {seq.variant_id} with triple validation")
+        corrected_seq = _extract_plain_sequence_with_triple_validation(prompt, model, f"correction for {seq.variant_id}")
+
+        if debug_dir and corrected_seq:
+            response_file = Path(debug_dir) / f"sequence_validation_response_{seq.variant_id}_{timestamp}.txt"
+            _dump(corrected_seq, response_file)
+
+        if corrected_seq and corrected_seq not in ["UNCERTAIN", "VALID"] and _clean_seq(corrected_seq, _VALID_AA):
+            return SequenceBlock(
+                variant_id=seq.variant_id,
+                aa_seq=corrected_seq,
+                dna_seq=seq.dna_seq,
+                confidence=0.8,  # Lower confidence for corrected sequences
+                truncated=seq.truncated
+            )
+
+    except Exception as e:
+        log.warning(f"Failed to get corrected sequence for {seq.variant_id}: {e}")
+
+    return None
+
 # --- 7.5 Convenience wrapper -------------------------------------------------
 def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
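The consistency check underneath `_validate_sequences_against_mutations` is plain position arithmetic on 1-based mutation strings. A standalone sketch of the rule it enforces:

```python
import re

def check(child: str, parent: str, mutation: str) -> list[str]:
    # "A3V" means: parent has A at 1-based position 3, child has V there.
    m = re.match(r"^([A-Z])(\d+)([A-Z])$", mutation)
    if not m:
        return []  # non-standard formats are skipped, as in the package
    orig, pos, new = m.group(1), int(m.group(2)) - 1, m.group(3)
    if pos >= len(parent) or pos >= len(child):
        return [f"{mutation} position out of bounds"]
    issues = []
    if parent[pos] != orig:
        issues.append(f"parent has {parent[pos]} at {pos + 1}, expected {orig}")
    if child[pos] != new:
        issues.append(f"child has {child[pos]} at {pos + 1}, expected {new}")
    return issues

print(check("MKV", "MKA", "A3V"))  # [] - consistent
print(check("MKA", "MKA", "A3V"))  # child still has A at position 3
```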
@@ -2061,6 +2936,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
 
             # Fallback to text search if page extraction didn't work
             if not focused_text:
+                log.info("Page extraction did not return text, falling back to text search")
                 focused_text = _extract_text_at_locations(
                     text, [best_location],
                     context_chars=max(min_length, 30000),
@@ -2070,6 +2946,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             if focused_text and len(focused_text) < len(text):
                 log.info("Reduced text from %d to %d chars using validated location",
                          len(text), len(focused_text))
+            else:
+                log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
+                            len(focused_text) if focused_text else 0, len(text))
             # Build lineage context if available
             lineage_context = None
             if lineage_variants:
@@ -2081,7 +2960,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                     variant_info.append(info)
                 lineage_context = "\n".join(variant_info)
 
-            return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+            return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
         else:
             log.warning("Location validation failed or returned invalid location: %s",
                         validation.get("reason", "Unknown"))
@@ -2099,7 +2978,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             variant_info.append(info)
         lineage_context = "\n".join(variant_info)
 
-    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
 
 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2165,6 +3044,7 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
         log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
         return {}
 
+
 def extract_enzyme_info_with_gemini(
     text: str,
     variants: List[Variant],
@@ -2234,7 +3114,7 @@ If you cannot determine certain fields, set them to null.
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
                 # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants,
+                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
                 if validated_seq:
                     seq = validated_seq
                     log.info(f"Sequence validated and potentially corrected by Gemini")
@@ -2802,7 +3682,7 @@ def run_pipeline(
 
     # 1. Prepare raw text ------------------------------------------------------
     # Always load both caption text (for identification) and full text (for extraction)
-    pdf_paths = [p for p in (
+    pdf_paths = [p for p in (manuscript, si_path) if p]
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)
 
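The `run_pipeline` fix completes a previously truncated comprehension; its effect is simply to drop a missing SI path before the PDFs are concatenated. A minimal sketch (variable values invented):

```python
from pathlib import Path

manuscript = Path("paper.pdf")
si_path = None  # the SI PDF is optional
pdf_paths = [p for p in (manuscript, si_path) if p]
print(pdf_paths)  # only real paths reach the text loaders, e.g. [PosixPath('paper.pdf')]
```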