debase-0.4.3-py3-none-any.whl → debase-0.4.5-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,7 @@ import time
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Union, Tuple, Dict, Any

 MODEL_NAME: str = "gemini-2.5-flash"
 MAX_CHARS: int = 150_000  # Max characters sent to LLM
@@ -727,17 +727,18 @@ Return a JSON object with:

 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
+{campaign_context}
 Given the following article text, list up to {max_results} *locations* (page
 numbers, figure/table IDs, or section headings) that you would review first to
 find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced).
+came from which parent and what mutations were introduced){campaign_specific}.

 Respond with a JSON array of objects, each containing:
 - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
 - "type": one of "table", "figure", "text", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
-
+{campaign_field}
 IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")

 Order by confidence score (highest first). Tables showing complete variant lineages or
@@ -748,9 +749,9 @@ Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"}}
+  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
 ]
 """.strip()

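Note on the template above: the doubled braces in the example output ("{{" / "}}") are literal braces escaped for Python's str.format, while {campaign_context}, {campaign_specific}, {campaign_field} and {campaign_example} are named placeholders filled in by identify_evolution_locations (next hunk). A minimal sketch of how the two interact, with a hypothetical campaign id:

    template = '[{{"location": "Table S1", "reason": "Variant lineage table"{campaign_example}}}]'
    print(template.format(campaign_example=', "campaign_id": "pal_evolution"'))
    # -> [{"location": "Table S1", "reason": "Variant lineage table", "campaign_id": "pal_evolution"}]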
@@ -915,7 +916,39 @@ def identify_evolution_locations(

     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
-    prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
+
+    # Add campaign context if provided
+    campaign_context = ""
+    campaign_specific = ""
+    campaign_field = ""
+    campaign_example = ""
+
+    if campaigns and len(campaigns) == 1:
+        # Single campaign - make it specific
+        camp = campaigns[0]
+        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
+        if hasattr(camp, 'notes') and camp.notes:
+            campaign_context += f"- Key identifiers: {camp.notes}\n"
+        campaign_specific = f" for the '{camp.campaign_name}' campaign"
+        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
+        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
+    elif campaigns and len(campaigns) > 1:
+        # Multiple campaigns - list them all
+        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
+        for camp in campaigns:
+            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+        campaign_specific = " for any of the identified campaigns"
+        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
+        campaign_example = ', "campaign_id": "campaign_id_here"'
+
+    prompt = _LINEAGE_LOC_PROMPT.format(
+        campaign_context=campaign_context,
+        max_results=max_results,
+        campaign_specific=campaign_specific,
+        campaign_field=campaign_field,
+        campaign_example=campaign_example
+    ) + "\n\nTEXT:\n" + combined_text
     locs: List[dict] = []
     try:
         locs = generate_json_with_retry(
@@ -1378,6 +1411,138 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

+def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text."""
+    import re
+
+    if location_type == 'table':
+        # Find ALL mentions of this table and combine them
+        location_clean = location.strip()
+
+        # Different ways the table might be referenced
+        search_patterns = [
+            location_clean,  # Exact match
+            location_clean.replace("Supplementary ", "Supp. "),  # Common abbreviation
+            location_clean.replace("Supplementary ", "S"),  # E.g., "Table S3"
+            location_clean.replace("Supplementary Table ", "Table S"),  # Another common format
+        ]
+
+        # Collect all occurrences
+        all_occurrences = []
+        seen_positions = set()
+
+        for search_term in search_patterns:
+            pattern = re.compile(re.escape(search_term), re.IGNORECASE)
+            for match in pattern.finditer(full_text):
+                # Avoid duplicates from overlapping patterns
+                if match.start() in seen_positions:
+                    continue
+                seen_positions.add(match.start())
+
+                # Extract generous context around each mention
+                start = max(0, match.start() - 1000)
+                end = min(len(full_text), match.end() + 10000)
+                context = full_text[start:end]
+
+                all_occurrences.append({
+                    'position': match.start(),
+                    'context': context,
+                    'match': match.group()
+                })
+
+        if not all_occurrences:
+            log.warning(f"No occurrences of table '{location_clean}' found in text")
+            return None
+
+        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
+
+        # Combine all occurrences into one text for Gemini to analyze
+        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
+
+        for i, occurrence in enumerate(all_occurrences, 1):
+            combined_text += f"--- Occurrence {i} at position {occurrence['position']} ---\n"
+            combined_text += occurrence['context']
+            combined_text += "\n\n"
+
+        # Limit total length to avoid overwhelming the model
+        if len(combined_text) > 50000:
+            combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+
+        return combined_text
+
+    elif location_type == 'figure':
+        # For figures, we mainly want the caption and any text description
+        location_clean = location.strip()
+        patterns = [
+            rf'({re.escape(location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})',
+            rf'(Figure\s+S?\d+[^\n]*{re.escape(location_clean.split()[-1] if location_clean.split() else location_clean)}[^\n]*\n(?:(?!(?:Table|Tab\.|Figure|Fig\.|Section|\n\n\n)).*\n){{0,20}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                # For figures, include surrounding context as the data might be described nearby
+                start = max(0, match.start() - 1000)
+                end = min(match.end() + 2000, len(full_text))
+                return full_text[start:end]

+    elif location_type == 'section':
+        # Look for section heading
+        location_clean = location.strip()
+        patterns = [
+            # Section with number
+            rf'((?:^|\n)\d+\.?\s*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]).*\n){{0,500}})',
+            # Section without number
+            rf'((?:^|\n){re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+[A-Z]|\n[A-Z]{{2,}}).*\n){{0,500}})',
+            # More flexible section matching
+            rf'((?:^|\n)[^\n]*{re.escape(location_clean)}[^\n]*\n(?:(?!\n\d+\.\s+|\n[A-Z]{{2,}}).*\n){{0,500}})'
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE | re.DOTALL)
+            if match:
+                return match.group(1)
+
+    elif location_type == 'text':
+        # Try to find the location as a page marker or general text
+        if location.isdigit():
+            # Page number - look for page markers
+            page_num = int(location)
+            # Look for page breaks or page numbers
+            patterns = [
+                rf'(?:^|\n)\s*-?\s*{page_num}\s*-?\s*\n((?:.*\n){{0,300}})',
+                rf'(?:page|p\.?)\s*{page_num}[^\n]*\n((?:.*\n){{0,300}})',
+                rf'\n{page_num}\n((?:.*\n){{0,300}})'
+            ]
+            for pattern in patterns:
+                match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
+                if match:
+                    start = match.start()
+                    end = min(start + 15000, len(full_text))
+                    return full_text[start:end]
+
+    # Fallback: try fuzzy search for the location string
+    location_words = location.split()
+    if len(location_words) >= 2:
+        # Try to find at least the first two words together
+        search_pattern = rf'{re.escape(location_words[0])}\s+{re.escape(location_words[1])}'
+        match = re.search(search_pattern, full_text, re.IGNORECASE)
+        if match:
+            start = max(0, match.start() - 500)
+            end = min(match.start() + 8000, len(full_text))
+            return full_text[start:end]
+
+    # Last resort: find any occurrence of the location string
+    idx = full_text.lower().find(location.lower())
+    if idx != -1:
+        start = max(0, idx - 500)
+        end = min(idx + 8000, len(full_text))
+        return full_text[start:end]
+
+    log.warning(f"Could not find location '{location}' of type '{location_type}' in text")
+    return None
+
+
 def get_lineage(
     caption_text: str,
     full_text: str,
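Note: the table branch of the new _extract_location_text helper gathers every mention of the table ID (plus abbreviated spellings) and keeps generous context around each hit, rather than stopping at the first match. The core of that strategy as a self-contained sketch (the sample text is invented and the context windows are shrunk from the 1000/10000-character windows used above):

    import re

    full_text = "... see Table S1 for variants ... Table S1 (continued): more rows ..."
    pattern = re.compile(re.escape("Table S1"), re.IGNORECASE)
    chunks = []
    for m in pattern.finditer(full_text):
        start, end = max(0, m.start() - 20), min(len(full_text), m.end() + 40)
        chunks.append(full_text[start:end])
    print(f"--- {len(chunks)} occurrences ---")
    print("\n\n".join(chunks))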
@@ -1416,63 +1581,220 @@ def get_lineage(
         campaigns = [default_campaign]
         log.info(f"Created default campaign: {default_campaign.campaign_name}")

-    # Use captions for identification - they're concise and focused
-    locations = identify_evolution_locations(caption_text, model, debug_dir=debug_dir, campaigns=None, pdf_paths=pdf_paths)
-
     all_variants = []

     if campaigns:
-        # If we have campaigns but no specific locations, use general extraction
-        if not locations:
-            log.info("No specific lineage locations found, extracting from full text with campaign context")
-            # Extract lineage for each campaign using full text
+        log.info("Using campaign-aware location identification")
+
+        # Process each campaign separately
+        for campaign in campaigns:
+            log.info(f"\nProcessing campaign: {campaign.campaign_id} - {campaign.campaign_name}")
+
+            # Use identify_evolution_locations with campaign context
+            locations = identify_evolution_locations(
+                caption_text,
+                model,
+                max_results=5,
+                debug_dir=debug_dir,
+                campaigns=[campaign],  # Pass single campaign for focused search
+                pdf_paths=pdf_paths
+            )
+
+            if not locations:
+                log.warning(f"No locations found for campaign {campaign.campaign_id}, trying full text extraction")
+                # Fall back to full text extraction
+                campaign_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+                all_variants.extend(campaign_variants)
+                continue
+
+            log.info(f"Found {len(locations)} potential locations for campaign {campaign.campaign_id}")
+            for loc in locations:
+                log.info(f"  - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
+
+            # Try to extract from the best location
+            extracted_variants = []
+            for location in locations:
+                if extracted_variants:
+                    break  # Already got variants
+
+                location_str = location.get('location', '')
+                location_type = location.get('type', '')
+                confidence = location.get('confidence', 0)
+
+                # Try figure extraction for high-confidence figures
+                if location_type == 'figure' and confidence >= 70 and pdf_paths:
+                    log.info(f"Attempting to extract figure: {location_str}")
+
+                    figure_bytes = None
+                    for pdf_path in pdf_paths:
+                        figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+                        if figure_bytes:
+                            log.info(f"Successfully extracted figure from {pdf_path.name}")
+                            break
+
+                    if figure_bytes:
+                        # Save figure if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            debug_path.mkdir(parents=True, exist_ok=True)
+                            figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                            _dump(figure_bytes, figure_file)
+                            log.info(f"Saved figure to: {figure_file}")
+
+                        # Extract lineage from figure
+                        variants = extract_lineage_from_figure(
+                            figure_bytes, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from figure")
+                            extracted_variants = variants
+                            continue
+
+                # Try table/text extraction
+                if location_type in ['table', 'text', 'section'] and not extracted_variants:
+                    log.info(f"Attempting text extraction for {location_type}: {location_str}")
+
+                    # Extract the specific section/table from full text
+                    section_text = _extract_location_text(full_text, location_str, location_type)
+                    if section_text:
+                        log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
+                        # Save extracted section if debug enabled
+                        if debug_dir:
+                            debug_path = Path(debug_dir)
+                            section_file = debug_path / f"extracted_{location_type}_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.txt"
+                            _dump(f"=== EXTRACTED {location_type.upper()} ===\nLocation: {location_str}\nLength: {len(section_text)} chars\n{'='*80}\n\n{section_text}", section_file)
+
+                        variants = extract_complete_lineage(
+                            section_text, model,
+                            debug_dir=debug_dir,
+                            campaign_id=campaign.campaign_id,
+                            campaign_info=campaign,
+                            pdf_paths=pdf_paths
+                        )
+                        if variants:
+                            log.info(f"Extracted {len(variants)} variants from {location_type}")
+                            extracted_variants = variants
+                    else:
+                        log.warning(f"Could not extract text from {location_type}: {location_str}")
+
+            # If no variants extracted from specific locations, try full text
+            if not extracted_variants:
+                log.warning(f"Could not extract from specific locations, trying full text for campaign {campaign.campaign_id}")
+                extracted_variants = extract_complete_lineage(
+                    full_text, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign,
+                    pdf_paths=pdf_paths
+                )
+
+            all_variants.extend(extracted_variants)
+
+        return all_variants, campaigns
+
+    # Original fallback code for when no campaigns are identified
+    log.info("Processing campaigns with direct caption and TOC analysis (skipping global location finding)")
+
+    # Prepare all captions and TOC with context for campaign-specific selection
+    caption_entries = []
+
+    # Add table of contents entries if available
+    if pdf_paths:
+        toc_sections = []
+        for pdf_path in pdf_paths:
+            # Extract first few pages looking for TOC
+            try:
+                import fitz  # PyMuPDF
+                doc = fitz.open(pdf_path)
+                toc_text = ""
+                for page_num in range(min(5, doc.page_count)):  # First 5 pages
+                    page = doc[page_num]  # Correct PyMuPDF syntax
+                    page_text = page.get_text()
+                    if any(keyword in page_text.lower() for keyword in ['contents', 'table of contents', 'overview']):
+                        toc_text += f"\n--- Page {page_num + 1} TOC ---\n{page_text}\n"
+                doc.close()
+                if toc_text:
+                    toc_sections.append(toc_text)
+            except Exception as e:
+                log.warning(f"Failed to extract TOC from {pdf_path}: {e}")
+
+        if toc_sections:
+            caption_entries.append({
+                'type': 'table_of_contents',
+                'location': 'Table of Contents',
+                'context': '\n'.join(toc_sections)[:1000] + "..."
+            })
+
+    # Parse figure and table captions from caption_text
+    # Split by common caption patterns
+    caption_patterns = [
+        r'(?:^|\n)(?:Figure|Fig\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Table|Tab\.?)\s*\d+[:\.]',
+        r'(?:^|\n)(?:Scheme|Sch\.?)\s*\d+[:\.]'
+    ]
+
+    import re
+    for pattern in caption_patterns:
+        matches = list(re.finditer(pattern, caption_text, re.MULTILINE | re.IGNORECASE))
+        for i, match in enumerate(matches):
+            start_pos = match.start()
+            # Find the end of this caption (start of next caption or end of text)
+            if i + 1 < len(matches):
+                end_pos = matches[i + 1].start()
+            else:
+                end_pos = min(start_pos + 2000, len(caption_text))  # Max 2000 chars per caption
+
+            caption_content = caption_text[start_pos:end_pos].strip()
+            if len(caption_content) > 20:  # Skip very short captions
+                # Extract context from full text around this caption
+                context_start = max(0, full_text.find(caption_content[:100]) - 500)
+                context_end = min(len(full_text), context_start + 2000)
+                context = full_text[context_start:context_end]
+
+                caption_entries.append({
+                    'type': 'figure' if 'fig' in pattern.lower() else 'table' if 'tab' in pattern.lower() else 'scheme',
+                    'location': caption_content.split('\n')[0][:100] + "..." if len(caption_content.split('\n')[0]) > 100 else caption_content.split('\n')[0],
+                    'context': context
+                })
+
+    log.info(f"Prepared {len(caption_entries)} caption/TOC entries for campaign-specific analysis")
+
+    # If no caption entries found, fall back to full text extraction
+    if not caption_entries:
+        log.info("No caption entries found, extracting from full text with campaign context")
         for campaign in campaigns:
             log.info(f"Processing campaign: {campaign.campaign_id}")
-            campaign_variants = extract_campaign_lineage(
-                full_text, model, campaign_id=campaign.campaign_id,
-                debug_dir=debug_dir, pdf_paths=pdf_paths,
-                campaign_info=campaign
+            campaign_variants = extract_complete_lineage(
+                full_text, model,
+                debug_dir=debug_dir,
+                campaign_id=campaign.campaign_id,
+                campaign_info=campaign,
+                pdf_paths=pdf_paths
             )
             all_variants.extend(campaign_variants)
         return all_variants, campaigns
-    # Original logic for when we have both locations and campaigns
-    # Log location information
-    location_summary = []
-    for loc in locations[:5]:
-        if isinstance(loc, dict):
-            location_summary.append(f"{loc.get('location', 'Unknown')} ({loc.get('type', 'unknown')}, confidence: {loc.get('confidence', 0)})")
-        else:
-            location_summary.append(str(loc))
-    log.info("Gemini identified %d potential lineage locations: %s",
-             len(locations), ", ".join(location_summary))
-
-    # Extract context around each location for better decision making
-    locations_with_context = []
-    for loc in locations:
-        location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-        # Extract 1000 chars of context around the location
-        context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
-        locations_with_context.append({
-            'location': loc,
-            'context': context_text  # Full extracted context
-        })
-
-    # For each campaign, ask Gemini to select the best location
+
+    # For each campaign, ask Gemini to select the best location from captions/TOC
    for campaign in campaigns:
        log.info(f"Processing campaign: {campaign.campaign_id}")

-        # Build locations context string
+        # Build locations context string from caption entries
        locations_str = ""
-        for i, loc_ctx in enumerate(locations_with_context):
-            loc = loc_ctx['location']
-            context = loc_ctx['context']
-            location_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-            location_type = loc.get('type', '') if isinstance(loc, dict) else 'unknown'
-            confidence = loc.get('confidence', 0) if isinstance(loc, dict) else 0
-            reason = loc.get('reason', '') if isinstance(loc, dict) else ''
+        for i, entry in enumerate(caption_entries):
+            location_str = entry['location']
+            location_type = entry['type']
+            context = entry['context']

-            locations_str += f"\n{i+1}. {location_str} (Type: {location_type}, Confidence: {confidence})\n"
-            locations_str += f"   Reason: {reason}\n"
+            locations_str += f"\n{i+1}. {location_str} (Type: {location_type})\n"
            locations_str += f"   Context (first 500 chars):\n   {context[:500]}...\n"

        # Ask Gemini to select best location for this campaign
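Note: the caption-entry fallback above splits caption_text on Figure/Table/Scheme heading patterns, treating each pattern match as the start of a caption and the next match as its end. The splitting step in isolation (invented sample; the real code runs all three patterns and also pulls surrounding context from full_text):

    import re

    caption_text = "Figure 1: Evolution overview.\nTable 2. Variant activities.\nFigure 3: Lineage tree."
    pattern = r'(?:^|\n)(?:Figure|Fig\.?)\s*\d+[:\.]'
    matches = list(re.finditer(pattern, caption_text, re.MULTILINE | re.IGNORECASE))
    for i, m in enumerate(matches):
        end = matches[i + 1].start() if i + 1 < len(matches) else len(caption_text)
        print(repr(caption_text[m.start():end].strip()))
    # -> 'Figure 1: Evolution overview.\nTable 2. Variant activities.'
    #    'Figure 3: Lineage tree.'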
@@ -1514,26 +1836,39 @@ def get_lineage(

            log.info(f"Selected location for {campaign.campaign_id}: {selected_location} (confidence: {confidence})")

-            # Find the actual location object
-            for loc in locations:
-                loc_str = loc.get('location', '') if isinstance(loc, dict) else str(loc)
-                if loc_str == selected_location:
-                    primary_location = loc
+            # Find the actual caption entry
+            selected_entry = None
+            for entry in caption_entries:
+                if entry['location'] == selected_location:
+                    selected_entry = entry
                    break

-            if not primary_location:
-                log.warning(f"Could not find selected location '{selected_location}' in locations list")
-                # Fall back to highest confidence location
-                primary_location = sorted(locations,
-                                          key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                          reverse=True)[0] if locations else None
+            if not selected_entry:
+                log.warning(f"Could not find selected location '{selected_location}' in caption entries")
+                # Fall back to first entry
+                selected_entry = caption_entries[0] if caption_entries else None
+
+            # Convert caption entry to location format for compatibility
+            if selected_entry:
+                primary_location = {
+                    'location': selected_entry['location'],
+                    'type': selected_entry['type'],
+                    'confidence': 0.8,  # Default confidence for caption-based selection
+                    'reason': f"Selected from {selected_entry['type']} captions"
+                }

        except Exception as e:
            log.warning(f"Failed to select best location for campaign {campaign.campaign_id}: {e}")
-            # Fall back to highest confidence location
-            primary_location = sorted(locations,
-                                      key=lambda x: x.get('confidence', 0) if isinstance(x, dict) else 0,
-                                      reverse=True)[0] if locations else None
+            # Fall back to first caption entry
+            if caption_entries:
+                primary_location = {
+                    'location': caption_entries[0]['location'],
+                    'type': caption_entries[0]['type'],
+                    'confidence': 0.5,  # Lower confidence for fallback
+                    'reason': f"Fallback to first {caption_entries[0]['type']} caption"
+                }
+            else:
+                primary_location = None

        if not primary_location:
            log.warning(f"No location found for campaign {campaign.campaign_id}")
@@ -1675,6 +2010,97 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
    return []

 # --- 7.2 Page-based extraction helper ---------------------------------------
+def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
+    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        prompt: The prompt to send to Gemini
+        model: The Gemini model instance
+        context: Additional context for logging (e.g., "validation" or "extraction")
+
+    Returns:
+        The validated sequence or None if no consensus
+    """
+    sequences = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            response = model.generate_content(prompt)
+            result = _extract_text(response).strip()
+
+            # Parse the result to extract just the sequence
+            if result == "VALID":
+                sequences.append("VALID")
+            elif result == "UNCERTAIN":
+                sequences.append("UNCERTAIN")
+            elif result.startswith("M") and len(result) > 50:
+                # Clean the sequence
+                clean_seq = result.upper().replace(" ", "").replace("\n", "")
+                if all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in clean_seq):
+                    sequences.append(clean_seq)
+                else:
+                    sequences.append("INVALID")
+            else:
+                sequences.append("INVALID")
+
+            log.info(f"Gemini {context} attempt {attempt + 1}: {len(result) if result.startswith('M') else result}")
+
+        except Exception as e:
+            log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
+            sequences.append("ERROR")
+
+        # Check for early consensus after 2 attempts
+        if len(sequences) == 2:
+            # Clean sequences before comparison
+            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
+            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
+
+            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus reached after 2 attempts")
+                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
+            else:
+                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
+
+    # After all attempts, find consensus
+    valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
+
+    if not valid_sequences:
+        log.error(f"All {max_attempts} {context} attempts failed")
+        return None
+
+    # Find any matching pair
+    for i in range(len(sequences)):
+        for j in range(i + 1, len(sequences)):
+            # Clean sequences before comparison
+            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
+            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
+
+            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
+                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
+                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
+
+    # If no exact match, use adaptive validation
+    # Count occurrences of each valid sequence
+    sequence_counts = {}
+    for seq in valid_sequences:
+        if seq not in ["VALID", "UNCERTAIN"]:
+            # Clean sequence before counting
+            seq_clean = seq.replace(" ", "").replace("\n", "")
+            sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
+
+    # Return the most common sequence if it appears at least twice
+    if sequence_counts:
+        most_common = max(sequence_counts.items(), key=lambda x: x[1])
+        if most_common[1] >= 2:
+            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
+            return most_common[0]
+
+    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    return None
+
+
 def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
     """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
@@ -1704,7 +2130,7 @@ def _validate_sequence_against_mutations(sequence: str, variants: List[Variant],
    if not local_issues:
        return None  # No obvious issues found

-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation")
+    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")

    prompt = f"""
 You are validating a protein sequence that was extracted from a scientific paper.
@@ -1729,26 +2155,14 @@ Return ONLY the corrected sequence if changes are needed, or "VALID" if no chang
 If you cannot determine the correct sequence, return "UNCERTAIN".
 """

-    try:
-        response = model.generate_content(prompt)
-        result = _extract_text(response).strip()
-
-        if result == "VALID":
-            return None  # No changes needed
-        elif result == "UNCERTAIN":
-            log.warning("Gemini could not validate sequence against mutations")
-            return None
-        elif result.startswith("M") and len(result) > 50:
-            # Gemini returned a corrected sequence
-            log.info(f"Gemini suggested sequence correction (length {len(result)})")
-            return result
-        else:
-            log.warning(f"Unexpected validation response: {result[:100]}...")
-            return None
-
-    except Exception as e:
-        log.warning(f"Failed to validate sequence: {e}")
-        return None
+    # Use triple validation
+    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
+
+    if result == "VALID" or result is None:
+        return None  # No changes needed
+    else:
+        log.info(f"Gemini suggested sequence correction (length {len(result)})")
+        return result


 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
@@ -1915,10 +2329,18 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 - Extract the variant_id exactly as written where the sequence appears
 - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations

+SEQUENCE EXTRACTION RULES:
+- Copy sequences EXACTLY as they appear in the text
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids
+- Preserve the exact length and character sequence
+- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
+- Double-check that consecutive identical amino acids are copied correctly
+
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq     - amino-acid sequence (uppercase), or null
-  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists)
+  * aa_seq     - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
+  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT

 Respond ONLY with **minified JSON** that matches the schema below.
 NO markdown, no code fences, no commentary.
@@ -1934,8 +2356,258 @@ TEXT (may be truncated):
 ```
 """.strip()

-def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None) -> list[SequenceBlock]:
-    """Prompt Gemini and convert its JSON reply into SequenceBlock objects."""
+def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
+    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+
+    Args:
+        model: The Gemini model instance
+        prompt: The prompt to send to Gemini
+        schema_hint: The JSON schema hint
+        debug_dir: Optional debug directory
+
+    Returns:
+        The validated sequence JSON data or None if no consensus
+    """
+    responses = []
+    max_attempts = 5  # Increased from 3 to 5
+
+    # Try up to 5 times
+    for attempt in range(max_attempts):
+        try:
+            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
+            resp = model.generate_content(prompt)
+            raw = _extract_text(resp).strip()
+
+            # Save debug info
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+                response_file = debug_path / f"sequences_attempt_{attempt + 1}_{int(time.time())}.txt"
+                with open(response_file, 'w') as f:
+                    f.write(f"=== SEQUENCE EXTRACTION ATTEMPT {attempt + 1} ===\n")
+                    f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+                    f.write(f"Length: {len(raw)} characters\n")
+                    f.write("="*80 + "\n\n")
+                    f.write(raw)
+
+            # Parse JSON response (similar to generate_json_with_retry logic)
+            fence_re = re.compile(r"```json|```", re.I)
+            if raw.startswith("```"):
+                raw = fence_re.sub("", raw).strip()
+
+            # Try to parse as JSON
+            try:
+                parsed = json.loads(raw)
+            except json.JSONDecodeError:
+                # Look for JSON array or object in the response
+                json_start = -1
+                json_end = -1
+                bracket_stack = []
+                in_string = False
+                escape_next = False
+
+                for i, char in enumerate(raw):
+                    if escape_next:
+                        escape_next = False
+                        continue
+
+                    if char == '\\':
+                        escape_next = True
+                        continue
+
+                    if char == '"' and not escape_next:
+                        in_string = not in_string
+                        continue
+
+                    if in_string:
+                        continue
+
+                    if char in '[{':
+                        if json_start == -1:
+                            json_start = i
+                        bracket_stack.append(char)
+                    elif char in ']}':
+                        if bracket_stack:
+                            opening = bracket_stack.pop()
+                            if (opening == '[' and char == ']') or (opening == '{' and char == '}'):
+                                if not bracket_stack:  # Found complete JSON
+                                    json_end = i + 1
+                                    break
+
+                if json_start >= 0 and json_end > json_start:
+                    json_str = raw[json_start:json_end]
+                    parsed = json.loads(json_str)
+                else:
+                    if '[]' in raw:
+                        parsed = []
+                    else:
+                        raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
+
+            # Store both the original and normalized response
+            normalized_response = _normalize_sequence_response(parsed)
+            responses.append((parsed, normalized_response))
+
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+
+        except Exception as e:
+            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
+            responses.append(None)
+
+        # Check for early consensus after 2 attempts
+        if len(responses) == 2:
+            if (responses[0] and responses[1] and
+                _sequences_match(responses[0][1], responses[1][1])):
+                log.info("Sequence extraction consensus reached after 2 attempts")
+                return responses[0][0]  # Return original parsed data
+            else:
+                log.info("Sequence extraction mismatch after 2 attempts - trying third")
+
+    # After all attempts, use adaptive validation
+    valid_responses = [r for r in responses if r is not None]
+
+    if not valid_responses:
+        log.error(f"All {max_attempts} sequence extraction attempts failed")
+        return None
+
+    # First, try to find exact consensus (any matching pair)
+    for i in range(len(valid_responses)):
+        for j in range(i + 1, len(valid_responses)):
+            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
+                log.info(f"Sequence extraction consensus found: attempts with matching content")
+                return valid_responses[i][0]  # Return original parsed data
+
+    # If no exact consensus, use adaptive validation
+    log.info("No exact consensus found, applying adaptive validation...")
+
+    # Find sequences that appear consistently across multiple attempts
+    consistent_sequences = _find_consistent_sequences(valid_responses)
+
+    if consistent_sequences:
+        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
+        return consistent_sequences
+
+    # If still no consensus, use the attempt with the most sequences
+    best_response = max(valid_responses,
+                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
+
+    if best_response and len(best_response[1]) > 0:
+        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
+        return best_response[0]
+
+    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
+    return None
+
+
+def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
+    """Find sequences that appear consistently across multiple extraction attempts.
+
+    Args:
+        valid_responses: List of (original_data, normalized_data) tuples
+
+    Returns:
+        List of consistent sequences with confidence scores, or None if none found
+    """
+    if not valid_responses:
+        return None
+
+    # Count how many times each sequence appears
+    sequence_counts = {}
+    sequence_full_data = {}
+
+    for original, normalized in valid_responses:
+        if not isinstance(normalized, list):
+            continue
+
+        for seq in normalized:
+            variant_id = seq.get("variant_id", "")
+            aa_seq = seq.get("aa_seq", "")
+            # Clean sequence before using in key
+            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
+
+            # Create a unique key for this sequence
+            key = f"{variant_id}|{aa_seq_clean}"
+
+            if key not in sequence_counts:
+                sequence_counts[key] = 0
+                sequence_full_data[key] = []
+
+            sequence_counts[key] += 1
+
+            # Find the full data for this sequence from the original response
+            if isinstance(original, list):
+                for orig_seq in original:
+                    if (orig_seq.get("variant_id") == variant_id and
+                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
+                        sequence_full_data[key].append(orig_seq)
+                        break
+
+    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
+    min_appearances = max(2, len(valid_responses) // 2)
+    consistent_sequences = []
+
+    for key, count in sequence_counts.items():
+        if count >= min_appearances:
+            # Use the first occurrence of the full data
+            if sequence_full_data[key]:
+                seq_data = sequence_full_data[key][0].copy()
+                # Add confidence based on how many times it appeared
+                seq_data["confidence"] = count / len(valid_responses)
+                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
+                consistent_sequences.append(seq_data)
+
+    return consistent_sequences if consistent_sequences else None
+
+
+def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
+    """Normalize sequence response for comparison."""
+    if not isinstance(data, list):
+        return []
+
+    normalized = []
+    for item in data:
+        if isinstance(item, dict):
+            # Extract key fields for comparison
+            normalized_item = {
+                "variant_id": item.get("variant_id", ""),
+                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
+                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
+                "confidence": item.get("confidence", 0.0)
+            }
+            normalized.append(normalized_item)
+
+    # Sort by variant_id for consistent comparison
+    return sorted(normalized, key=lambda x: x["variant_id"])
+
+
+def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
+    """Check if two sequence response lists match on key fields."""
+    if len(seq1) != len(seq2):
+        return False
+
+    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
+        # Compare variant IDs
+        if s1.get("variant_id") != s2.get("variant_id"):
+            return False
+
+        # Compare amino acid sequences (most critical)
+        aa1 = s1.get("aa_seq", "")
+        aa2 = s2.get("aa_seq", "")
+        if aa1 and aa2 and aa1 != aa2:
+            return False
+        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
+            return False
+
+        # Compare DNA sequences if present
+        dna1 = s1.get("dna_seq", "")
+        dna2 = s2.get("dna_seq", "")
+        if dna1 and dna2 and dna1 != dna2:
+            return False
+
+    return True
+
+
+def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
+    """Prompt Gemini and convert its JSON reply into SequenceBlock objects with triple validation."""
     base_prompt = _SEQ_EXTRACTION_PROMPT.format(
         schema=_SEQUENCE_SCHEMA_HINT, text=text[:MAX_CHARS]
     )
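Note: the JSON-recovery fallback in _extract_sequences_with_triple_validation scans the raw reply for the first balanced bracket structure while tracking string and escape state, so brackets inside quoted values are ignored. The same technique reduced to a plain depth counter (the original additionally keeps a stack so bracket types must pair up; the sample reply is invented):

    import json

    raw = 'Here is the data: [{"variant_id": "G1", "aa_seq": "MKT"}] Hope that helps!'
    depth, start, end, in_string, escape = 0, -1, -1, False, False
    for i, ch in enumerate(raw):
        if escape:
            escape = False
        elif ch == '\\':
            escape = True
        elif ch == '"':
            in_string = not in_string
        elif in_string:
            continue
        elif ch in '[{':
            if start == -1:
                start = i
            depth += 1
        elif ch in ']}':
            depth -= 1
            if depth == 0:
                end = i + 1
                break
    print(json.loads(raw[start:end]))  # -> [{'variant_id': 'G1', 'aa_seq': 'MKT'}]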
@@ -1952,8 +2624,50 @@ Match sequences to these known variants when possible. Variants may be labeled d
    else:
        prompt = base_prompt

-    data = generate_json_with_retry(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir, tag="sequences")
-    return _parse_sequences(data)
+    # Add mutation validation context if we have lineage variants with mutations
+    if lineage_variants:
+        mutation_context = _build_mutation_validation_context(lineage_variants)
+        if mutation_context:
+            prompt = f"""{prompt}
+
+CRITICAL MUTATION VALIDATION:
+{mutation_context}
+
+IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
+For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
+"""
+
+    # Save the complete prompt for debugging
+    if debug_dir:
+        debug_path = Path(debug_dir)
+        debug_path.mkdir(parents=True, exist_ok=True)
+        prompt_file = debug_path / f"sequence_extraction_prompt_{int(time.time())}.txt"
+        with open(prompt_file, 'w') as f:
+            f.write(f"=== SEQUENCE EXTRACTION PROMPT ===\n")
+            f.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
+            f.write(f"Text length: {len(text)} characters\n")
+            f.write(f"Truncated to: {len(text[:MAX_CHARS])} characters\n")
+            f.write(f"Total prompt length: {len(prompt)} characters\n")
+            f.write("="*80 + "\n\n")
+            f.write(prompt)
+        log.info(f"Saved sequence extraction prompt to {prompt_file}")
+
+    # Use triple validation for sequence extraction
+    log.info("Extracting sequences with triple validation to ensure accuracy")
+    data = _extract_sequences_with_triple_validation(model, prompt, _SEQUENCE_SCHEMA_HINT, debug_dir=debug_dir)
+
+    if not data:
+        log.warning("Failed to get consistent sequence extraction after triple validation")
+        return []
+
+    extracted_sequences = _parse_sequences(data)
+
+    # Post-process: validate sequences against mutations if we have lineage info
+    if lineage_variants:
+        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
+        return validated_sequences
+
+    return extracted_sequences

 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
 _VALID_AA = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codon
@@ -2004,6 +2718,167 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
        )
    return blocks

+def _build_mutation_validation_context(lineage_variants: List[Variant]) -> str:
+    """Build mutation context for sequence validation."""
+    mutation_info = []
+
+    for variant in lineage_variants:
+        if variant.mutations and variant.parent_id:
+            mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+            mutation_info.append(f"Variant '{variant.variant_id}' (parent: '{variant.parent_id}') has mutations: {mutations_str}")
+
+    if not mutation_info:
+        return ""
+
+    context = "Known mutation relationships:\n" + "\n".join(mutation_info[:10])  # Limit to first 10 for context
+    if len(mutation_info) > 10:
+        context += f"\n... and {len(mutation_info) - 10} more variants with mutations"
+
+    return context
+
+def _validate_sequences_against_mutations(sequences: List[SequenceBlock], lineage_variants: List[Variant], model, debug_dir: str | Path | None = None) -> List[SequenceBlock]:
+    """Validate extracted sequences against known mutations and fix inconsistencies."""
+    # Create lookups for easier access
+    seq_lookup = {seq.variant_id: seq for seq in sequences}
+    variant_lookup = {var.variant_id: var for var in lineage_variants}
+
+    validation_issues = []
+    corrected_sequences = []
+
+    for seq in sequences:
+        variant = variant_lookup.get(seq.variant_id)
+        if not variant or not variant.parent_id or not variant.mutations or not seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        parent_seq = seq_lookup.get(variant.parent_id)
+        if not parent_seq or not parent_seq.aa_seq:
+            corrected_sequences.append(seq)
+            continue
+
+        # Check if mutations are consistent
+        issues = _check_mutation_consistency(seq.aa_seq, parent_seq.aa_seq, variant.mutations, seq.variant_id, variant.parent_id)
+
+        if issues:
+            validation_issues.extend(issues)
+            log.warning(f"Sequence validation issues for {seq.variant_id}: {'; '.join(issues)}")
+
+            # Try to get corrected sequence from Gemini
+            corrected_seq = _get_corrected_sequence_from_gemini(seq, parent_seq, variant, issues, model, debug_dir)
+            if corrected_seq:
+                corrected_sequences.append(corrected_seq)
+                log.info(f"Corrected sequence for {seq.variant_id} using Gemini validation")
+            else:
+                corrected_sequences.append(seq)  # Keep original if correction fails
+        else:
+            corrected_sequences.append(seq)
+
+    if validation_issues:
+        log.warning(f"Found {len(validation_issues)} sequence validation issues across {len([s for s in sequences if s.variant_id in [v.variant_id for v in lineage_variants if v.mutations]])} variants with mutations")
+
+    return corrected_sequences
+
+def _check_mutation_consistency(child_seq: str, parent_seq: str, mutations, child_id: str, parent_id: str) -> List[str]:
+    """Check if mutations are consistent between parent and child sequences."""
+    import re
+
+    issues = []
+
+    # Parse mutations (handle both string and list formats)
+    if isinstance(mutations, list):
+        mutation_strs = mutations
+    else:
+        mutation_strs = [m.strip() for m in str(mutations).split(',') if m.strip()]
+
+    for mut_str in mutation_strs:
+        # Parse mutation like "A100V"
+        match = re.match(r'^([A-Z])(\d+)([A-Z])$', mut_str.strip())
+        if not match:
+            continue  # Skip non-standard mutation formats
+
+        orig_aa, pos_str, new_aa = match.groups()
+        pos = int(pos_str) - 1  # Convert to 0-based indexing
+
+        # Check bounds
+        if pos >= len(parent_seq) or pos >= len(child_seq):
+            issues.append(f"Mutation {mut_str} position out of bounds")
+            continue
+
+        # Check parent sequence has expected original amino acid
+        if parent_seq[pos] != orig_aa:
+            issues.append(f"Mutation {mut_str}: parent {parent_id} has {parent_seq[pos]} at position {pos+1}, expected {orig_aa}")
+
+        # Check child sequence has expected new amino acid
+        if child_seq[pos] != new_aa:
+            issues.append(f"Mutation {mut_str}: child {child_id} has {child_seq[pos]} at position {pos+1}, expected {new_aa}")
+
+    return issues
+
+def _get_corrected_sequence_from_gemini(seq: SequenceBlock, parent_seq: SequenceBlock, variant: Variant, issues: List[str], model, debug_dir: str | Path | None = None) -> SequenceBlock | None:
+    """Use Gemini to get a corrected sequence based on mutation validation issues."""
+    if not model:
+        return None
+
+    mutations_str = "; ".join(variant.mutations) if isinstance(variant.mutations, list) else str(variant.mutations)
+    issues_str = "; ".join(issues)
+
+    prompt = f"""You extracted a sequence for variant "{seq.variant_id}" but there are mutation validation issues:
+
+ISSUES: {issues_str}
+
+PARENT SEQUENCE ({variant.parent_id}):
+{parent_seq.aa_seq}
+
+EXTRACTED SEQUENCE ({seq.variant_id}):
+{seq.aa_seq}
+
+EXPECTED MUTATIONS: {mutations_str}
+
+Based on the parent sequence and the expected mutations, provide the CORRECT sequence for {seq.variant_id}.
+Apply each mutation to the parent sequence in order.
+
+For example, if parent has "A" at position 100 and mutation is "A100V", then child should have "V" at position 100.
+
+IMPORTANT SEQUENCE RULES:
+- Copy the sequence EXACTLY - do not add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
+- Preserve the exact length of the sequence
+- Only change the specific positions indicated by the mutations
+- Double-check that consecutive identical amino acids are copied correctly
+
+Return ONLY the corrected amino acid sequence (no explanation, no formatting).
+If you cannot determine the correct sequence, return "UNCERTAIN".
+"""
+
+    try:
+        if debug_dir:
+            import time
+            timestamp = int(time.time())
+            prompt_file = Path(debug_dir) / f"sequence_validation_{seq.variant_id}_{timestamp}.txt"
+            _dump(prompt, prompt_file)
+
+        # Use triple validation for sequence correction
+        log.info(f"Correcting sequence for {seq.variant_id} with triple validation")
+        corrected_seq = _extract_plain_sequence_with_triple_validation(prompt, model, f"correction for {seq.variant_id}")
+
+        if debug_dir and corrected_seq:
+            response_file = Path(debug_dir) / f"sequence_validation_response_{seq.variant_id}_{timestamp}.txt"
+            _dump(corrected_seq, response_file)
+
+        if corrected_seq and corrected_seq not in ["UNCERTAIN", "VALID"] and _clean_seq(corrected_seq, _VALID_AA):
+            return SequenceBlock(
+                variant_id=seq.variant_id,
+                aa_seq=corrected_seq,
+                dna_seq=seq.dna_seq,
+                confidence=0.8,  # Lower confidence for corrected sequences
+                truncated=seq.truncated
+            )
+
+    except Exception as e:
+        log.warning(f"Failed to get corrected sequence for {seq.variant_id}: {e}")
+
+    return None
+
 # --- 7.5 Convenience wrapper -------------------------------------------------
 def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir: str | Path | None = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
     # Phase 1: Identify where sequences might be located
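Note: _check_mutation_consistency parses point mutations of the form "A100V" (original residue, 1-based position, new residue) and verifies both sides of the parent/child relationship. The core check, run on invented toy sequences:

    import re

    def check(parent_seq, child_seq, mutation):
        m = re.match(r'^([A-Z])(\d+)([A-Z])$', mutation)
        orig_aa, pos, new_aa = m.group(1), int(m.group(2)) - 1, m.group(3)
        issues = []
        if parent_seq[pos] != orig_aa:
            issues.append(f"parent has {parent_seq[pos]} at {pos + 1}, expected {orig_aa}")
        if child_seq[pos] != new_aa:
            issues.append(f"child has {child_seq[pos]} at {pos + 1}, expected {new_aa}")
        return issues

    print(check("MKALT", "MKVLT", "A3V"))  # -> [] (consistent)
    print(check("MKALT", "MKALT", "A3V"))  # -> ['child has A at 3, expected V']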
@@ -2061,15 +2936,22 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:

        # Fallback to text search if page extraction didn't work
        if not focused_text:
+            log.info("Page extraction did not return text, falling back to text search")
            focused_text = _extract_text_at_locations(
                text, [best_location],
                context_chars=max(min_length, 30000),
                validate_sequences=True
            )

-        if focused_text and len(focused_text) < len(text):
-            log.info("Reduced text from %d to %d chars using validated location",
-                     len(text), len(focused_text))
+        # Use focused text if we got any content, regardless of size
+        if focused_text:
+            if len(focused_text) < len(text):
+                log.info("Reduced text from %d to %d chars using validated location",
+                         len(text), len(focused_text))
+            else:
+                log.info("Extracted focused text (%d chars) from validated location (full text: %d chars)",
+                         len(focused_text), len(text))
+
            # Build lineage context if available
            lineage_context = None
            if lineage_variants:
@@ -2081,7 +2963,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                variant_info.append(info)
            lineage_context = "\n".join(variant_info)

-            return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+            return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
+        else:
+            log.warning("Failed to extract focused text from validated location, will use full text")
    else:
        log.warning("Location validation failed or returned invalid location: %s",
                    validation.get("reason", "Unknown"))
@@ -2099,7 +2983,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
            variant_info.append(info)
        lineage_context = "\n".join(variant_info)

-    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context)
+    return extract_sequences(text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)

 # === 7.6 PDB SEQUENCE EXTRACTION === -----------------------------------------
 """When no sequences are found in the paper, attempt to fetch them from PDB."""
@@ -2165,6 +3049,7 @@ def fetch_pdb_sequences(pdb_id: str) -> Dict[str, str]:
        log.warning(f"Failed to fetch PDB {pdb_id}: {e}")
    return {}

+
 def extract_enzyme_info_with_gemini(
     text: str,
     variants: List[Variant],
@@ -2234,7 +3119,7 @@ If you cannot determine certain fields, set them to null.
            # Validate it looks like a protein sequence
            if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
                # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, lineage_text, model)
+                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
                if validated_seq:
                    seq = validated_seq
                    log.info(f"Sequence validated and potentially corrected by Gemini")
@@ -2802,7 +3687,7 @@ def run_pipeline(

    # 1. Prepare raw text ------------------------------------------------------
    # Always load both caption text (for identification) and full text (for extraction)
-    pdf_paths = [p for p in (si_path, manuscript) if p]
+    pdf_paths = [p for p in (manuscript, si_path) if p]
    caption_text = limited_caption_concat(*pdf_paths)
    full_text = limited_concat(*pdf_paths)