debase-0.6.0-py3-none-any.whl → debase-0.6.2-py3-none-any.whl
- debase/_version.py +1 -1
- debase/campaign_utils.py +146 -0
- debase/caption_pattern.py +44 -0
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +481 -106
- debase/lineage_format.py +44 -1
- debase/reaction_info_extractor.py +479 -135
- debase/substrate_scope_extractor.py +207 -80
- debase/wrapper.py +3 -3
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
- debase-0.6.2.dist-info/RECORD +18 -0
- debase-0.6.0.dist-info/RECORD +0 -16
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,13 @@ import fitz
 import re
 import json
 import time
+
+# Import universal caption pattern
+try:
+    from .caption_pattern import get_universal_caption_pattern
+except ImportError:
+    # Fallback if running as standalone script
+    from caption_pattern import get_universal_caption_pattern
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
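The new `debase/caption_pattern.py` module itself is not shown in this diff, but the hunk that follows removes the module-local `_CAPTION_PREFIX_RE` it replaces. A minimal sketch of what `get_universal_caption_pattern` plausibly returns, built from that removed regex (the shipped 44-line module may extend it):

```python
import re

def get_universal_caption_pattern() -> "re.Pattern[str]":
    # Sketch based on the regex removed from enzyme_lineage_extractor below;
    # the real caption_pattern.py may cover more caption styles.
    return re.compile(
        r"""
        ^\s*
        (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
           Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table))  # label part
        \s*(?:S?\d+[A-Za-z]?|[IVX]+)                               # figure number
        [.:]?\s*                                                   # trailing punctuation/space
        """,
        re.I | re.X,
    )

assert get_universal_caption_pattern().match("Table S1. Summary of mutations")
```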
@@ -113,17 +120,8 @@ _DOI_REGEX = re.compile(r"10\.[0-9]{4,9}/[-._;()/:A-Z0-9]+", re.I)
 # PDB ID regex - matches 4-character PDB codes
 _PDB_REGEX = re.compile(r"\b[1-9][A-Z0-9]{3}\b")
 
-#
-_CAPTION_PREFIX_RE = re.compile(
-    r"""
-    ^\s*
-    (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
-       Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table)) # label part
-    \s*(?:S?\d+[A-Za-z]?|[IVX]+) # figure number
-    [.:]?\s* # trailing punctuation/space
-    """,
-    re.I | re.X,
-)
+# Use universal caption pattern
+_CAPTION_PREFIX_RE = get_universal_caption_pattern()
 
 
 def _open_doc(pdf_path: str | Path | bytes):
@@ -467,7 +465,7 @@ def get_model():
         "temperature": 0.0,  # Deterministic: always pick the most likely token
         "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
         "top_k": 1,  # Only consider the single most likely token
-        "max_output_tokens":
+        "max_output_tokens": 65536,  # Increased to 2x for handling larger lineage tables and sequences
     }
 
     # For Gemini 2.5 Flash, disable thinking tokens to save costs
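For orientation, this is roughly how a generation config like the one above is handed to the google-generativeai SDK; the model name and variable wiring here are illustrative assumptions, not debase's exact `get_model` body:

```python
import google.generativeai as genai

generation_config = {
    "temperature": 0.0,          # deterministic decoding
    "top_p": 1.0,
    "top_k": 1,
    "max_output_tokens": 65536,  # the 2x bump introduced in this release
}

# Model name is an assumption for illustration.
model = genai.GenerativeModel("gemini-2.5-flash", generation_config=generation_config)
```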
@@ -760,13 +758,24 @@ mutations were introduced){campaign_specific}. Pay attention to the provided con
 ensure the location you return are actually lineage location with variants and mutations.
 
 Respond with a JSON array of objects, each containing:
-- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
 - "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
 {campaign_field}
-
-
+CRITICAL INSTRUCTIONS:
+1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+   - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+   - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+   - This should be the complete caption as it appears in the document
+   - Include at least 200-300 characters to ensure unique matching
+3. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Items like "Table S1", "Figure S2", etc. are typically in the SI
+   - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+   - If uncertain, use context clues from the text
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figures showing complete variant lineages.
@@ -776,9 +785,9 @@ Don't include oligonucleotide results or result from only one round.
 
 Example output:
 [
-{{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
-{{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
-{{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+{{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
+{{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
+{{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
 ]
 """.strip()
 
@@ -956,6 +965,9 @@ def identify_evolution_locations(
         campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
         if hasattr(camp, 'notes') and camp.notes:
             campaign_context += f"- Key identifiers: {camp.notes}\n"
+        if hasattr(camp, 'data_locations') and camp.data_locations:
+            campaign_context += f"- KNOWN DATA LOCATIONS: {', '.join(camp.data_locations)}\n"
+            campaign_context += " IMPORTANT: Prioritize these known locations highly!\n"
         campaign_specific = f" for the '{camp.campaign_name}' campaign"
         campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
         campaign_example = f', "campaign_id": "{camp.campaign_id}"'
@@ -964,7 +976,10 @@ def identify_evolution_locations(
         campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
         for camp in campaigns:
             campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+            if hasattr(camp, 'data_locations') and camp.data_locations:
+                campaign_context += f" Known locations: {', '.join(camp.data_locations)}\n"
         campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+        campaign_context += "IMPORTANT: Prioritize the known locations listed above!\n"
         campaign_specific = " for any of the identified campaigns"
         campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
         campaign_example = ', "campaign_id": "campaign_id_here"'
@@ -1041,6 +1056,7 @@ def extract_complete_lineage(
     campaign_id: Optional[str] = None,
     campaign_info: Optional[Campaign] = None,
     pdf_paths: Optional[List[Path]] = None,
+    location_str: Optional[str] = None,
 ) -> List[Variant]:
     """Prompt Gemini for the full lineage and return a list[Variant]."""
     # Build campaign context
@@ -1060,6 +1076,21 @@ IMPORTANT:
 2. Include "campaign_id": "{campaign_info.campaign_id}" for each variant in your response.
 3. Use the lineage hint pattern above to identify which variants belong to this campaign.
 4. Include parent variants only if they are direct ancestors in this campaign's lineage.
+"""
+
+    # Add location context if provided
+    location_context = ""
+    if location_str:
+        location_context = f"""
+
+LOCATION CONTEXT:
+You are extracting data SPECIFICALLY from: {location_str}
+
+CRITICAL INSTRUCTIONS:
+- ONLY extract enzyme variants that appear in {location_str}
+- DO NOT include variants from other figures, tables, or sections
+- If {location_str} references variants from other locations, DO NOT include those unless they are explicitly shown in {location_str}
+- Focus strictly on the data presented within the boundaries of {location_str}
 """
 
     # Extract table of contents from PDFs if available
@@ -1096,8 +1127,11 @@ IMPORTANT:
     # Include TOC in the prompt text
     combined_text = toc_text + text if toc_text else text
 
+    # Combine campaign and location context
+    full_context = campaign_context + location_context
+
     prompt = _LINEAGE_EXTRACT_PROMPT.format(
-        campaign_context=campaign_context,
+        campaign_context=full_context,
         schema=_LINEAGE_SCHEMA_HINT,
         text=combined_text[:MAX_CHARS],
     )
@@ -1438,10 +1472,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
 
 # ---- 6.4 Public API -------------------------------------------------------
 
-def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
-    """Extract text from a specific location (table, section, etc.) in the full text.
+def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text.
+
+    Args:
+        full_text: The full text to search in
+        location: The location identifier (e.g., "Table S1")
+        location_type: Type of location ("table", "figure", "section")
+        caption_hint: Optional full caption text for fuzzy matching
+    """
     import re
 
+    # If caption hint is provided, try fuzzy matching first
+    if caption_hint and len(caption_hint) > 20:
+        log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+        # Normalize texts for better matching (similar to reaction_info_extractor)
+        def normalize_for_matching(text):
+            # Remove extra whitespace, normalize spaces around punctuation
+            text = ' '.join(text.split())
+            # Normalize different dash types
+            text = text.replace('–', '-').replace('—', '-')
+            return text
+
+        normalized_hint = normalize_for_matching(caption_hint[:150])  # Use first 150 chars
+        normalized_text = normalize_for_matching(full_text)
+
+        # Try to find ALL caption matches using character-based fuzzy matching
+        all_matches = []
+
+        # Slide through the text looking for all matches above threshold
+        hint_len = len(normalized_hint)
+        for i in range(len(normalized_text) - hint_len + 1):
+            snippet = normalized_text[i:i + hint_len]
+            # Simple character-based similarity
+            matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+            score = matches / hint_len
+
+            if score > 0.7:  # 70% similarity threshold
+                all_matches.append({
+                    'norm_pos': i,
+                    'score': score
+                })
+
+        # If we found matches, extract from all of them
+        if all_matches:
+            log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+            # Collect all occurrences from fuzzy matches
+            all_occurrences = []
+            seen_positions = set()
+
+            for match_info in all_matches:
+                # Get the matched text from normalized version
+                matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+                # Find where this appears in the original text
+                best_original_pos = -1
+
+                # Search in the original text for this specific match
+                for i in range(len(full_text) - len(caption_hint) + 1):
+                    if i in seen_positions:
+                        continue
+
+                    original_snippet = full_text[i:i + len(caption_hint)]
+                    # Normalize and compare
+                    normalized_snippet = normalize_for_matching(original_snippet)
+                    if normalized_snippet[:hint_len] == matched_normalized:
+                        # Found exact match after normalization
+                        best_original_pos = i
+                        seen_positions.add(i)
+                        break
+
+                if best_original_pos >= 0:
+                    # Extract generous context from this match position
+                    start = max(0, best_original_pos - 1000)
+                    end = min(len(full_text), best_original_pos + 10000)
+                    context = full_text[start:end]
+
+                    all_occurrences.append({
+                        'position': best_original_pos,
+                        'context': context,
+                        'score': match_info['score']
+                    })
+                    log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+            if all_occurrences:
+                # Sort by position to maintain document order
+                all_occurrences.sort(key=lambda x: x['position'])
+
+                # Combine all occurrences
+                combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+                for i, occurrence in enumerate(all_occurrences, 1):
+                    combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+                    combined_text += occurrence['context']
+                    combined_text += "\n\n"
+
+                # Apply same limit as table extraction
+                if len(combined_text) > 150000:
+                    combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+                log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+                return combined_text
+            else:
+                log.warning(f"Could not map any fuzzy matches back to original text")
+        else:
+            log.warning(f"No fuzzy matches found for caption above 70% threshold")
+
     if location_type == 'table':
         # Find ALL mentions of this table and combine them
         location_clean = location.strip()
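The fuzzy caption matching added above boils down to a brute-force sliding window over normalized text, scored by per-character identity. A condensed, standalone sketch (thresholds mirror the hunk; function names are illustrative):

```python
def normalize(text: str) -> str:
    # Collapse whitespace and unify en/em dashes, as in the hunk.
    text = " ".join(text.split())
    return text.replace("\u2013", "-").replace("\u2014", "-")

def find_caption_positions(full_text: str, caption_hint: str, threshold: float = 0.7):
    """Slide a window the size of the (truncated) hint across the text and
    keep every start position whose per-character identity beats the threshold."""
    hint = normalize(caption_hint[:150])
    text = normalize(full_text)
    n = len(hint)
    hits = []
    for i in range(len(text) - n + 1):
        window = text[i:i + n]
        score = sum(a == b for a, b in zip(hint, window)) / n
        if score > threshold:
            hits.append((i, score))
    return hits

print(find_caption_positions(
    "intro text... Table S1. Summary of mutations ...",
    "Table S1. Summary of mutations"))
```

Note the scan is O(len(text) × len(hint)); capping the hint at 150 characters, as the hunk does, keeps that tractable on full-paper text.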
@@ -1483,6 +1621,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
 
         log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
 
+        # Sort occurrences by position to maintain document order
+        all_occurrences.sort(key=lambda x: x['position'])
+
         # Combine all occurrences into one text for Gemini to analyze
         combined_text = f"=== All occurrences of {location_clean} ===\n\n"
 
@@ -1492,8 +1633,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
             combined_text += "\n\n"
 
         # Limit total length to avoid overwhelming the model
-
-
+        # Increased limit to ensure actual table content is included
+        if len(combined_text) > 150000:
+            combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
 
         return combined_text
 
@@ -1577,6 +1719,8 @@ def get_lineage(
     *,
     pdf_paths: Optional[List[Path]] = None,
    debug_dir: str | Path | None = None,
+    manuscript_text: Optional[str] = None,
+    si_text: Optional[str] = None,
 ) -> Tuple[List[Variant], List[Campaign]]:
     """
     High-level wrapper used by the pipeline.
@@ -1690,8 +1834,21 @@ def get_lineage(
|
|
1690
1834
|
if location_type in ['table', 'text', 'section'] and not extracted_variants:
|
1691
1835
|
log.info(f"Attempting text extraction for {location_type}: {location_str}")
|
1692
1836
|
|
1693
|
-
#
|
1694
|
-
|
1837
|
+
# Determine which text to use based on source
|
1838
|
+
location_source = location.get('source', 'manuscript')
|
1839
|
+
if location_source == 'si' and si_text:
|
1840
|
+
text_to_search = si_text
|
1841
|
+
log.info(f"Using SI text for location {location_str}")
|
1842
|
+
elif location_source == 'manuscript' and manuscript_text:
|
1843
|
+
text_to_search = manuscript_text
|
1844
|
+
log.info(f"Using manuscript text for location {location_str}")
|
1845
|
+
else:
|
1846
|
+
text_to_search = full_text
|
1847
|
+
log.info(f"Using combined text for location {location_str} (fallback)")
|
1848
|
+
|
1849
|
+
# Extract the specific section/table from appropriate text
|
1850
|
+
caption_hint = location.get('caption', '')
|
1851
|
+
section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
|
1695
1852
|
if section_text:
|
1696
1853
|
log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
|
1697
1854
|
# Save extracted section if debug enabled
|
@@ -1705,7 +1862,8 @@ def get_lineage(
                 debug_dir=debug_dir,
                 campaign_id=campaign.campaign_id,
                 campaign_info=campaign,
-                pdf_paths=pdf_paths
+                pdf_paths=pdf_paths,
+                location_str=location_str
             )
             if variants:
                 log.info(f"Extracted {len(variants)} variants from {location_type}")
@@ -2004,17 +2162,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
 
 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
-- "section": the section heading or description
+- "section": the section heading or description EXACTLY as it appears
 - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)
 
 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences
 
-CRITICAL:
-
--
+CRITICAL:
+1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+   - Correct: "53", "S12", "147"
+   - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+2. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Pages with "S" prefix (e.g., "S53") are typically in the SI
+   - Regular page numbers (e.g., "53") are typically in the main manuscript
+   - Use context clues from the document structure
 
 Return [] if no sequence sections are found.
 Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -2254,44 +2419,34 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
 
 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract
-
-
-
-
-
-
-
--
--
-
-
-
-
--
--
-
-
--
--
-
-
-
-
-
-
-Respond ONLY with **minified JSON** that matches the schema below.
-NO markdown, no code fences, no commentary.
-
-Schema:
-```json
-{schema}
-```
+Extract ALL enzyme variant sequences from the text.
+
+Rules:
+1. Use EXACT variant IDs as they appear with each sequence
+2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
+3. For each variant:
+   - If amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
+   - If ONLY DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
+   - NEVER include both aa_seq and dna_seq for the same variant
+   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
+4. Return ONLY minified JSON, no markdown or commentary
+
+CRITICAL SEQUENCE PRIORITY RULE:
+- If you find BOTH amino acid sequence AND DNA sequence for the same variant, ONLY return the amino acid sequence
+- Set dna_seq to null when aa_seq is available, even if DNA sequence is present in the text
+- Only return dna_seq when NO amino acid sequence exists for that variant
+
+CRITICAL ACCURACY REQUIREMENTS:
+- Extract ONLY sequences that are explicitly present in the provided text
+- DO NOT generate, infer, or hallucinate any sequences
+- Every character in the sequence must be directly copied from the text
+- If a sequence appears truncated or incomplete in the text, extract only what is shown
+- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+
+Schema: {schema}
 
-TEXT
-```
+TEXT:
 {text}
-```
 """.strip()
 
 def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
@@ -2366,7 +2521,7 @@ def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list,
|
|
2366
2521
|
|
2367
2522
|
|
2368
2523
|
def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
|
2369
|
-
"""Extract sequence JSON using Gemini with up to
|
2524
|
+
"""Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.
|
2370
2525
|
|
2371
2526
|
Can exit early after 2 attempts if the responses match exactly.
|
2372
2527
|
|
@@ -2380,9 +2535,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 6
+    max_attempts = 3  # Reduced from 6 to 3 for performance
 
-    # Try
+    # Try 3 times with early match detection
    for attempt in range(max_attempts):
        try:
            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2408,8 +2563,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
 
             # Try to parse as JSON
             try:
-
-
+                # First clean the response - remove any BOM or invisible characters
+                raw_clean = raw.strip()
+                if raw_clean.startswith('\ufeff'):  # Remove BOM if present
+                    raw_clean = raw_clean[1:]
+                parsed = json.loads(raw_clean)
+            except json.JSONDecodeError as e:
+                log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
                 # Look for JSON array or object in the response
                 json_start = -1
                 json_end = -1
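The BOM stripping added above guards a real failure mode: Python's `json.loads` rejects a leading U+FEFF. A minimal repro (the variant ID is made up):

```python
import json

raw = "\ufeff[{\"variant_id\": \"P411-A1\"}]"  # illustrative payload with a BOM
try:
    json.loads(raw)
except json.JSONDecodeError as e:
    print("fails with BOM:", e)

clean = raw.strip()
if clean.startswith("\ufeff"):
    clean = clean[1:]
print(json.loads(clean))  # [{'variant_id': 'P411-A1'}]
```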
@@ -2458,17 +2618,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             responses.append(parsed)
             log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
 
-            #
-            if
-
-            if
-
-            if
-
-
-
-
-
+            # If we got a good response with sequences, we can check for early termination
+            if isinstance(parsed, list) and len(parsed) > 0:
+                # Early match detection after 2 attempts
+                if attempt >= 1:  # After 2nd attempt (0-indexed)
+                    valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+                    if len(valid_responses_so_far) >= 2:
+                        # Check if the last two valid responses match
+                        if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                            log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                            # Add the matching response to fill remaining attempts
+                            for _ in range(max_attempts - attempt - 1):
+                                responses.append(valid_responses_so_far[-1])
+                            break
+                # If this is the first attempt and we got sequences, continue to validate with at least one more
+                elif attempt == 0 and len(parsed) > 5:  # Got substantial sequences on first try
+                    log.info("Got substantial sequences on first attempt, will validate with one more")
 
         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
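The consensus logic above, as a standalone sketch: up to three attempts, early exit when the last two valid responses agree, majority vote otherwise. `extract_once` stands in for the Gemini call, and serializing responses as the vote key is a simplification of the real `_check_sequence_responses_match`:

```python
from collections import Counter
import json

def extract_with_consensus(extract_once, max_attempts: int = 3):
    responses = []
    for attempt in range(max_attempts):
        result = extract_once()
        if result:
            # Canonical serialization so identical payloads compare equal.
            responses.append(json.dumps(result, sort_keys=True))
        if len(responses) >= 2 and responses[-1] == responses[-2]:
            break  # early match: two attempts agree, skip the third
    if not responses:
        return None
    best, _ = Counter(responses).most_common(1)[0]
    return json.loads(best)

# Usage with a stub extractor:
print(extract_with_consensus(lambda: [{"variant_id": "A", "aa_seq": "MK"}]))
```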
@@ -2828,9 +2993,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
     focused_text = ""
     if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
         page_num = best_location['page']
-        # Extract current page plus next
+        # Extract current page plus next 5 pages (6 total) to prevent hallucination
         all_pages = []
-        for i in range(
+        for i in range(6):  # Current + next 5 (6 pages total)
             if isinstance(page_num, str) and page_num.upper().startswith('S'):
                 next_page = f"S{int(page_num[1:]) + i}"
             else:
@@ -2842,7 +3007,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                 break
         if all_pages:
             focused_text = "\n".join(all_pages)
-            log.info("Extracted %d chars from pages %s through %d more pages",
+            log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
                      len(focused_text), page_num, len(all_pages) - 1)
 
     # Fallback to text search if page extraction didn't work
@@ -3128,6 +3293,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
     return {}
 
 
+def _match_variant_ids_with_gemini(
+    lineage_variant_ids: List[str],
+    pdb_variant_ids: List[str],
+    model
+) -> Dict[str, str]:
+    """Use Gemini to match variant IDs that may have slight formatting differences.
+
+    Args:
+        lineage_variant_ids: List of variant IDs from the lineage
+        pdb_variant_ids: List of variant IDs from PDB matching
+        model: Gemini model for matching
+
+    Returns:
+        Dictionary mapping lineage_variant_id -> pdb_variant_id
+    """
+    if not lineage_variant_ids or not pdb_variant_ids or not model:
+        return {}
+
+    # If the lists are identical, return direct mapping
+    if set(lineage_variant_ids) == set(pdb_variant_ids):
+        return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+    # Use Gemini to match variant IDs that may have formatting differences
+    prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+These represent the same enzyme variants but may be formatted differently.
+
+Lineage variant IDs:
+{json.dumps(lineage_variant_ids, indent=2)}
+
+PDB variant IDs:
+{json.dumps(pdb_variant_ids, indent=2)}
+
+Match variants that represent the SAME enzyme variant, accounting for:
+- Whitespace differences (extra spaces, tabs)
+- Character encoding differences
+- Minor formatting variations
+
+Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+Format: {{"lineage_id": "pdb_id", ...}}
+Only include matches you are confident represent the same variant.
+Return an empty object {{}} if no matches can be confidently made.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text.startswith("```"):
+            text = text.split("```")[1].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        # Clean up the text
+        text = text.strip()
+        if not text or text == "{}":
+            return {}
+
+        matches = json.loads(text)
+        log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+        # Validate matches
+        valid_matches = {}
+        for lineage_id, pdb_id in matches.items():
+            if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+                valid_matches[lineage_id] = pdb_id
+                log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+            else:
+                log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+        return valid_matches
+
+    except Exception as e:
+        log.warning(f"Failed to match variant IDs with Gemini: {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
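The scenario this helper targets: lineage IDs and PDB-matched IDs that differ only in whitespace or encoding, which defeats exact dictionary lookups. A toy illustration, plus a cheap deterministic pre-pass that would catch this particular case without an LLM call (IDs echo the "ApePgb GLVRSQL" example in the prompt below; the pre-pass is a suggestion, not code from this release):

```python
lineage_ids = ["ApePgb GLVRSQL", "ApePgb AGW"]
pdb_ids = ["ApePgb  GLVRSQL"]            # double space from PDF extraction

print("ApePgb GLVRSQL" in pdb_ids)        # False: exact lookup fails

# Whitespace-normalizing pre-pass that resolves this case deterministically:
norm = lambda s: " ".join(s.split())
mapping = {lid: pid for lid in lineage_ids for pid in pdb_ids
           if norm(lid) == norm(pid)}
print(mapping)                            # {'ApePgb GLVRSQL': 'ApePgb  GLVRSQL'}
```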
@@ -3211,24 +3453,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
         text = _extract_text(response).strip()
 
         # Parse JSON response (expecting a single string)
-
+        # Look for JSON code blocks first
+        if "```json" in text:
+            # Extract content between ```json and ```
+            import re
+            json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+            if json_match:
+                json_content = json_match.group(1).strip()
+                try:
+                    # Parse as JSON and extract the string value
+                    parsed = json.loads(json_content)
+                    matched_variant = str(parsed).strip('"\'')
+                except:
+                    # If JSON parsing fails, try to extract the quoted string
+                    quoted_match = re.search(r'"([^"]+)"', json_content)
+                    if quoted_match:
+                        matched_variant = quoted_match.group(1)
+                    else:
+                        matched_variant = json_content.strip('"\'')
+            else:
+                matched_variant = text.strip('"\'')
+        elif text.startswith("```"):
+            # Handle other code blocks
             text = text.split("```")[1].strip()
             if text.startswith("json"):
                 text = text[4:].strip()
+            matched_variant = text.strip('"\'')
+        else:
+            # Look for quoted strings in the response
+            import re
+            quoted_match = re.search(r'"([^"]+)"', text)
+            if quoted_match:
+                matched_variant = quoted_match.group(1)
+            else:
+                # Remove quotes if present
+                matched_variant = text.strip('"\'')
 
-
-        text = text.strip('"\'')
-
-        matched_variant = text
+        log.info(f"Extracted variant name: '{matched_variant}' from response")
         log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")
 
         # Return mapping with all chains pointing to the same variant
         mapping = {}
-        if matched_variant
-
-
-
+        if matched_variant:
+            # Debug logging
+            variant_ids = [v.variant_id for v in variants]
+            log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+            # Check if the matched variant exists in the lineage
+            found_variant = any(v.variant_id == matched_variant for v in variants)
+            log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+            if found_variant:
+                for chain_id in pdb_sequences:
+                    mapping[matched_variant] = chain_id
+                    log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+                    break  # Only use the first chain
+            else:
+                log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+                # Try fuzzy matching
+                for variant in variants:
+                    if variant.variant_id.strip() == matched_variant.strip():
+                        log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+                        for chain_id in pdb_sequences:
+                            mapping[variant.variant_id] = chain_id
+                            log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+                            break
+                        break
+        else:
+            log.warning("No matched variant extracted from response")
 
+        log.info(f"Final mapping result: {mapping}")
         return mapping
 
     except Exception as e:
@@ -3364,6 +3658,9 @@ Only match variants that represent the SAME enzyme, accounting for different nam
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
 Only include matches you are confident represent the same variant.
+
+DO NOT include any explanation, reasoning, or text other than the JSON object.
+Response must be valid JSON that starts with {{ and ends with }}
 """
 
     try:
@@ -3406,17 +3703,28 @@ Only include matches you are confident represent the same variant.
             log.error(f"Full cleaned text: {text}")
             # Try to extract JSON from within the response
             import re
-
-
+            # First try to find JSON in code blocks
+            code_block_match = re.search(r'```json\s*(\{[^`]*\})\s*```', text, re.DOTALL)
+            if code_block_match:
                 try:
-                    matches = json.loads(
-                    log.info(f"Successfully extracted JSON from
+                    matches = json.loads(code_block_match.group(1))
+                    log.info(f"Successfully extracted JSON from code block: {len(matches)} matches")
                 except json.JSONDecodeError:
-                    log.error("Failed to
+                    log.error("Failed to parse JSON from code block")
                     matches = {}
             else:
-
-
+                # Try to find standalone JSON object (non-greedy, looking for balanced braces)
+                json_match = re.search(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', text)
+                if json_match:
+                    try:
+                        matches = json.loads(json_match.group(1))
+                        log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
+                    except json.JSONDecodeError:
+                        log.error("Failed to extract JSON from response")
+                        matches = {}
+                else:
+                    log.error("No JSON object found in response")
+                    matches = {}
 
             # Create a mapping of sequence IDs to their data for efficient lookup
             seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
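A quick check of the standalone-object regex added above: it tolerates one level of nested braces, which covers the flat `{"lineage_id": "sequence_id"}` maps this prompt requests even when the model wraps them in prose:

```python
import re, json

text = 'Sure! Here is the mapping: {"WT": "seq_1", "M1": "seq_2"} Hope that helps.'
m = re.search(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', text)
if m:
    print(json.loads(m.group(1)))  # {'WT': 'seq_1', 'M1': 'seq_2'}
```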
@@ -3596,14 +3904,28 @@ def run_pipeline(
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)
 
+    # Also load separate texts for manuscript and SI
+    manuscript_text = limited_concat(manuscript) if manuscript else None
+    si_text = limited_concat(si_path) if si_path else None
+
     log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
              len(caption_text), len(full_text))
+    if manuscript_text:
+        log.info("Loaded %d chars from manuscript", len(manuscript_text))
+    if si_text:
+        log.info("Loaded %d chars from SI", len(si_text))
 
     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()
 
     # 3. Extract lineage (Section 6) ------------------------------------------
-    lineage, campaigns = get_lineage(
+    lineage, campaigns = get_lineage(
+        caption_text, full_text, model,
+        pdf_paths=pdf_paths,
+        debug_dir=debug_dir,
+        manuscript_text=manuscript_text,
+        si_text=si_text
+    )
 
     if not lineage:
         raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3683,12 +4005,40 @@ def run_pipeline(
|
|
3683
4005
|
pdb_sequences, lineage, full_text, model, pdb_id
|
3684
4006
|
)
|
3685
4007
|
|
4008
|
+
log.info(f"PDB matching result: {variant_to_chain}")
|
4009
|
+
log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
|
4010
|
+
log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
|
4011
|
+
|
3686
4012
|
# Convert to SequenceBlock objects
|
3687
4013
|
pdb_seq_blocks = []
|
3688
|
-
|
3689
|
-
|
3690
|
-
|
3691
|
-
|
4014
|
+
|
4015
|
+
# Use Gemini-based matching for robust variant ID comparison
|
4016
|
+
if variant_to_chain and model:
|
4017
|
+
# Create a mapping using Gemini for robust string matching
|
4018
|
+
gemini_mapping = _match_variant_ids_with_gemini(
|
4019
|
+
lineage_variant_ids=[v.variant_id for v in lineage],
|
4020
|
+
pdb_variant_ids=list(variant_to_chain.keys()),
|
4021
|
+
model=model
|
4022
|
+
)
|
4023
|
+
|
4024
|
+
for variant in lineage:
|
4025
|
+
log.info(f"Processing variant: {variant.variant_id}")
|
4026
|
+
|
4027
|
+
# Try direct match first
|
4028
|
+
chain_id = variant_to_chain.get(variant.variant_id)
|
4029
|
+
log.info(f"Direct match for {variant.variant_id}: {chain_id}")
|
4030
|
+
|
4031
|
+
# If no direct match, try Gemini-based matching
|
4032
|
+
if not chain_id:
|
4033
|
+
matched_pdb_variant = gemini_mapping.get(variant.variant_id)
|
4034
|
+
log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
|
4035
|
+
if matched_pdb_variant:
|
4036
|
+
chain_id = variant_to_chain.get(matched_pdb_variant)
|
4037
|
+
log.info(f"Chain ID from Gemini match: {chain_id}")
|
4038
|
+
|
4039
|
+
if chain_id and chain_id in pdb_sequences:
|
4040
|
+
seq_length = len(pdb_sequences[chain_id])
|
4041
|
+
log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
|
3692
4042
|
seq_block = SequenceBlock(
|
3693
4043
|
variant_id=variant.variant_id,
|
3694
4044
|
aa_seq=pdb_sequences[chain_id],
|
@@ -3699,6 +4049,26 @@ def run_pipeline(
|
|
3699
4049
|
)
|
3700
4050
|
pdb_seq_blocks.append(seq_block)
|
3701
4051
|
log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
|
4052
|
+
else:
|
4053
|
+
log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
|
4054
|
+
else:
|
4055
|
+
# Fallback to direct matching if no model or no matches
|
4056
|
+
for variant in lineage:
|
4057
|
+
if variant.variant_id in variant_to_chain:
|
4058
|
+
chain_id = variant_to_chain[variant.variant_id]
|
4059
|
+
if chain_id in pdb_sequences:
|
4060
|
+
seq_block = SequenceBlock(
|
4061
|
+
variant_id=variant.variant_id,
|
4062
|
+
aa_seq=pdb_sequences[chain_id],
|
4063
|
+
dna_seq=None,
|
4064
|
+
confidence=1.0, # High confidence for PDB sequences
|
4065
|
+
truncated=False,
|
4066
|
+
metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
|
4067
|
+
)
|
4068
|
+
pdb_seq_blocks.append(seq_block)
|
4069
|
+
log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
|
4070
|
+
|
4071
|
+
log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")
|
3702
4072
|
|
3703
4073
|
if pdb_seq_blocks:
|
3704
4074
|
# Update the dataframe with PDB sequences
|
@@ -3708,8 +4078,13 @@ def run_pipeline(
                         df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
                         df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
                         df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+                        log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+                    else:
+                        log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
                     log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                     break
+                else:
+                    log.warning(f"No PDB sequence blocks were created for {pdb_id}")
             else:
                 log.warning(f"No sequences found in PDB {pdb_id}")
         else: