debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/caption_pattern.py +7 -2
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +423 -86
- debase/lineage_format.py +44 -1
- debase/reaction_info_extractor.py +73 -61
- debase/substrate_scope_extractor.py +84 -32
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
- debase-0.6.2.dist-info/RECORD +18 -0
- debase-0.6.1.dist-info/RECORD +0 -18
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.1.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0
@@ -465,7 +465,7 @@ def get_model():
         "temperature": 0.0, # Deterministic: always pick the most likely token
         "top_p": 1.0, # Consider all tokens (but temperature=0 will pick the best)
         "top_k": 1, # Only consider the single most likely token
-        "max_output_tokens":
+        "max_output_tokens": 65536, # Increased to 2x for handling larger lineage tables and sequences
     }

     # For Gemini 2.5 Flash, disable thinking tokens to save costs
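For orientation, the four keys above are generation parameters of the google-generativeai client. A minimal sketch of how such a config is typically wired up (the model name and API-key handling are illustrative assumptions, not taken from the package):

```python
# Hedged sketch: constructing a Gemini model with the config shown above.
# The model name and environment variable are assumptions for illustration.
import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])  # assumed key source

generation_config = {
    "temperature": 0.0,          # deterministic decoding
    "top_p": 1.0,                # consider the full distribution
    "top_k": 1,                  # but only the single best token survives
    "max_output_tokens": 65536,  # the ceiling raised in this release
}

model = genai.GenerativeModel("gemini-2.5-flash",
                              generation_config=generation_config)
```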
@@ -758,13 +758,24 @@ mutations were introduced){campaign_specific}. Pay attention to the provided con
 ensure the location you return are actually lineage location with variants and mutations.

 Respond with a JSON array of objects, each containing:
-- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
 - "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
 {campaign_field}
-
-
+CRITICAL INSTRUCTIONS:
+1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+   - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+   - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+   - This should be the complete caption as it appears in the document
+   - Include at least 200-300 characters to ensure unique matching
+3. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Items like "Table S1", "Figure S2", etc. are typically in the SI
+   - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+   - If uncertain, use context clues from the text

 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figures showing complete variant lineages.
@@ -774,9 +785,9 @@ Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+  {{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
 ]
 """.strip()

@@ -1461,10 +1472,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

-def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
-    """Extract text from a specific location (table, section, etc.) in the full text.
+def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text.
+
+    Args:
+        full_text: The full text to search in
+        location: The location identifier (e.g., "Table S1")
+        location_type: Type of location ("table", "figure", "section")
+        caption_hint: Optional full caption text for fuzzy matching
+    """
     import re

+    # If caption hint is provided, try fuzzy matching first
+    if caption_hint and len(caption_hint) > 20:
+        log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+        # Normalize texts for better matching (similar to reaction_info_extractor)
+        def normalize_for_matching(text):
+            # Remove extra whitespace, normalize spaces around punctuation
+            text = ' '.join(text.split())
+            # Normalize different dash types
+            text = text.replace('–', '-').replace('—', '-')
+            return text
+
+        normalized_hint = normalize_for_matching(caption_hint[:150]) # Use first 150 chars
+        normalized_text = normalize_for_matching(full_text)
+
+        # Try to find ALL caption matches using character-based fuzzy matching
+        all_matches = []
+
+        # Slide through the text looking for all matches above threshold
+        hint_len = len(normalized_hint)
+        for i in range(len(normalized_text) - hint_len + 1):
+            snippet = normalized_text[i:i + hint_len]
+            # Simple character-based similarity
+            matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+            score = matches / hint_len
+
+            if score > 0.7: # 70% similarity threshold
+                all_matches.append({
+                    'norm_pos': i,
+                    'score': score
+                })
+
+        # If we found matches, extract from all of them
+        if all_matches:
+            log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+            # Collect all occurrences from fuzzy matches
+            all_occurrences = []
+            seen_positions = set()
+
+            for match_info in all_matches:
+                # Get the matched text from normalized version
+                matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+                # Find where this appears in the original text
+                best_original_pos = -1
+
+                # Search in the original text for this specific match
+                for i in range(len(full_text) - len(caption_hint) + 1):
+                    if i in seen_positions:
+                        continue
+
+                    original_snippet = full_text[i:i + len(caption_hint)]
+                    # Normalize and compare
+                    normalized_snippet = normalize_for_matching(original_snippet)
+                    if normalized_snippet[:hint_len] == matched_normalized:
+                        # Found exact match after normalization
+                        best_original_pos = i
+                        seen_positions.add(i)
+                        break
+
+                if best_original_pos >= 0:
+                    # Extract generous context from this match position
+                    start = max(0, best_original_pos - 1000)
+                    end = min(len(full_text), best_original_pos + 10000)
+                    context = full_text[start:end]
+
+                    all_occurrences.append({
+                        'position': best_original_pos,
+                        'context': context,
+                        'score': match_info['score']
+                    })
+                    log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+            if all_occurrences:
+                # Sort by position to maintain document order
+                all_occurrences.sort(key=lambda x: x['position'])
+
+                # Combine all occurrences
+                combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+                for i, occurrence in enumerate(all_occurrences, 1):
+                    combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+                    combined_text += occurrence['context']
+                    combined_text += "\n\n"
+
+                # Apply same limit as table extraction
+                if len(combined_text) > 150000:
+                    combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+                log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+                return combined_text
+            else:
+                log.warning(f"Could not map any fuzzy matches back to original text")
+        else:
+            log.warning(f"No fuzzy matches found for caption above 70% threshold")
+
     if location_type == 'table':
         # Find ALL mentions of this table and combine them
         location_clean = location.strip()
@@ -1506,6 +1621,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->

         log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")

+        # Sort occurrences by position to maintain document order
+        all_occurrences.sort(key=lambda x: x['position'])
+
         # Combine all occurrences into one text for Gemini to analyze
         combined_text = f"=== All occurrences of {location_clean} ===\n\n"

@@ -1515,8 +1633,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
             combined_text += "\n\n"

         # Limit total length to avoid overwhelming the model
-
-
+        # Increased limit to ensure actual table content is included
+        if len(combined_text) > 150000:
+            combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"

         return combined_text

@@ -1600,6 +1719,8 @@ def get_lineage(
     *,
     pdf_paths: Optional[List[Path]] = None,
     debug_dir: str | Path | None = None,
+    manuscript_text: Optional[str] = None,
+    si_text: Optional[str] = None,
 ) -> Tuple[List[Variant], List[Campaign]]:
     """
     High-level wrapper used by the pipeline.
@@ -1713,8 +1834,21 @@
         if location_type in ['table', 'text', 'section'] and not extracted_variants:
             log.info(f"Attempting text extraction for {location_type}: {location_str}")

-            #
-
+            # Determine which text to use based on source
+            location_source = location.get('source', 'manuscript')
+            if location_source == 'si' and si_text:
+                text_to_search = si_text
+                log.info(f"Using SI text for location {location_str}")
+            elif location_source == 'manuscript' and manuscript_text:
+                text_to_search = manuscript_text
+                log.info(f"Using manuscript text for location {location_str}")
+            else:
+                text_to_search = full_text
+                log.info(f"Using combined text for location {location_str} (fallback)")
+
+            # Extract the specific section/table from appropriate text
+            caption_hint = location.get('caption', '')
+            section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
             if section_text:
                 log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
                 # Save extracted section if debug enabled
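The new source routing is a small fallback chain; a condensed sketch (helper name hypothetical):

```python
# Sketch of the source-based routing above: honor the location's declared
# source when that text is available, otherwise fall back to the combined text.
from typing import Optional

def pick_text(source: str, manuscript_text: Optional[str],
              si_text: Optional[str], full_text: str) -> str:
    if source == "si" and si_text:
        return si_text
    if source == "manuscript" and manuscript_text:
        return manuscript_text
    return full_text  # combined fallback when the declared source is missing
```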
@@ -2028,17 +2162,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.

 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
-- "section": the section heading or description
+- "section": the section heading or description EXACTLY as it appears
 - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)

 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences

-CRITICAL:
-
--
+CRITICAL:
+1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+   - Correct: "53", "S12", "147"
+   - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+2. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Pages with "S" prefix (e.g., "S53") are typically in the SI
+   - Regular page numbers (e.g., "53") are typically in the main manuscript
+   - Use context clues from the document structure

 Return [] if no sequence sections are found.
 Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -2278,44 +2419,34 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:

 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract
-
-
-
-
-
-
-
--
--
-
-
-
-
--
--
-
-
--
--
-
-
-
-
-
-
-Respond ONLY with **minified JSON** that matches the schema below.
-NO markdown, no code fences, no commentary.
-
-Schema:
-```json
-{schema}
-```
+Extract ALL enzyme variant sequences from the text.
+
+Rules:
+1. Use EXACT variant IDs as they appear with each sequence
+2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
+3. For each variant:
+   - If amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
+   - If ONLY DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
+   - NEVER include both aa_seq and dna_seq for the same variant
+   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
+4. Return ONLY minified JSON, no markdown or commentary
+
+CRITICAL SEQUENCE PRIORITY RULE:
+- If you find BOTH amino acid sequence AND DNA sequence for the same variant, ONLY return the amino acid sequence
+- Set dna_seq to null when aa_seq is available, even if DNA sequence is present in the text
+- Only return dna_seq when NO amino acid sequence exists for that variant
+
+CRITICAL ACCURACY REQUIREMENTS:
+- Extract ONLY sequences that are explicitly present in the provided text
+- DO NOT generate, infer, or hallucinate any sequences
+- Every character in the sequence must be directly copied from the text
+- If a sequence appears truncated or incomplete in the text, extract only what is shown
+- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+
+Schema: {schema}

-TEXT
-```
+TEXT:
 {text}
-```
 """.strip()

 def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
@@ -2390,7 +2521,7 @@ def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list,


 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with up to
+    """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.

     Can exit early after 2 attempts if the responses match exactly.

@@ -2404,9 +2535,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 6
+    max_attempts = 3 # Reduced from 6 to 3 for performance

-    # Try
+    # Try 3 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2432,8 +2563,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s

             # Try to parse as JSON
             try:
-
-
+                # First clean the response - remove any BOM or invisible characters
+                raw_clean = raw.strip()
+                if raw_clean.startswith('\ufeff'): # Remove BOM if present
+                    raw_clean = raw_clean[1:]
+                parsed = json.loads(raw_clean)
+            except json.JSONDecodeError as e:
+                log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
                 # Look for JSON array or object in the response
                 json_start = -1
                 json_end = -1
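This fix matters because Python's json module refuses input that still begins with a UTF-8 byte-order mark; a minimal sketch of the cleanup step (the payload is illustrative):

```python
# Sketch of the BOM-tolerant parse above: strip whitespace and a leading
# U+FEFF before handing the model's raw reply to json.loads.
import json

def parse_model_json(raw: str):
    cleaned = raw.strip()
    if cleaned.startswith("\ufeff"):  # json.loads would raise on the BOM
        cleaned = cleaned[1:]
    return json.loads(cleaned)

print(parse_model_json('\ufeff[{"variant_id": "A1"}]'))  # [{'variant_id': 'A1'}]
```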
@@ -2482,17 +2618,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             responses.append(parsed)
             log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")

-            #
-            if
-
-            if
-
-            if
-
-
-
-
-
+            # If we got a good response with sequences, we can check for early termination
+            if isinstance(parsed, list) and len(parsed) > 0:
+                # Early match detection after 2 attempts
+                if attempt >= 1: # After 2nd attempt (0-indexed)
+                    valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+                    if len(valid_responses_so_far) >= 2:
+                        # Check if the last two valid responses match
+                        if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                            log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                            # Add the matching response to fill remaining attempts
+                            for _ in range(max_attempts - attempt - 1):
+                                responses.append(valid_responses_so_far[-1])
+                            break
+                # If this is the first attempt and we got sequences, continue to validate with at least one more
+                elif attempt == 0 and len(parsed) > 5: # Got substantial sequences on first try
+                    log.info("Got substantial sequences on first attempt, will validate with one more")

         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
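The retry policy now makes at most three calls and stops as soon as two valid responses agree; a compact sketch of that consensus loop (the attempt and comparison callables stand in for the module's internals):

```python
# Sketch of the early-exit consensus retry above: collect list-valued results
# and return as soon as the last two valid ones match.
def extract_with_consensus(attempt_fn, responses_match, max_attempts: int = 3):
    responses = []
    for _ in range(max_attempts):
        result = attempt_fn()
        if isinstance(result, list) and result:  # keep only non-empty lists
            responses.append(result)
            if len(responses) >= 2 and responses_match(responses[-2], responses[-1]):
                return responses[-1]  # two independent runs agree: stop early
    # otherwise fall back to the most common response, if any
    return max(responses, key=responses.count) if responses else None
```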
@@ -2852,9 +2993,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         focused_text = ""
         if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
             page_num = best_location['page']
-            # Extract current page plus next
+            # Extract current page plus next 5 pages (6 total) to prevent hallucination
             all_pages = []
-            for i in range(
+            for i in range(6): # Current + next 5 (6 pages total)
                 if isinstance(page_num, str) and page_num.upper().startswith('S'):
                     next_page = f"S{int(page_num[1:]) + i}"
                 else:
@@ -2866,7 +3007,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                         break
             if all_pages:
                 focused_text = "\n".join(all_pages)
-                log.info("Extracted %d chars from pages %s through %d more pages",
+                log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
                          len(focused_text), page_num, len(all_pages) - 1)

         # Fallback to text search if page extraction didn't work
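The six-page window preserves the "S" prefix used for supplementary pages; a small sketch of the label arithmetic:

```python
# Sketch of the page-window logic above: current page plus the next five,
# keeping SI-style "S" numbering intact.
def page_window(page_num: str, width: int = 6):
    labels = []
    for i in range(width):
        if page_num.upper().startswith("S"):
            labels.append(f"S{int(page_num[1:]) + i}")
        else:
            labels.append(str(int(page_num) + i))
    return labels

print(page_window("S53"))  # ['S53', 'S54', 'S55', 'S56', 'S57', 'S58']
```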
@@ -3152,6 +3293,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
         return {}


+def _match_variant_ids_with_gemini(
+    lineage_variant_ids: List[str],
+    pdb_variant_ids: List[str],
+    model
+) -> Dict[str, str]:
+    """Use Gemini to match variant IDs that may have slight formatting differences.
+
+    Args:
+        lineage_variant_ids: List of variant IDs from the lineage
+        pdb_variant_ids: List of variant IDs from PDB matching
+        model: Gemini model for matching
+
+    Returns:
+        Dictionary mapping lineage_variant_id -> pdb_variant_id
+    """
+    if not lineage_variant_ids or not pdb_variant_ids or not model:
+        return {}
+
+    # If the lists are identical, return direct mapping
+    if set(lineage_variant_ids) == set(pdb_variant_ids):
+        return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+    # Use Gemini to match variant IDs that may have formatting differences
+    prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+These represent the same enzyme variants but may be formatted differently.
+
+Lineage variant IDs:
+{json.dumps(lineage_variant_ids, indent=2)}
+
+PDB variant IDs:
+{json.dumps(pdb_variant_ids, indent=2)}
+
+Match variants that represent the SAME enzyme variant, accounting for:
+- Whitespace differences (extra spaces, tabs)
+- Character encoding differences
+- Minor formatting variations
+
+Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+Format: {{"lineage_id": "pdb_id", ...}}
+Only include matches you are confident represent the same variant.
+Return an empty object {{}} if no matches can be confidently made.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text.startswith("```"):
+            text = text.split("```")[1].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        # Clean up the text
+        text = text.strip()
+        if not text or text == "{}":
+            return {}
+
+        matches = json.loads(text)
+        log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+        # Validate matches
+        valid_matches = {}
+        for lineage_id, pdb_id in matches.items():
+            if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+                valid_matches[lineage_id] = pdb_id
+                log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+            else:
+                log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+        return valid_matches
+
+    except Exception as e:
+        log.warning(f"Failed to match variant IDs with Gemini: {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
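An illustrative call of the new helper (the variant IDs are hypothetical; get_model() is the module's own factory shown earlier in this diff):

```python
# Hypothetical usage: reconcile lineage IDs against PDB-derived IDs that
# differ only in whitespace. Requires a configured Gemini model.
mapping = _match_variant_ids_with_gemini(
    lineage_variant_ids=["ApePgb GLVRSQL", "ApePgb AGW"],
    pdb_variant_ids=["ApePgb  GLVRSQL"],  # note the doubled space
    model=get_model(),
)
# Expected shape: {"ApePgb GLVRSQL": "ApePgb  GLVRSQL"} when Gemini is confident.
```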
@@ -3235,24 +3453,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
         text = _extract_text(response).strip()

         # Parse JSON response (expecting a single string)
-
+        # Look for JSON code blocks first
+        if "```json" in text:
+            # Extract content between ```json and ```
+            import re
+            json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+            if json_match:
+                json_content = json_match.group(1).strip()
+                try:
+                    # Parse as JSON and extract the string value
+                    parsed = json.loads(json_content)
+                    matched_variant = str(parsed).strip('"\'')
+                except:
+                    # If JSON parsing fails, try to extract the quoted string
+                    quoted_match = re.search(r'"([^"]+)"', json_content)
+                    if quoted_match:
+                        matched_variant = quoted_match.group(1)
+                    else:
+                        matched_variant = json_content.strip('"\'')
+            else:
+                matched_variant = text.strip('"\'')
+        elif text.startswith("```"):
+            # Handle other code blocks
             text = text.split("```")[1].strip()
             if text.startswith("json"):
                 text = text[4:].strip()
+            matched_variant = text.strip('"\'')
+        else:
+            # Look for quoted strings in the response
+            import re
+            quoted_match = re.search(r'"([^"]+)"', text)
+            if quoted_match:
+                matched_variant = quoted_match.group(1)
+            else:
+                # Remove quotes if present
+                matched_variant = text.strip('"\'')

-
-        text = text.strip('"\'')
-
-        matched_variant = text
+        log.info(f"Extracted variant name: '{matched_variant}' from response")
         log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")

         # Return mapping with all chains pointing to the same variant
         mapping = {}
-        if matched_variant
-
-
-
+        if matched_variant:
+            # Debug logging
+            variant_ids = [v.variant_id for v in variants]
+            log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+            # Check if the matched variant exists in the lineage
+            found_variant = any(v.variant_id == matched_variant for v in variants)
+            log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+            if found_variant:
+                for chain_id in pdb_sequences:
+                    mapping[matched_variant] = chain_id
+                    log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+                    break # Only use the first chain
+            else:
+                log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+                # Try fuzzy matching
+                for variant in variants:
+                    if variant.variant_id.strip() == matched_variant.strip():
+                        log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+                        for chain_id in pdb_sequences:
+                            mapping[variant.variant_id] = chain_id
+                            log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+                            break
+                        break
+        else:
+            log.warning("No matched variant extracted from response")

+        log.info(f"Final mapping result: {mapping}")
         return mapping

     except Exception as e:
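The parsing ladder above condenses to: prefer a backtick-fenced json block, then any quoted string, then the raw text stripped of quotes. A self-contained sketch (the fence string is assembled at runtime only so this snippet stays embeddable in this page):

```python
# Sketch of the three-stage variant-name parse above.
import json
import re

TICKS = "`" * 3  # literal triple backtick, built at runtime

def extract_variant_name(text: str) -> str:
    fence = re.search(TICKS + r"json\s*\n?(.*?)\n?" + TICKS, text, re.DOTALL)
    if fence:
        body = fence.group(1).strip()
        try:
            return str(json.loads(body)).strip("\"'")
        except json.JSONDecodeError:
            quoted = re.search(r'"([^"]+)"', body)
            return quoted.group(1) if quoted else body.strip("\"'")
    quoted = re.search(r'"([^"]+)"', text)
    return quoted.group(1) if quoted else text.strip("\"'")

reply = TICKS + 'json\n"ApePgb GLVRSQL"\n' + TICKS
print(extract_variant_name(reply))  # ApePgb GLVRSQL
```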
@@ -3634,14 +3904,28 @@ def run_pipeline(
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)

+    # Also load separate texts for manuscript and SI
+    manuscript_text = limited_concat(manuscript) if manuscript else None
+    si_text = limited_concat(si_path) if si_path else None
+
     log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
              len(caption_text), len(full_text))
+    if manuscript_text:
+        log.info("Loaded %d chars from manuscript", len(manuscript_text))
+    if si_text:
+        log.info("Loaded %d chars from SI", len(si_text))

     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()

     # 3. Extract lineage (Section 6) ------------------------------------------
-    lineage, campaigns = get_lineage(
+    lineage, campaigns = get_lineage(
+        caption_text, full_text, model,
+        pdf_paths=pdf_paths,
+        debug_dir=debug_dir,
+        manuscript_text=manuscript_text,
+        si_text=si_text
+    )

     if not lineage:
         raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3721,12 +4005,40 @@ def run_pipeline(
                 pdb_sequences, lineage, full_text, model, pdb_id
             )

+            log.info(f"PDB matching result: {variant_to_chain}")
+            log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+            log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
             # Convert to SequenceBlock objects
             pdb_seq_blocks = []
-
-
-
-
+
+            # Use Gemini-based matching for robust variant ID comparison
+            if variant_to_chain and model:
+                # Create a mapping using Gemini for robust string matching
+                gemini_mapping = _match_variant_ids_with_gemini(
+                    lineage_variant_ids=[v.variant_id for v in lineage],
+                    pdb_variant_ids=list(variant_to_chain.keys()),
+                    model=model
+                )
+
+                for variant in lineage:
+                    log.info(f"Processing variant: {variant.variant_id}")
+
+                    # Try direct match first
+                    chain_id = variant_to_chain.get(variant.variant_id)
+                    log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+                    # If no direct match, try Gemini-based matching
+                    if not chain_id:
+                        matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+                        log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+                        if matched_pdb_variant:
+                            chain_id = variant_to_chain.get(matched_pdb_variant)
+                            log.info(f"Chain ID from Gemini match: {chain_id}")
+
+                    if chain_id and chain_id in pdb_sequences:
+                        seq_length = len(pdb_sequences[chain_id])
+                        log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
                         seq_block = SequenceBlock(
                             variant_id=variant.variant_id,
                             aa_seq=pdb_sequences[chain_id],
@@ -3737,6 +4049,26 @@ def run_pipeline(
                         )
                         pdb_seq_blocks.append(seq_block)
                         log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+                    else:
+                        log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+            else:
+                # Fallback to direct matching if no model or no matches
+                for variant in lineage:
+                    if variant.variant_id in variant_to_chain:
+                        chain_id = variant_to_chain[variant.variant_id]
+                        if chain_id in pdb_sequences:
+                            seq_block = SequenceBlock(
+                                variant_id=variant.variant_id,
+                                aa_seq=pdb_sequences[chain_id],
+                                dna_seq=None,
+                                confidence=1.0, # High confidence for PDB sequences
+                                truncated=False,
+                                metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+                            )
+                            pdb_seq_blocks.append(seq_block)
+                            log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+            log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")

             if pdb_seq_blocks:
                 # Update the dataframe with PDB sequences
@@ -3746,8 +4078,13 @@ def run_pipeline(
                         df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
                         df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
                         df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+                        log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+                    else:
+                        log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
                 log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                 break
+            else:
+                log.warning(f"No PDB sequence blocks were created for {pdb_id}")
         else:
             log.warning(f"No sequences found in PDB {pdb_id}")
     else: