debase 0.4.5__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +123 -0
- debase/enzyme_lineage_extractor.py +243 -309
- debase/reaction_info_extractor.py +192 -68
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/METADATA +1 -1
- debase-0.5.1.dist-info/RECORD +16 -0
- debase-0.4.5.dist-info/RECORD +0 -16
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/WHEEL +0 -0
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/entry_points.txt +0 -0
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.5.dist-info → debase-0.5.1.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ import pandas as pd
|
|
24
24
|
import networkx as nx # light dependency, used only for generation inference
|
25
25
|
|
26
26
|
import os
|
27
|
+
import fitz
|
27
28
|
import re
|
28
29
|
import json
|
29
30
|
import time
|
@@ -460,8 +461,32 @@ def get_model():
|
|
460
461
|
if not api_key:
|
461
462
|
raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
|
462
463
|
_genai.configure(api_key=api_key)
|
463
|
-
|
464
|
-
|
464
|
+
|
465
|
+
# Create generation config to optimize performance and costs
|
466
|
+
generation_config = {
|
467
|
+
"temperature": 0.0, # Deterministic: always pick the most likely token
|
468
|
+
"top_p": 1.0, # Consider all tokens (but temperature=0 will pick the best)
|
469
|
+
"top_k": 1, # Only consider the single most likely token
|
470
|
+
"max_output_tokens": 32768, # Increased from 8192 to handle larger sequence extractions
|
471
|
+
}
|
472
|
+
|
473
|
+
# For Gemini 2.5 Flash, disable thinking tokens to save costs
|
474
|
+
# thinking_budget=0 disables thinking, -1 enables dynamic thinking (default)
|
475
|
+
# Only add if SDK supports it to maintain compatibility
|
476
|
+
try:
|
477
|
+
# Test if thinking_budget is supported by making a minimal API call
|
478
|
+
test_config = {"thinking_budget": 0, "max_output_tokens": 10}
|
479
|
+
test_model = _genai.GenerativeModel(MODEL_NAME, generation_config=test_config)
|
480
|
+
# Actually test the API call to see if thinking_budget is supported
|
481
|
+
test_response = test_model.generate_content("Return 'OK'")
|
482
|
+
# If no error, add thinking_budget to main config
|
483
|
+
generation_config["thinking_budget"] = 0
|
484
|
+
log.debug("Disabled thinking tokens (thinking_budget=0)")
|
485
|
+
except Exception as e:
|
486
|
+
# SDK doesn't support thinking_budget, continue without it
|
487
|
+
log.debug(f"thinking_budget not supported: {e}")
|
488
|
+
|
489
|
+
return _genai.GenerativeModel(MODEL_NAME, generation_config=generation_config)
|
465
490
|
|
466
491
|
# === 5.3 Unified call helper ----------------------------------------------
|
467
492
|
|
@@ -728,22 +753,24 @@ Return a JSON object with:
|
|
728
753
|
_LINEAGE_LOC_PROMPT = """
|
729
754
|
You are an expert reader of protein engineering manuscripts.
|
730
755
|
{campaign_context}
|
731
|
-
Given the following article text, list up to {max_results} *locations* (
|
732
|
-
|
733
|
-
|
734
|
-
|
756
|
+
Given the following article text, list up to {max_results} *locations* (figure/table IDs
|
757
|
+
or section headings) that you would review first to find the COMPLETE evolutionary
|
758
|
+
lineage of enzyme variants (i.e. which variant came from which parent and what
|
759
|
+
mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
|
760
|
+
ensure the location you return are actually lineage location with variants and mutations.
|
735
761
|
|
736
762
|
Respond with a JSON array of objects, each containing:
|
737
|
-
- "location": the identifier (e.g. "Table S1", "Figure 2B", "
|
738
|
-
- "type": one of "table", "figure", "
|
763
|
+
- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
|
764
|
+
- "type": one of "table", "figure", "section"
|
739
765
|
- "confidence": your confidence score (0-100) that this location contains lineage data
|
740
766
|
- "reason": brief explanation of why this location likely contains lineage
|
741
767
|
{campaign_field}
|
742
|
-
IMPORTANT:
|
768
|
+
IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
|
769
|
+
NOT page numbers. Focus on the actual figure/table titles and numbers.
|
743
770
|
|
744
771
|
Order by confidence score (highest first). Tables showing complete variant lineages or
|
745
|
-
mutation lists should be ranked higher than
|
746
|
-
|
772
|
+
mutation lists should be ranked higher than figures showing complete variant lineages.
|
773
|
+
Sections are used when no suitable tables/figures exist.
|
747
774
|
|
748
775
|
Don't include oligonucleotide results or result from only one round.
|
749
776
|
|
@@ -1713,7 +1740,6 @@ def get_lineage(
|
|
1713
1740
|
for pdf_path in pdf_paths:
|
1714
1741
|
# Extract first few pages looking for TOC
|
1715
1742
|
try:
|
1716
|
-
import fitz # PyMuPDF
|
1717
1743
|
doc = fitz.open(pdf_path)
|
1718
1744
|
toc_text = ""
|
1719
1745
|
for page_num in range(min(5, doc.page_count)): # First 5 pages
|
@@ -2011,7 +2037,7 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
|
|
2011
2037
|
|
2012
2038
|
# --- 7.2 Page-based extraction helper ---------------------------------------
|
2013
2039
|
def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
|
2014
|
-
"""Extract plain text sequence using Gemini with
|
2040
|
+
"""Extract plain text sequence using Gemini with 6 attempts, returning most common result.
|
2015
2041
|
|
2016
2042
|
Args:
|
2017
2043
|
prompt: The prompt to send to Gemini
|
@@ -2019,12 +2045,12 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
|
|
2019
2045
|
context: Additional context for logging (e.g., "validation" or "extraction")
|
2020
2046
|
|
2021
2047
|
Returns:
|
2022
|
-
The
|
2048
|
+
The most common sequence or None if all attempts failed
|
2023
2049
|
"""
|
2024
2050
|
sequences = []
|
2025
|
-
max_attempts =
|
2051
|
+
max_attempts = 6
|
2026
2052
|
|
2027
|
-
# Try
|
2053
|
+
# Try 6 times
|
2028
2054
|
for attempt in range(max_attempts):
|
2029
2055
|
try:
|
2030
2056
|
response = model.generate_content(prompt)
|
@@ -2050,38 +2076,14 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
|
|
2050
2076
|
except Exception as e:
|
2051
2077
|
log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
|
2052
2078
|
sequences.append("ERROR")
|
2053
|
-
|
2054
|
-
# Check for early consensus after 2 attempts
|
2055
|
-
if len(sequences) == 2:
|
2056
|
-
# Clean sequences before comparison
|
2057
|
-
seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
|
2058
|
-
seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
|
2059
|
-
|
2060
|
-
if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
|
2061
|
-
log.info(f"Gemini {context} consensus reached after 2 attempts")
|
2062
|
-
return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
|
2063
|
-
else:
|
2064
|
-
log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
|
2065
2079
|
|
2066
|
-
# After all attempts, find
|
2080
|
+
# After all attempts, find most common result
|
2067
2081
|
valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
|
2068
2082
|
|
2069
2083
|
if not valid_sequences:
|
2070
2084
|
log.error(f"All {max_attempts} {context} attempts failed")
|
2071
2085
|
return None
|
2072
2086
|
|
2073
|
-
# Find any matching pair
|
2074
|
-
for i in range(len(sequences)):
|
2075
|
-
for j in range(i + 1, len(sequences)):
|
2076
|
-
# Clean sequences before comparison
|
2077
|
-
seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
|
2078
|
-
seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
|
2079
|
-
|
2080
|
-
if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
|
2081
|
-
log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
|
2082
|
-
return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
|
2083
|
-
|
2084
|
-
# If no exact match, use adaptive validation
|
2085
2087
|
# Count occurrences of each valid sequence
|
2086
2088
|
sequence_counts = {}
|
2087
2089
|
for seq in valid_sequences:
|
@@ -2090,80 +2092,16 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
|
|
2090
2092
|
seq_clean = seq.replace(" ", "").replace("\n", "")
|
2091
2093
|
sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
|
2092
2094
|
|
2093
|
-
# Return the most common sequence
|
2095
|
+
# Return the most common sequence
|
2094
2096
|
if sequence_counts:
|
2095
2097
|
most_common = max(sequence_counts.items(), key=lambda x: x[1])
|
2096
|
-
|
2097
|
-
|
2098
|
-
return most_common[0]
|
2098
|
+
log.info(f"Gemini {context} most common: sequence appeared {most_common[1]}/{max_attempts} times")
|
2099
|
+
return most_common[0]
|
2099
2100
|
|
2100
|
-
log.warning(f"Gemini {context} no
|
2101
|
+
log.warning(f"Gemini {context} no valid sequences after {max_attempts} attempts")
|
2101
2102
|
return None
|
2102
2103
|
|
2103
2104
|
|
2104
|
-
def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
|
2105
|
-
"""Validate and potentially correct a sequence using Gemini by checking against known mutations."""
|
2106
|
-
|
2107
|
-
# Extract mutations from variants
|
2108
|
-
mutations = []
|
2109
|
-
for variant in variants:
|
2110
|
-
if variant.mutations:
|
2111
|
-
mutations.extend(variant.mutations)
|
2112
|
-
|
2113
|
-
if not mutations:
|
2114
|
-
return None
|
2115
|
-
|
2116
|
-
# Take a sample of mutations for validation
|
2117
|
-
sample_mutations = mutations[:10] # Check first 10 mutations
|
2118
|
-
|
2119
|
-
# First do a quick local check for obvious inconsistencies
|
2120
|
-
local_issues = []
|
2121
|
-
for mutation in sample_mutations:
|
2122
|
-
if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
|
2123
|
-
pos = mutation.position - 1 # Convert to 0-indexed
|
2124
|
-
if 0 <= pos < len(sequence):
|
2125
|
-
actual_aa = sequence[pos]
|
2126
|
-
expected_aa = mutation.original
|
2127
|
-
if actual_aa != expected_aa:
|
2128
|
-
local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
|
2129
|
-
|
2130
|
-
if not local_issues:
|
2131
|
-
return None # No obvious issues found
|
2132
|
-
|
2133
|
-
log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
|
2134
|
-
|
2135
|
-
prompt = f"""
|
2136
|
-
You are validating a protein sequence that was extracted from a scientific paper.
|
2137
|
-
The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
|
2138
|
-
|
2139
|
-
Original sequence (length {len(sequence)}):
|
2140
|
-
{sequence}
|
2141
|
-
|
2142
|
-
Known mutations that should be applicable to this sequence:
|
2143
|
-
{', '.join(str(m) for m in sample_mutations)}
|
2144
|
-
|
2145
|
-
Potential issues detected:
|
2146
|
-
{chr(10).join(local_issues)}
|
2147
|
-
|
2148
|
-
Please check if the sequence is consistent with these mutations:
|
2149
|
-
1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
|
2150
|
-
2. If you find inconsistencies, suggest the most likely correction
|
2151
|
-
3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
|
2152
|
-
4. Pay special attention to consecutive identical amino acids that might be OCR errors
|
2153
|
-
|
2154
|
-
Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
|
2155
|
-
If you cannot determine the correct sequence, return "UNCERTAIN".
|
2156
|
-
"""
|
2157
|
-
|
2158
|
-
# Use triple validation
|
2159
|
-
result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
|
2160
|
-
|
2161
|
-
if result == "VALID" or result is None:
|
2162
|
-
return None # No changes needed
|
2163
|
-
else:
|
2164
|
-
log.info(f"Gemini suggested sequence correction (length {len(result)})")
|
2165
|
-
return result
|
2166
|
-
|
2167
2105
|
|
2168
2106
|
def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
|
2169
2107
|
"""Extract text from a specific page number in the PDFs.
|
@@ -2331,11 +2269,11 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
|
|
2331
2269
|
|
2332
2270
|
SEQUENCE EXTRACTION RULES:
|
2333
2271
|
- Copy sequences EXACTLY as they appear in the text
|
2334
|
-
- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
|
2335
|
-
- Do NOT add, remove, or modify any amino acids
|
2272
|
+
- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
|
2273
|
+
- Do NOT add, remove, or modify any amino acids, or nucleotides
|
2336
2274
|
- Preserve the exact length and character sequence
|
2337
2275
|
- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
|
2338
|
-
- Double-check that consecutive identical amino acids are copied correctly
|
2276
|
+
- Double-check that consecutive identical amino acids or nucleotides are copied correctly
|
2339
2277
|
|
2340
2278
|
For each variant return:
|
2341
2279
|
* variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
|
@@ -2356,8 +2294,81 @@ TEXT (may be truncated):
|
|
2356
2294
|
```
|
2357
2295
|
""".strip()
|
2358
2296
|
|
2297
|
+
def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
|
2298
|
+
"""
|
2299
|
+
Check if two sequence extraction responses match.
|
2300
|
+
|
2301
|
+
Args:
|
2302
|
+
resp1: First response (list of sequences or dict)
|
2303
|
+
resp2: Second response (list of sequences or dict)
|
2304
|
+
|
2305
|
+
Returns:
|
2306
|
+
True if responses match, False otherwise
|
2307
|
+
"""
|
2308
|
+
# Handle None cases
|
2309
|
+
if resp1 is None or resp2 is None:
|
2310
|
+
return False
|
2311
|
+
|
2312
|
+
# Both should be the same type
|
2313
|
+
if type(resp1) != type(resp2):
|
2314
|
+
return False
|
2315
|
+
|
2316
|
+
# If both are lists
|
2317
|
+
if isinstance(resp1, list) and isinstance(resp2, list):
|
2318
|
+
# Must have same length
|
2319
|
+
if len(resp1) != len(resp2):
|
2320
|
+
return False
|
2321
|
+
|
2322
|
+
# Create normalized sequence sets for comparison
|
2323
|
+
seq_set1 = set()
|
2324
|
+
seq_set2 = set()
|
2325
|
+
|
2326
|
+
for seq in resp1:
|
2327
|
+
if isinstance(seq, dict):
|
2328
|
+
variant_id = seq.get("variant_id", "")
|
2329
|
+
aa_seq = seq.get("aa_seq")
|
2330
|
+
dna_seq = seq.get("dna_seq")
|
2331
|
+
# Handle None/null values - convert to empty string for comparison
|
2332
|
+
if aa_seq is None:
|
2333
|
+
aa_seq = ""
|
2334
|
+
else:
|
2335
|
+
aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
|
2336
|
+
if dna_seq is None:
|
2337
|
+
dna_seq = ""
|
2338
|
+
else:
|
2339
|
+
dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
|
2340
|
+
seq_set1.add(f"{variant_id}|{aa_seq}|{dna_seq}")
|
2341
|
+
|
2342
|
+
for seq in resp2:
|
2343
|
+
if isinstance(seq, dict):
|
2344
|
+
variant_id = seq.get("variant_id", "")
|
2345
|
+
aa_seq = seq.get("aa_seq")
|
2346
|
+
dna_seq = seq.get("dna_seq")
|
2347
|
+
# Handle None/null values - convert to empty string for comparison
|
2348
|
+
if aa_seq is None:
|
2349
|
+
aa_seq = ""
|
2350
|
+
else:
|
2351
|
+
aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
|
2352
|
+
if dna_seq is None:
|
2353
|
+
dna_seq = ""
|
2354
|
+
else:
|
2355
|
+
dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
|
2356
|
+
seq_set2.add(f"{variant_id}|{aa_seq}|{dna_seq}")
|
2357
|
+
|
2358
|
+
return seq_set1 == seq_set2
|
2359
|
+
|
2360
|
+
# If both are dicts, compare normalized content
|
2361
|
+
if isinstance(resp1, dict) and isinstance(resp2, dict):
|
2362
|
+
# Normalize and compare
|
2363
|
+
return json.dumps(resp1, sort_keys=True) == json.dumps(resp2, sort_keys=True)
|
2364
|
+
|
2365
|
+
return False
|
2366
|
+
|
2367
|
+
|
2359
2368
|
def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
|
2360
|
-
"""Extract sequence JSON using Gemini with
|
2369
|
+
"""Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
|
2370
|
+
|
2371
|
+
Can exit early after 2 attempts if the responses match exactly.
|
2361
2372
|
|
2362
2373
|
Args:
|
2363
2374
|
model: The Gemini model instance
|
@@ -2366,12 +2377,12 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
|
|
2366
2377
|
debug_dir: Optional debug directory
|
2367
2378
|
|
2368
2379
|
Returns:
|
2369
|
-
The
|
2380
|
+
The most common sequence JSON data or None if all attempts failed
|
2370
2381
|
"""
|
2371
2382
|
responses = []
|
2372
|
-
max_attempts =
|
2383
|
+
max_attempts = 6
|
2373
2384
|
|
2374
|
-
# Try
|
2385
|
+
# Try 6 times with early match detection
|
2375
2386
|
for attempt in range(max_attempts):
|
2376
2387
|
try:
|
2377
2388
|
log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
|
@@ -2443,167 +2454,69 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
|
|
2443
2454
|
else:
|
2444
2455
|
raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
|
2445
2456
|
|
2446
|
-
# Store
|
2447
|
-
|
2448
|
-
|
2449
|
-
|
2450
|
-
|
2457
|
+
# Store the response
|
2458
|
+
responses.append(parsed)
|
2459
|
+
log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
|
2460
|
+
|
2461
|
+
# Early match detection after 2 attempts
|
2462
|
+
if attempt >= 1: # After 2nd attempt (0-indexed)
|
2463
|
+
valid_responses_so_far = [r for r in responses if r is not None]
|
2464
|
+
if len(valid_responses_so_far) >= 2:
|
2465
|
+
# Check if the last two valid responses match
|
2466
|
+
if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
|
2467
|
+
log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
|
2468
|
+
# Pad with copies of the matching response for the remaining attempts to simulate consensus
|
2469
|
+
for _ in range(max_attempts - attempt - 1):
|
2470
|
+
responses.append(valid_responses_so_far[-1])
|
2471
|
+
break
|
2451
2472
|
|
2452
2473
|
except Exception as e:
|
2453
2474
|
log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
|
2454
2475
|
responses.append(None)
|
2455
|
-
|
2456
|
-
# Check for early consensus after 2 attempts
|
2457
|
-
if len(responses) == 2:
|
2458
|
-
if (responses[0] and responses[1] and
|
2459
|
-
_sequences_match(responses[0][1], responses[1][1])):
|
2460
|
-
log.info("Sequence extraction consensus reached after 2 attempts")
|
2461
|
-
return responses[0][0] # Return original parsed data
|
2462
|
-
else:
|
2463
|
-
log.info("Sequence extraction mismatch after 2 attempts - trying third")
|
2464
2476
|
|
2465
|
-
# After all attempts,
|
2477
|
+
# After all attempts, find most common sequences
|
2466
2478
|
valid_responses = [r for r in responses if r is not None]
|
2467
2479
|
|
2468
2480
|
if not valid_responses:
|
2469
2481
|
log.error(f"All {max_attempts} sequence extraction attempts failed")
|
2470
2482
|
return None
|
2471
2483
|
|
2472
|
-
#
|
2473
|
-
for i in range(len(valid_responses)):
|
2474
|
-
for j in range(i + 1, len(valid_responses)):
|
2475
|
-
if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
|
2476
|
-
log.info(f"Sequence extraction consensus found: attempts with matching content")
|
2477
|
-
return valid_responses[i][0] # Return original parsed data
|
2478
|
-
|
2479
|
-
# If no exact consensus, use adaptive validation
|
2480
|
-
log.info("No exact consensus found, applying adaptive validation...")
|
2481
|
-
|
2482
|
-
# Find sequences that appear consistently across multiple attempts
|
2483
|
-
consistent_sequences = _find_consistent_sequences(valid_responses)
|
2484
|
-
|
2485
|
-
if consistent_sequences:
|
2486
|
-
log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
|
2487
|
-
return consistent_sequences
|
2488
|
-
|
2489
|
-
# If still no consensus, use the attempt with the most sequences
|
2490
|
-
best_response = max(valid_responses,
|
2491
|
-
key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
|
2492
|
-
|
2493
|
-
if best_response and len(best_response[1]) > 0:
|
2494
|
-
log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
|
2495
|
-
return best_response[0]
|
2496
|
-
|
2497
|
-
log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
|
2498
|
-
return None
|
2499
|
-
|
2500
|
-
|
2501
|
-
def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
|
2502
|
-
"""Find sequences that appear consistently across multiple extraction attempts.
|
2503
|
-
|
2504
|
-
Args:
|
2505
|
-
valid_responses: List of (original_data, normalized_data) tuples
|
2506
|
-
|
2507
|
-
Returns:
|
2508
|
-
List of consistent sequences with confidence scores, or None if none found
|
2509
|
-
"""
|
2510
|
-
if not valid_responses:
|
2511
|
-
return None
|
2512
|
-
|
2513
|
-
# Count how many times each sequence appears
|
2484
|
+
# Count occurrences of each individual sequence across all attempts
|
2514
2485
|
sequence_counts = {}
|
2515
|
-
|
2516
|
-
|
2517
|
-
|
2518
|
-
|
2519
|
-
|
2520
|
-
|
2521
|
-
|
2522
|
-
|
2523
|
-
|
2524
|
-
|
2525
|
-
|
2526
|
-
|
2527
|
-
|
2528
|
-
|
2529
|
-
|
2530
|
-
|
2531
|
-
|
2532
|
-
|
2533
|
-
|
2534
|
-
|
2535
|
-
|
2536
|
-
|
2537
|
-
|
2538
|
-
|
2539
|
-
|
2540
|
-
|
2541
|
-
|
2542
|
-
break
|
2543
|
-
|
2544
|
-
# Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
|
2545
|
-
min_appearances = max(2, len(valid_responses) // 2)
|
2546
|
-
consistent_sequences = []
|
2547
|
-
|
2548
|
-
for key, count in sequence_counts.items():
|
2549
|
-
if count >= min_appearances:
|
2550
|
-
# Use the first occurrence of the full data
|
2551
|
-
if sequence_full_data[key]:
|
2552
|
-
seq_data = sequence_full_data[key][0].copy()
|
2553
|
-
# Add confidence based on how many times it appeared
|
2554
|
-
seq_data["confidence"] = count / len(valid_responses)
|
2555
|
-
seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
|
2556
|
-
consistent_sequences.append(seq_data)
|
2557
|
-
|
2558
|
-
return consistent_sequences if consistent_sequences else None
|
2559
|
-
|
2560
|
-
|
2561
|
-
def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
|
2562
|
-
"""Normalize sequence response for comparison."""
|
2563
|
-
if not isinstance(data, list):
|
2564
|
-
return []
|
2565
|
-
|
2566
|
-
normalized = []
|
2567
|
-
for item in data:
|
2568
|
-
if isinstance(item, dict):
|
2569
|
-
# Extract key fields for comparison
|
2570
|
-
normalized_item = {
|
2571
|
-
"variant_id": item.get("variant_id", ""),
|
2572
|
-
"aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
|
2573
|
-
"dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
|
2574
|
-
"confidence": item.get("confidence", 0.0)
|
2575
|
-
}
|
2576
|
-
normalized.append(normalized_item)
|
2486
|
+
for resp in valid_responses:
|
2487
|
+
if isinstance(resp, list):
|
2488
|
+
for seq in resp:
|
2489
|
+
if isinstance(seq, dict) and "variant_id" in seq:
|
2490
|
+
# Create a key for this sequence (variant_id + cleaned aa_seq)
|
2491
|
+
variant_id = seq.get("variant_id", "")
|
2492
|
+
aa_seq = seq.get("aa_seq", "")
|
2493
|
+
if aa_seq:
|
2494
|
+
aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
|
2495
|
+
key = f"{variant_id}|{aa_seq}"
|
2496
|
+
|
2497
|
+
if key not in sequence_counts:
|
2498
|
+
sequence_counts[key] = {"count": 0, "data": seq}
|
2499
|
+
sequence_counts[key]["count"] += 1
|
2500
|
+
|
2501
|
+
# Build result with sequences that appear in at least 3 attempts
|
2502
|
+
result = []
|
2503
|
+
for key, info in sequence_counts.items():
|
2504
|
+
if info["count"] >= 3: # Appears in at least 3/6 attempts
|
2505
|
+
seq_data = info["data"].copy()
|
2506
|
+
seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
|
2507
|
+
result.append(seq_data)
|
2508
|
+
log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
|
2509
|
+
|
2510
|
+
if result:
|
2511
|
+
log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
|
2512
|
+
return result
|
2577
2513
|
|
2578
|
-
#
|
2579
|
-
|
2514
|
+
# If no sequence reaches the 3-attempt consensus threshold, return the most complete attempt
|
2515
|
+
best_attempt = max(valid_responses, key=lambda x: len(x) if isinstance(x, list) else 0)
|
2516
|
+
log.warning(f"No consensus sequences found, returning best attempt with {len(best_attempt)} sequences")
|
2517
|
+
return best_attempt
|
2580
2518
|
|
2581
2519
|
|
2582
|
-
def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
|
2583
|
-
"""Check if two sequence response lists match on key fields."""
|
2584
|
-
if len(seq1) != len(seq2):
|
2585
|
-
return False
|
2586
|
-
|
2587
|
-
for i, (s1, s2) in enumerate(zip(seq1, seq2)):
|
2588
|
-
# Compare variant IDs
|
2589
|
-
if s1.get("variant_id") != s2.get("variant_id"):
|
2590
|
-
return False
|
2591
|
-
|
2592
|
-
# Compare amino acid sequences (most critical)
|
2593
|
-
aa1 = s1.get("aa_seq", "")
|
2594
|
-
aa2 = s2.get("aa_seq", "")
|
2595
|
-
if aa1 and aa2 and aa1 != aa2:
|
2596
|
-
return False
|
2597
|
-
elif bool(aa1) != bool(aa2): # One has sequence, other doesn't
|
2598
|
-
return False
|
2599
|
-
|
2600
|
-
# Compare DNA sequences if present
|
2601
|
-
dna1 = s1.get("dna_seq", "")
|
2602
|
-
dna2 = s2.get("dna_seq", "")
|
2603
|
-
if dna1 and dna2 and dna1 != dna2:
|
2604
|
-
return False
|
2605
|
-
|
2606
|
-
return True
|
2607
2520
|
|
2608
2521
|
|
2609
2522
|
def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
|
@@ -2624,18 +2537,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
|
|
2624
2537
|
else:
|
2625
2538
|
prompt = base_prompt
|
2626
2539
|
|
2627
|
-
#
|
2628
|
-
if lineage_variants:
|
2629
|
-
mutation_context = _build_mutation_validation_context(lineage_variants)
|
2630
|
-
if mutation_context:
|
2631
|
-
prompt = f"""{prompt}
|
2632
|
-
|
2633
|
-
CRITICAL MUTATION VALIDATION:
|
2634
|
-
{mutation_context}
|
2635
|
-
|
2636
|
-
IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
|
2637
|
-
For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
|
2638
|
-
"""
|
2540
|
+
# Skip mutation validation context
|
2639
2541
|
|
2640
2542
|
# Save the complete prompt for debugging
|
2641
2543
|
if debug_dir:
|
@@ -2662,11 +2564,7 @@ For example, if variant "III" has mutation "A100V" from parent "II", then positi
|
|
2662
2564
|
|
2663
2565
|
extracted_sequences = _parse_sequences(data)
|
2664
2566
|
|
2665
|
-
#
|
2666
|
-
if lineage_variants:
|
2667
|
-
validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
|
2668
|
-
return validated_sequences
|
2669
|
-
|
2567
|
+
# Return extracted sequences without mutation validation
|
2670
2568
|
return extracted_sequences
|
2671
2569
|
|
2672
2570
|
# --- 7.4 JSON -> dataclass helpers -------------------------------------------
|
@@ -2701,6 +2599,19 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
|
|
2701
2599
|
aa = _clean_seq(entry.get("aa_seq"), _VALID_AA)
|
2702
2600
|
dna = _clean_seq(entry.get("dna_seq"), _VALID_DNA)
|
2703
2601
|
|
2602
|
+
# Check minimum length requirements
|
2603
|
+
# AA sequences should be > 50, DNA sequences should be > 150
|
2604
|
+
if aa and len(aa) <= 50:
|
2605
|
+
log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
|
2606
|
+
aa = None
|
2607
|
+
if dna and len(dna) <= 150:
|
2608
|
+
log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
|
2609
|
+
dna = None
|
2610
|
+
|
2611
|
+
# Skip if both sequences are too short or missing
|
2612
|
+
if not aa and not dna:
|
2613
|
+
continue
|
2614
|
+
|
2704
2615
|
conf: float | None = None
|
2705
2616
|
if aa:
|
2706
2617
|
conf = sum(c in _VALID_AA for c in aa) / len(aa)
|
@@ -3118,12 +3029,6 @@ If you cannot determine certain fields, set them to null.
|
|
3118
3029
|
seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
|
3119
3030
|
# Validate it looks like a protein sequence
|
3120
3031
|
if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
|
3121
|
-
# Sanity check the sequence against known mutations
|
3122
|
-
validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
|
3123
|
-
if validated_seq:
|
3124
|
-
seq = validated_seq
|
3125
|
-
log.info(f"Sequence validated and potentially corrected by Gemini")
|
3126
|
-
|
3127
3032
|
# Map to the first variant or wild-type
|
3128
3033
|
wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
|
3129
3034
|
if wt_variant:
|
@@ -3427,7 +3332,7 @@ def _merge_lineage_and_sequences(
|
|
3427
3332
|
log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
|
3428
3333
|
|
3429
3334
|
# 3. If we have unmatched sequences and a model, use Gemini to match
|
3430
|
-
if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
|
3335
|
+
if model and len(df_seq) > 0 and (df['aa_seq'].isna().any() or df['dna_seq'].isna().any()):
|
3431
3336
|
# Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
|
3432
3337
|
missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
|
3433
3338
|
unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
|
@@ -3442,14 +3347,9 @@ def _merge_lineage_and_sequences(
|
|
3442
3347
|
log.info("Using Gemini to match variants")
|
3443
3348
|
|
3444
3349
|
# Build prompt for Gemini
|
3445
|
-
prompt = f"""Match enzyme variant IDs between two lists from the same paper.
|
3350
|
+
prompt = f"""Match enzyme variant IDs between two lists from the same paper using your best judgment.
|
3446
3351
|
|
3447
|
-
|
3448
|
-
- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
|
3449
|
-
- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
|
3450
|
-
|
3451
|
-
Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
|
3452
|
-
use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
|
3352
|
+
These IDs come from different sections of the paper and may use different naming conventions for the same variant.
|
3453
3353
|
|
3454
3354
|
Lineage variant IDs (need sequences):
|
3455
3355
|
{json.dumps(unmatched_lineage_ids)}
|
@@ -3457,8 +3357,13 @@ Lineage variant IDs (need sequences):
|
|
3457
3357
|
Sequence variant IDs (have sequences):
|
3458
3358
|
{json.dumps(unmatched_seqs['variant_id'].tolist())}
|
3459
3359
|
|
3360
|
+
IMPORTANT: A variant with mutations (indicated by mutation codes like letters and numbers after an underscore or space) is a DIFFERENT enzyme from its parent. Do not match mutation variants to their base sequences - they are distinct entities with different sequences due to the mutations.
|
3361
|
+
|
3362
|
+
Only match variants that represent the SAME enzyme, accounting for different naming conventions between sections.
|
3363
|
+
|
3460
3364
|
Return ONLY a JSON object mapping lineage IDs to sequence IDs.
|
3461
3365
|
Format: {{"lineage_id": "sequence_id", ...}}
|
3366
|
+
Only include matches you are confident represent the same variant.
|
3462
3367
|
"""
|
3463
3368
|
|
3464
3369
|
try:
|
@@ -3738,16 +3643,27 @@ def run_pipeline(
|
|
3738
3643
|
# 4. Extract sequences (Section 7) ----------------------------------------
|
3739
3644
|
sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
|
3740
3645
|
|
3741
|
-
# 4a.
|
3742
|
-
#
|
3743
|
-
|
3744
|
-
|
3745
|
-
|
3746
|
-
|
3646
|
+
# 4a. First try to merge extracted sequences with lineage using Gemini matching
|
3647
|
+
# This allows fuzzy matching of complex variant IDs before external lookups
|
3648
|
+
doi = extract_doi(manuscript)
|
3649
|
+
df_merged = merge_and_score(lineage, sequences, doi, model)
|
3650
|
+
|
3651
|
+
# 4b. Check if ALL variants are missing sequences after merging
|
3652
|
+
# Only try external sources if no sequences were successfully matched
|
3653
|
+
all_missing_sequences = True
|
3654
|
+
if 'aa_seq' in df_merged.columns or 'dna_seq' in df_merged.columns:
|
3655
|
+
for _, row in df_merged.iterrows():
|
3656
|
+
has_aa = pd.notna(row.get('aa_seq'))
|
3657
|
+
has_dna = pd.notna(row.get('dna_seq'))
|
3658
|
+
if has_aa or has_dna:
|
3659
|
+
all_missing_sequences = False
|
3660
|
+
break
|
3747
3661
|
|
3748
|
-
if
|
3749
|
-
|
3750
|
-
|
3662
|
+
if all_missing_sequences:
|
3663
|
+
MIN_PROTEIN_LENGTH = 50 # Most proteins are >50 AA
|
3664
|
+
MIN_DNA_LENGTH = 150 # DNA sequences should be >150 nt
|
3665
|
+
log.info("No full-length sequences found in paper (only partial sequences < %d AA or < %d nt), attempting PDB extraction...",
|
3666
|
+
MIN_PROTEIN_LENGTH, MIN_DNA_LENGTH)
|
3751
3667
|
|
3752
3668
|
# Extract PDB IDs from all PDFs
|
3753
3669
|
pdb_ids = []
|
@@ -3785,7 +3701,13 @@ def run_pipeline(
|
|
3785
3701
|
log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
|
3786
3702
|
|
3787
3703
|
if pdb_seq_blocks:
|
3788
|
-
|
3704
|
+
# Update the dataframe with PDB sequences
|
3705
|
+
for seq_block in pdb_seq_blocks:
|
3706
|
+
mask = df_merged['variant_id'] == seq_block.variant_id
|
3707
|
+
if mask.any():
|
3708
|
+
df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
|
3709
|
+
df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
|
3710
|
+
df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
|
3789
3711
|
log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
|
3790
3712
|
break
|
3791
3713
|
else:
|
@@ -3793,8 +3715,15 @@ def run_pipeline(
|
|
3793
3715
|
else:
|
3794
3716
|
log.warning("No PDB IDs found in paper")
|
3795
3717
|
|
3796
|
-
#
|
3797
|
-
|
3718
|
+
# 4c. If still no sequences after PDB, try Gemini extraction as last resort
|
3719
|
+
# Re-check if all variants are still missing sequences
|
3720
|
+
still_all_missing = True
|
3721
|
+
for _, row in df_merged.iterrows():
|
3722
|
+
if pd.notna(row.get('aa_seq')) or pd.notna(row.get('dna_seq')):
|
3723
|
+
still_all_missing = False
|
3724
|
+
break
|
3725
|
+
|
3726
|
+
if still_all_missing:
|
3798
3727
|
log.info("No sequences from PDB, attempting Gemini-based extraction...")
|
3799
3728
|
|
3800
3729
|
gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
|
@@ -3818,14 +3747,19 @@ def run_pipeline(
|
|
3818
3747
|
log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
|
3819
3748
|
|
3820
3749
|
if gemini_seq_blocks:
|
3821
|
-
|
3750
|
+
# Update the dataframe with Gemini/UniProt sequences
|
3751
|
+
for seq_block in gemini_seq_blocks:
|
3752
|
+
mask = df_merged['variant_id'] == seq_block.variant_id
|
3753
|
+
if mask.any():
|
3754
|
+
df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
|
3755
|
+
df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
|
3756
|
+
df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'Gemini/UniProt')
|
3822
3757
|
log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
|
3823
3758
|
else:
|
3824
3759
|
log.warning("Failed to extract sequences via Gemini")
|
3825
3760
|
|
3826
|
-
# 5.
|
3827
|
-
|
3828
|
-
df_final = merge_and_score(lineage, sequences, doi, model)
|
3761
|
+
# 5. Use the merged dataframe (already merged above)
|
3762
|
+
df_final = df_merged
|
3829
3763
|
|
3830
3764
|
# 6. Write FINAL CSV -------------------------------------------------------
|
3831
3765
|
if output_csv:
|