debase 0.4.5__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +123 -0
- debase/enzyme_lineage_extractor.py +243 -309
- debase/reaction_info_extractor.py +152 -68
- {debase-0.4.5.dist-info → debase-0.5.0.dist-info}/METADATA +1 -1
- debase-0.5.0.dist-info/RECORD +16 -0
- debase-0.4.5.dist-info/RECORD +0 -16
- {debase-0.4.5.dist-info → debase-0.5.0.dist-info}/WHEEL +0 -0
- {debase-0.4.5.dist-info → debase-0.5.0.dist-info}/entry_points.txt +0 -0
- {debase-0.4.5.dist-info → debase-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.5.dist-info → debase-0.5.0.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
debase/cleanup_sequence.py
CHANGED
@@ -30,6 +30,27 @@ except ImportError: # pragma: no cover
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
+VALID_DNA_BASES = set("ACGT")
+
+# Genetic code table for DNA to amino acid translation
+GENETIC_CODE = {
+    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
+    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
+    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
+}
 
 # Gemini API configuration
 GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -182,6 +203,44 @@ class SequenceManipulator:
         """Validate that a sequence contains only valid amino acids."""
         return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())
 
+    @staticmethod
+    def is_dna_sequence(seq: str) -> bool:
+        """Check if a sequence is DNA (contains only ACGT)."""
+        seq_upper = seq.upper().replace(" ", "").replace("\n", "")
+        return all(base in VALID_DNA_BASES for base in seq_upper) and len(seq_upper) > 0
+
+    @staticmethod
+    def translate_dna_to_protein(dna_seq: str) -> str:
+        """Translate DNA sequence to protein sequence.
+
+        Args:
+            dna_seq: DNA sequence string
+
+        Returns:
+            Protein sequence string
+        """
+        # Clean the DNA sequence
+        dna_seq = dna_seq.upper().replace(" ", "").replace("\n", "")
+
+        # Check if sequence length is multiple of 3
+        if len(dna_seq) % 3 != 0:
+            log.warning(f"DNA sequence length ({len(dna_seq)}) is not a multiple of 3. Truncating to nearest codon.")
+            dna_seq = dna_seq[:-(len(dna_seq) % 3)]
+
+        protein_seq = []
+        for i in range(0, len(dna_seq), 3):
+            codon = dna_seq[i:i+3]
+            if len(codon) == 3:
+                # Handle unknown codons (with N or other non-standard bases)
+                if codon in GENETIC_CODE:
+                    protein_seq.append(GENETIC_CODE[codon])
+                else:
+                    # If codon contains non-standard bases, add 'X' for unknown amino acid
+                    protein_seq.append('X')
+                    log.debug(f"Unknown codon '{codon}' at position {i}, using 'X' for unknown amino acid")
+
+        return ''.join(protein_seq)
+
     @staticmethod
     def determine_indexing(parent_seq: str, mutations: List[Mutation]) -> int:
         """Determine whether mutations use 0-based or 1-based indexing."""
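For orientation, the two new helpers compose as follows (illustration only; the inputs are invented, and the outputs follow the GENETIC_CODE table above). Note the DNA check is inherently ambiguous for proteins made up exclusively of A/C/G/T residues, which this heuristic would classify as DNA:

    from debase.cleanup_sequence import SequenceManipulator

    m = SequenceManipulator()
    m.is_dna_sequence("ATG GCT TAA")         # True - whitespace is stripped before the check
    m.is_dna_sequence("MKLV")                # False - contains letters outside ACGT
    m.translate_dna_to_protein("ATGGCTTAA")  # 'MA*' - stop codon kept as '*'
    m.translate_dna_to_protein("ATGNNNGCT")  # 'MXA' - unknown codon becomes 'X'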
@@ -1141,6 +1200,9 @@ class SequenceProcessor:
         # Detect and handle column format automatically
         self._normalize_columns()
 
+        # Translate DNA sequences to protein sequences if needed
+        self._translate_dna_sequences()
+
         log.info(
             f"Loaded {len(self.df)} rows, "
             f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
@@ -1153,6 +1215,67 @@ class SequenceProcessor:
         # Initialize generator
         self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
 
+    def _translate_dna_sequences(self) -> None:
+        """Translate DNA sequences to protein sequences if no amino acid sequences exist."""
+        manipulator = SequenceManipulator()
+
+        # First check if ANY sequences are amino acid sequences
+        has_amino_acid = False
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if not manipulator.is_dna_sequence(seq):
+                    has_amino_acid = True
+                    break
+
+        # If we found amino acid sequences, don't translate anything
+        if has_amino_acid:
+            log.info("Found amino acid sequences in data, skipping DNA translation")
+            return
+
+        # No amino acid sequences found, check for DNA sequences in dna_seq column
+        if "dna_seq" in self.df.columns:
+            dna_count = 0
+            for idx, row in self.df.iterrows():
+                protein_seq = str(row.get("protein_sequence", "")).strip()
+                dna_seq = str(row.get("dna_seq", "")).strip()
+
+                # If protein_sequence is empty but dna_seq has content, translate it
+                if (not protein_seq or protein_seq.lower() in ["nan", "none", ""]) and \
+                   (dna_seq and dna_seq.lower() not in ["nan", "none", ""]):
+                    if manipulator.is_dna_sequence(dna_seq):
+                        # Translate DNA to protein
+                        translated_seq = manipulator.translate_dna_to_protein(dna_seq)
+                        self.df.at[idx, "protein_sequence"] = translated_seq
+
+                        # Add flag to indicate this was translated from DNA
+                        if "flag" not in self.df.columns:
+                            self.df["flag"] = ""
+                        existing_flag = str(self.df.at[idx, "flag"]).strip()
+                        self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                        dna_count += 1
+
+            if dna_count > 0:
+                log.info(f"Translated {dna_count} DNA sequences from dna_seq column to protein sequences")
+
+        # Also check if DNA sequences are mistakenly in protein_sequence column
+        dna_count = 0
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if manipulator.is_dna_sequence(seq):
+                    # Translate DNA to protein
+                    protein_seq = manipulator.translate_dna_to_protein(seq)
+                    self.df.at[idx, "protein_sequence"] = protein_seq
+
+                    # Add flag to indicate this was translated from DNA
+                    existing_flag = str(self.df.at[idx, "flag"]).strip()
+                    self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                    dna_count += 1
+
+        if dna_count > 0:
+            log.info(f"Translated {dna_count} DNA sequences to protein sequences")
+
     def _normalize_columns(self) -> None:
         """Automatically detect and normalize column names from different formats."""
         # Check if this is enzyme_lineage_extractor format
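A sketch of the net effect of the new pass on a loaded frame (hypothetical data; column and flag names as in the hunk above):

    import pandas as pd

    df = pd.DataFrame({
        "protein_sequence": ["", "ATGGCTTAA"],  # second row holds DNA pasted into the wrong column
        "dna_seq": ["ATGAAA", ""],
    })
    # No row contains a genuine amino-acid sequence, so the pass fires:
    #   row 0: dna_seq "ATGAAA"        -> protein_sequence "MK",  flag "dna_translated"
    #   row 1: DNA in protein_sequence -> protein_sequence "MA*", flag "dna_translated"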
debase/enzyme_lineage_extractor.py
CHANGED
@@ -24,6 +24,7 @@ import pandas as pd
 import networkx as nx  # light dependency, used only for generation inference
 
 import os
+import fitz
 import re
 import json
 import time
@@ -460,8 +461,32 @@ def get_model():
     if not api_key:
         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
     _genai.configure(api_key=api_key)
-    …
+
+    # Create generation config to optimize performance and costs
+    generation_config = {
+        "temperature": 0.0,  # Deterministic: always pick the most likely token
+        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
+        "top_k": 1,  # Only consider the single most likely token
+        "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+    }
+
+    # For Gemini 2.5 Flash, disable thinking tokens to save costs
+    # thinking_budget=0 disables thinking, -1 enables dynamic thinking (default)
+    # Only add if SDK supports it to maintain compatibility
+    try:
+        # Test if thinking_budget is supported by making a minimal API call
+        test_config = {"thinking_budget": 0, "max_output_tokens": 10}
+        test_model = _genai.GenerativeModel(MODEL_NAME, generation_config=test_config)
+        # Actually test the API call to see if thinking_budget is supported
+        test_response = test_model.generate_content("Return 'OK'")
+        # If no error, add thinking_budget to main config
+        generation_config["thinking_budget"] = 0
+        log.debug("Disabled thinking tokens (thinking_budget=0)")
+    except Exception as e:
+        # SDK doesn't support thinking_budget, continue without it
+        log.debug(f"thinking_budget not supported: {e}")
+
+    return _genai.GenerativeModel(MODEL_NAME, generation_config=generation_config)
 
 # === 5.3 Unified call helper ----------------------------------------------
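The probe above spends one tiny API call to learn whether the installed SDK accepts thinking_budget. The same capability-probe pattern in isolation (a sketch, not the module's code; genai stands for the configured google-generativeai module):

    def config_with_optional_field(genai, model_name, base_config):
        """Return base_config plus thinking_budget=0 when the installed SDK accepts it."""
        try:
            probe = genai.GenerativeModel(
                model_name,
                generation_config={"thinking_budget": 0, "max_output_tokens": 10},
            )
            probe.generate_content("Return 'OK'")   # raises if the field is unsupported
            return dict(base_config, thinking_budget=0)
        except Exception:
            return base_config                       # older SDK: fall back silently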
@@ -728,22 +753,24 @@ Return a JSON object with:
 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
 {campaign_context}
-Given the following article text, list up to {max_results} *locations* (
-…
+Given the following article text, list up to {max_results} *locations* (figure/table IDs
+or section headings) that you would review first to find the COMPLETE evolutionary
+lineage of enzyme variants (i.e. which variant came from which parent and what
+mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
+ensure the location you return are actually lineage location with variants and mutations.
 
 Respond with a JSON array of objects, each containing:
-  - "location": the identifier (e.g. "Table S1", "Figure 2B", "
-  - "type": one of "table", "figure", "
+  - "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+  - "type": one of "table", "figure", "section"
   - "confidence": your confidence score (0-100) that this location contains lineage data
   - "reason": brief explanation of why this location likely contains lineage
 {campaign_field}
-IMPORTANT:
+IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
+NOT page numbers. Focus on the actual figure/table titles and numbers.
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than
-…
+mutation lists should be ranked higher than figures showing complete variant lineages.
+Sections are used when no suitable tables/figures exist.
 
 Don't include oligonucleotide results or result from only one round.
@@ -1713,7 +1740,6 @@ def get_lineage(
     for pdf_path in pdf_paths:
         # Extract first few pages looking for TOC
         try:
-            import fitz  # PyMuPDF
             doc = fitz.open(pdf_path)
             toc_text = ""
             for page_num in range(min(5, doc.page_count)):  # First 5 pages
@@ -2011,7 +2037,7 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
 
 # --- 7.2 Page-based extraction helper ---------------------------------------
 def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
-    """Extract plain text sequence using Gemini with
+    """Extract plain text sequence using Gemini with 6 attempts, returning most common result.
 
     Args:
         prompt: The prompt to send to Gemini
@@ -2019,12 +2045,12 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         context: Additional context for logging (e.g., "validation" or "extraction")
 
     Returns:
-        The
+        The most common sequence or None if all attempts failed
     """
     sequences = []
-    max_attempts =
+    max_attempts = 6
 
-    # Try
+    # Try 6 times
     for attempt in range(max_attempts):
         try:
             response = model.generate_content(prompt)
@@ -2050,38 +2076,14 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         except Exception as e:
             log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
             sequences.append("ERROR")
-
-        # Check for early consensus after 2 attempts
-        if len(sequences) == 2:
-            # Clean sequences before comparison
-            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
-            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
-
-            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus reached after 2 attempts")
-                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
-            else:
-                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
 
-    # After all attempts, find
+    # After all attempts, find most common result
     valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
 
     if not valid_sequences:
         log.error(f"All {max_attempts} {context} attempts failed")
         return None
 
-    # Find any matching pair
-    for i in range(len(sequences)):
-        for j in range(i + 1, len(sequences)):
-            # Clean sequences before comparison
-            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
-            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
-
-            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
-                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
-
-    # If no exact match, use adaptive validation
     # Count occurrences of each valid sequence
     sequence_counts = {}
     for seq in valid_sequences:
@@ -2090,80 +2092,16 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         seq_clean = seq.replace(" ", "").replace("\n", "")
         sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
 
-    # Return the most common sequence
+    # Return the most common sequence
     if sequence_counts:
         most_common = max(sequence_counts.items(), key=lambda x: x[1])
-        …
-        return most_common[0]
+        log.info(f"Gemini {context} most common: sequence appeared {most_common[1]}/{max_attempts} times")
+        return most_common[0]
 
-    log.warning(f"Gemini {context} no
+    log.warning(f"Gemini {context} no valid sequences after {max_attempts} attempts")
     return None
 
 
-def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
-    """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
-
-    # Extract mutations from variants
-    mutations = []
-    for variant in variants:
-        if variant.mutations:
-            mutations.extend(variant.mutations)
-
-    if not mutations:
-        return None
-
-    # Take a sample of mutations for validation
-    sample_mutations = mutations[:10]  # Check first 10 mutations
-
-    # First do a quick local check for obvious inconsistencies
-    local_issues = []
-    for mutation in sample_mutations:
-        if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
-            pos = mutation.position - 1  # Convert to 0-indexed
-            if 0 <= pos < len(sequence):
-                actual_aa = sequence[pos]
-                expected_aa = mutation.original
-                if actual_aa != expected_aa:
-                    local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
-
-    if not local_issues:
-        return None  # No obvious issues found
-
-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
-
-    prompt = f"""
-You are validating a protein sequence that was extracted from a scientific paper.
-The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
-
-Original sequence (length {len(sequence)}):
-{sequence}
-
-Known mutations that should be applicable to this sequence:
-{', '.join(str(m) for m in sample_mutations)}
-
-Potential issues detected:
-{chr(10).join(local_issues)}
-
-Please check if the sequence is consistent with these mutations:
-1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
-2. If you find inconsistencies, suggest the most likely correction
-3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
-4. Pay special attention to consecutive identical amino acids that might be OCR errors
-
-Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
-If you cannot determine the correct sequence, return "UNCERTAIN".
-"""
-
-    # Use triple validation
-    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
-
-    if result == "VALID" or result is None:
-        return None  # No changes needed
-    else:
-        log.info(f"Gemini suggested sequence correction (length {len(result)})")
-        return result
-
 
 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
     """Extract text from a specific page number in the PDFs.
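The replacement strategy here is plain majority voting over the cleaned attempts. In miniature (illustration only):

    from collections import Counter

    attempts = ["MKLV", "MKLV", "ERROR", "MKLV", "MKIV", "INVALID"]
    valid = [s for s in attempts if s not in ("INVALID", "ERROR")]
    winner, votes = Counter(valid).most_common(1)[0]
    print(winner, f"{votes}/{len(attempts)}")   # MKLV 3/6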
@@ -2331,11 +2269,11 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 
 SEQUENCE EXTRACTION RULES:
 - Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids, or nucleotides
 - Preserve the exact length and character sequence
 - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids are copied correctly
+- Double-check that consecutive identical amino acids or nucleotides are copied correctly
 
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
@@ -2356,8 +2294,81 @@ TEXT (may be truncated):
 ```
 """.strip()
 
+def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
+    """
+    Check if two sequence extraction responses match.
+
+    Args:
+        resp1: First response (list of sequences or dict)
+        resp2: Second response (list of sequences or dict)
+
+    Returns:
+        True if responses match, False otherwise
+    """
+    # Handle None cases
+    if resp1 is None or resp2 is None:
+        return False
+
+    # Both should be the same type
+    if type(resp1) != type(resp2):
+        return False
+
+    # If both are lists
+    if isinstance(resp1, list) and isinstance(resp2, list):
+        # Must have same length
+        if len(resp1) != len(resp2):
+            return False
+
+        # Create normalized sequence sets for comparison
+        seq_set1 = set()
+        seq_set2 = set()
+
+        for seq in resp1:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set1.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        for seq in resp2:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set2.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        return seq_set1 == seq_set2
+
+    # If both are dicts, compare normalized content
+    if isinstance(resp1, dict) and isinstance(resp2, dict):
+        # Normalize and compare
+        return json.dumps(resp1, sort_keys=True) == json.dumps(resp2, sort_keys=True)
+
+    return False
+
+
 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with
+    """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+
+    Can exit early after 2 attempts if the responses match exactly.
 
     Args:
         model: The Gemini model instance
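_check_sequence_responses_match compares responses as order-insensitive sets of variant_id|aa|dna keys, so formatting noise between attempts does not defeat the early exit. For example (values invented):

    r1 = [{"variant_id": "WT", "aa_seq": "MK LV", "dna_seq": None}]
    r2 = [{"variant_id": "WT", "aa_seq": "mklv", "dna_seq": ""}]
    _check_sequence_responses_match(r1, r2)   # True - whitespace, case, and None/"" all normalize away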
@@ -2366,12 +2377,12 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         debug_dir: Optional debug directory
 
     Returns:
-        The
+        The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts =
+    max_attempts = 6
 
-    # Try
+    # Try 6 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2443,167 +2454,69 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         else:
             raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
 
-            # Store
-            …
+            # Store the response
+            responses.append(parsed)
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
+
+            # Early match detection after 2 attempts
+            if attempt >= 1:  # After 2nd attempt (0-indexed)
+                valid_responses_so_far = [r for r in responses if r is not None]
+                if len(valid_responses_so_far) >= 2:
+                    # Check if the last two valid responses match
+                    if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                        log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                        # Add the matching response 4 more times to simulate consensus
+                        for _ in range(max_attempts - attempt - 1):
+                            responses.append(valid_responses_so_far[-1])
+                        break
 
         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
             responses.append(None)
-
-        # Check for early consensus after 2 attempts
-        if len(responses) == 2:
-            if (responses[0] and responses[1] and
-                _sequences_match(responses[0][1], responses[1][1])):
-                log.info("Sequence extraction consensus reached after 2 attempts")
-                return responses[0][0]  # Return original parsed data
-            else:
-                log.info("Sequence extraction mismatch after 2 attempts - trying third")
 
-    # After all attempts,
+    # After all attempts, find most common sequences
     valid_responses = [r for r in responses if r is not None]
 
     if not valid_responses:
         log.error(f"All {max_attempts} sequence extraction attempts failed")
         return None
 
-    # …
-    for i in range(len(valid_responses)):
-        for j in range(i + 1, len(valid_responses)):
-            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
-                log.info(f"Sequence extraction consensus found: attempts with matching content")
-                return valid_responses[i][0]  # Return original parsed data
-
-    # If no exact consensus, use adaptive validation
-    log.info("No exact consensus found, applying adaptive validation...")
-
-    # Find sequences that appear consistently across multiple attempts
-    consistent_sequences = _find_consistent_sequences(valid_responses)
-
-    if consistent_sequences:
-        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
-        return consistent_sequences
-
-    # If still no consensus, use the attempt with the most sequences
-    best_response = max(valid_responses,
-                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
-
-    if best_response and len(best_response[1]) > 0:
-        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
-        return best_response[0]
-
-    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
-    return None
-
-
-def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
-    """Find sequences that appear consistently across multiple extraction attempts.
-
-    Args:
-        valid_responses: List of (original_data, normalized_data) tuples
-
-    Returns:
-        List of consistent sequences with confidence scores, or None if none found
-    """
-    if not valid_responses:
-        return None
-
-    # Count how many times each sequence appears
+    # Count occurrences of each individual sequence across all attempts
     sequence_counts = {}
-    …
-            break
-
-    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
-    min_appearances = max(2, len(valid_responses) // 2)
-    consistent_sequences = []
-
-    for key, count in sequence_counts.items():
-        if count >= min_appearances:
-            # Use the first occurrence of the full data
-            if sequence_full_data[key]:
-                seq_data = sequence_full_data[key][0].copy()
-                # Add confidence based on how many times it appeared
-                seq_data["confidence"] = count / len(valid_responses)
-                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
-                consistent_sequences.append(seq_data)
-
-    return consistent_sequences if consistent_sequences else None
-
-
-def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
-    """Normalize sequence response for comparison."""
-    if not isinstance(data, list):
-        return []
-
-    normalized = []
-    for item in data:
-        if isinstance(item, dict):
-            # Extract key fields for comparison
-            normalized_item = {
-                "variant_id": item.get("variant_id", ""),
-                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
-                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
-                "confidence": item.get("confidence", 0.0)
-            }
-            normalized.append(normalized_item)
+    for resp in valid_responses:
+        if isinstance(resp, list):
+            for seq in resp:
+                if isinstance(seq, dict) and "variant_id" in seq:
+                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    variant_id = seq.get("variant_id", "")
+                    aa_seq = seq.get("aa_seq", "")
+                    if aa_seq:
+                        aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                    key = f"{variant_id}|{aa_seq}"
+
+                    if key not in sequence_counts:
+                        sequence_counts[key] = {"count": 0, "data": seq}
+                    sequence_counts[key]["count"] += 1
+
+    # Build result with sequences that appear in at least 3 attempts
+    result = []
+    for key, info in sequence_counts.items():
+        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+            seq_data = info["data"].copy()
+            seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
+            result.append(seq_data)
+            log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
+
+    if result:
+        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        return result
 
-    # …
+    # If no sequences appear twice, return the most complete attempt
+    best_attempt = max(valid_responses, key=lambda x: len(x) if isinstance(x, list) else 0)
+    log.warning(f"No consensus sequences found, returning best attempt with {len(best_attempt)} sequences")
+    return best_attempt
 
 
-def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
-    """Check if two sequence response lists match on key fields."""
-    if len(seq1) != len(seq2):
-        return False
-
-    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
-        # Compare variant IDs
-        if s1.get("variant_id") != s2.get("variant_id"):
-            return False
-
-        # Compare amino acid sequences (most critical)
-        aa1 = s1.get("aa_seq", "")
-        aa2 = s2.get("aa_seq", "")
-        if aa1 and aa2 and aa1 != aa2:
-            return False
-        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
-            return False
-
-        # Compare DNA sequences if present
-        dna1 = s1.get("dna_seq", "")
-        dna2 = s2.get("dna_seq", "")
-        if dna1 and dna2 and dna1 != dna2:
-            return False
-
-    return True
 
 def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
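The per-sequence voting with its 3-of-6 cutoff, in miniature (illustration only; keys built as in the hunk above). Note the early-exit path pads the responses list with copies of the matching answer, so an early match still clears this cutoff:

    responses = [
        [{"variant_id": "WT", "aa_seq": "MKLV"}],
        [{"variant_id": "WT", "aa_seq": "MKLV"}],
        [{"variant_id": "WT", "aa_seq": "MKLV"}, {"variant_id": "V1", "aa_seq": "MKIV"}],
    ]
    counts = {}
    for resp in responses:
        for seq in resp:
            key = f"{seq['variant_id']}|{seq['aa_seq']}"
            counts[key] = counts.get(key, 0) + 1
    print({k: n for k, n in counts.items() if n >= 3})   # {'WT|MKLV': 3} - V1 (1 vote) is dropped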
@@ -2624,18 +2537,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt
 
-    #
-    if lineage_variants:
-        mutation_context = _build_mutation_validation_context(lineage_variants)
-        if mutation_context:
-            prompt = f"""{prompt}
-
-CRITICAL MUTATION VALIDATION:
-{mutation_context}
-
-IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
-For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
-"""
+    # Skip mutation validation context
 
     # Save the complete prompt for debugging
     if debug_dir:
@@ -2662,11 +2564,7 @@ For example, if variant "III" has mutation "A100V" from parent "II", then positi
 
     extracted_sequences = _parse_sequences(data)
 
-    #
-    if lineage_variants:
-        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
-        return validated_sequences
-
+    # Return extracted sequences without mutation validation
     return extracted_sequences
 
 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
@@ -2701,6 +2599,19 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         aa = _clean_seq(entry.get("aa_seq"), _VALID_AA)
         dna = _clean_seq(entry.get("dna_seq"), _VALID_DNA)
 
+        # Check minimum length requirements
+        # AA sequences should be > 50, DNA sequences should be > 150
+        if aa and len(aa) <= 50:
+            log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
+            aa = None
+        if dna and len(dna) <= 150:
+            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+            dna = None
+
+        # Skip if both sequences are too short or missing
+        if not aa and not dna:
+            continue
+
         conf: float | None = None
         if aa:
             conf = sum(c in _VALID_AA for c in aa) / len(aa)
@@ -3118,12 +3029,6 @@ If you cannot determine certain fields, set them to null.
         seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
         # Validate it looks like a protein sequence
         if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
-            # Sanity check the sequence against known mutations
-            validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
-            if validated_seq:
-                seq = validated_seq
-                log.info(f"Sequence validated and potentially corrected by Gemini")
-
             # Map to the first variant or wild-type
             wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
             if wt_variant:
@@ -3427,7 +3332,7 @@ def _merge_lineage_and_sequences(
     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
 
     # 3. If we have unmatched sequences and a model, use Gemini to match
-    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+    if model and len(df_seq) > 0 and (df['aa_seq'].isna().any() or df['dna_seq'].isna().any()):
         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
@@ -3442,14 +3347,9 @@ def _merge_lineage_and_sequences(
     log.info("Using Gemini to match variants")
 
     # Build prompt for Gemini
-    prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+    prompt = f"""Match enzyme variant IDs between two lists from the same paper using your best judgment.
 
-…
-- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
-- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
-
-Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
-use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
+These IDs come from different sections of the paper and may use different naming conventions for the same variant.
 
 Lineage variant IDs (need sequences):
 {json.dumps(unmatched_lineage_ids)}
@@ -3457,8 +3357,13 @@ Lineage variant IDs (need sequences):
 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
 
+IMPORTANT: A variant with mutations (indicated by mutation codes like letters and numbers after an underscore or space) is a DIFFERENT enzyme from its parent. Do not match mutation variants to their base sequences - they are distinct entities with different sequences due to the mutations.
+
+Only match variants that represent the SAME enzyme, accounting for different naming conventions between sections.
+
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
+Only include matches you are confident represent the same variant.
 """
 
     try:
@@ -3738,16 +3643,27 @@ def run_pipeline(
     # 4. Extract sequences (Section 7) ----------------------------------------
     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
 
-    # 4a.
-    #
-    …
+    # 4a. First try to merge extracted sequences with lineage using Gemini matching
+    # This allows fuzzy matching of complex variant IDs before external lookups
+    doi = extract_doi(manuscript)
+    df_merged = merge_and_score(lineage, sequences, doi, model)
+
+    # 4b. Check if ALL variants are missing sequences after merging
+    # Only try external sources if no sequences were successfully matched
+    all_missing_sequences = True
+    if 'aa_seq' in df_merged.columns or 'dna_seq' in df_merged.columns:
+        for _, row in df_merged.iterrows():
+            has_aa = pd.notna(row.get('aa_seq'))
+            has_dna = pd.notna(row.get('dna_seq'))
+            if has_aa or has_dna:
+                all_missing_sequences = False
+                break
 
-    if
-    …
+    if all_missing_sequences:
+        MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
+        MIN_DNA_LENGTH = 150  # DNA sequences should be >150 nt
+        log.info("No full-length sequences found in paper (only partial sequences < %d AA or < %d nt), attempting PDB extraction...",
+                 MIN_PROTEIN_LENGTH, MIN_DNA_LENGTH)
 
         # Extract PDB IDs from all PDFs
         pdb_ids = []
@@ -3785,7 +3701,13 @@ def run_pipeline(
                         log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
 
             if pdb_seq_blocks:
-                …
+                # Update the dataframe with PDB sequences
+                for seq_block in pdb_seq_blocks:
+                    mask = df_merged['variant_id'] == seq_block.variant_id
+                    if mask.any():
+                        df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                        df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                        df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
                 log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                 break
             else:
@@ -3793,8 +3715,15 @@ def run_pipeline(
             else:
                 log.warning("No PDB IDs found in paper")
 
-        #
-        …
+        # 4c. If still no sequences after PDB, try Gemini extraction as last resort
+        # Re-check if all variants are still missing sequences
+        still_all_missing = True
+        for _, row in df_merged.iterrows():
+            if pd.notna(row.get('aa_seq')) or pd.notna(row.get('dna_seq')):
+                still_all_missing = False
+                break
+
+        if still_all_missing:
             log.info("No sequences from PDB, attempting Gemini-based extraction...")
 
             gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
@@ -3818,14 +3747,19 @@ def run_pipeline(
                 log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
 
         if gemini_seq_blocks:
-            …
+            # Update the dataframe with Gemini/UniProt sequences
+            for seq_block in gemini_seq_blocks:
+                mask = df_merged['variant_id'] == seq_block.variant_id
+                if mask.any():
+                    df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                    df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                    df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'Gemini/UniProt')
             log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
         else:
             log.warning("Failed to extract sequences via Gemini")
 
-    # 5.
-    …
-    df_final = merge_and_score(lineage, sequences, doi, model)
+    # 5. Use the merged dataframe (already merged above)
+    df_final = df_merged
 
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
debase/reaction_info_extractor.py
CHANGED
@@ -54,11 +54,11 @@ class Config:
     """Centralised tunables so tests can override them easily."""
 
     model_name: str = "gemini-2.5-flash"
-    location_temperature: float = 0.
+    location_temperature: float = 0.0
    extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 12288
+    max_tokens: int = 12288
     pdf_cache_size: int = 8
     retries: int = 2
@@ -778,50 +778,62 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
 
     def _collect_captions_and_titles(self) -> str:
-        # Pattern to match Table or Figure with optional leading whitespace
+        # Pattern to match Table or Figure with optional leading whitespace and page numbers
         # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
-        # Also handles cases where there's whitespace before the caption
-        cap_pattern = re.compile(r"
+        # Also handles cases where there's whitespace or page numbers before the caption
+        cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []
 
-        …
+        # Process each page individually to avoid TOC entries
+        for page_idx, page_text in enumerate(self.all_pages):
+            # Skip if this looks like a TOC page
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
+                continue
+
+            # Find all figure/table captions with more context
+            for match in cap_pattern.finditer(page_text):
+                caption_line = match.group(0).strip()
+
+                # Skip if this looks like a TOC entry (has page number at end or dots)
+                if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
+                    LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
+                    continue
+
+                caption_start = match.start()
+
+                # For tables, include much more content after the caption to show actual table data
+                # For figures, include substantial content to show what the figure contains
+                is_table = 'table' in match.group(1).lower()
+                # Increase context for figures to ensure we capture descriptive text
+                max_chars = 8000 if is_table else 3000
+
+                # Get context including text before and after the caption
+                # Include some text before to help identify the location
+                context_before = max(0, caption_start - 200)
+                context_after = min(len(page_text), caption_start + max_chars)
+
+                # Extract the full context
+                full_context = page_text[context_before:context_after].strip()
+
+                # Find the actual caption text (not just the "Figure X" part)
+                # Look for text after the figure/table identifier that forms the caption
+                caption_text = page_text[caption_start:context_after]
+
+                # Try to find the end of the caption (usually ends with a period before next paragraph)
+                caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
+                if caption_end_match:
+                    actual_caption = caption_text[:caption_end_match.end()].strip()
+                else:
+                    # Fallback: take first few lines
+                    lines = caption_text.split('\n')
+                    actual_caption = '\n'.join(lines[:3]).strip()
+
+                # Ensure we have meaningful content, not just the figure number
+                if len(actual_caption) > 20:  # More than just "Figure S23."
+                    # For the prompt, include the full context to help identify what's in the figure
+                    caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
+                    captions.append(caption_with_context)
 
         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
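What the new caption pattern accepts and rejects (regex copied from the hunk; sample text invented):

    import re

    cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
    page = "3  Figure S1. Chromatograms for all variants.\nsee Figure 2 for context"
    cap_pattern.search(page).group(0)   # '3  Figure S1. Chromatograms for all variants.'
    # the inline "see Figure 2" never matches: '^' anchors each match to a line start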
@@ -1058,6 +1070,39 @@ class ReactionExtractor:
     # 6.2 Figure / Table context helpers
     # ------------------------------------------------------------------
 
+    def _is_toc_page(self, page_text: str) -> bool:
+        """Detect if a page is a Table of Contents page."""
+        # Look for common TOC indicators
+        toc_indicators = [
+            "table of contents",
+            "contents",
+            r"\.{5,}",  # Multiple dots (common in TOCs)
+            r"\d+\s*\n\s*\d+\s*\n\s*\d+",  # Multiple page numbers in sequence
+        ]
+
+        # Count how many TOC-like patterns we find
+        toc_score = 0
+        text_lower = page_text.lower()
+
+        # Check for explicit TOC title
+        if "table of contents" in text_lower or (
+            "contents" in text_lower and text_lower.index("contents") < 200
+        ):
+            toc_score += 3
+
+        # Check for multiple figure/table references with page numbers
+        figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+        table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+
+        if len(figure_with_page) + len(table_with_page) > 5:
+            toc_score += 2
+
+        # Check for many dotted lines
+        if len(re.findall(r'\.{5,}', page_text)) > 3:
+            toc_score += 1
+
+        return toc_score >= 2
+
     def _page_with_reference(self, ref_id: str) -> Optional[str]:
         for page in self.all_pages:
             if ref_id.lower() in page.lower():
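The two regexes feeding the TOC score, exercised on a synthetic TOC line (illustration only). A page then needs a score of at least 2 to be skipped: +3 for an explicit contents title near the top, +2 for more than five dotted figure/table entries, +1 for more than three dotted leaders:

    import re

    line = "figure s1  kinetic traces ....... 12"
    bool(re.search(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', line))   # True -> counts toward the +2 bucket
    len(re.findall(r'\.{5,}', line))                             # 1    -> a dotted leader for the +1 bucket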
@@ -1131,9 +1176,14 @@ class ReactionExtractor:
             LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
                          page_number + 1, doc_name, len(page_text))
 
-            #
-            …
+            # Skip Table of Contents pages
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
+                continue
+
+            # Look for figure caption pattern more flexibly
+            # Normalize the reference to handle variations
             figure_num = ref.replace('Figure', '').replace('figure', '').strip()
 
             # Extract main figure number from subfigure (e.g., "1C" -> "1")
             main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1142,33 +1192,62 @@ class ReactionExtractor:
             else:
                 main_figure_num = figure_num
 
-            …
-                rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
-                rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
-                rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
-            ]
+            # Create a flexible pattern that handles various spacing and formatting
+            # This pattern looks for "Figure" (case insensitive) followed by optional spaces
+            # then the figure number, then any of: period, colon, space+capital letter, or end of line
+            flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
 
-            LOGGER.debug("Looking for
-                         main_figure_num,
+            LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
+                         main_figure_num, flexible_pattern)
 
             caption_found = False
             cap_rect = None
 
-            for
-            …
+            # Search for all matches of the flexible pattern
+            for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
+                LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
+                # Check if this is likely an actual caption (not just a reference)
+                match_start = match.start()
+                match_end = match.end()
+
+                # Get surrounding context
+                context_start = max(0, match_start - 50)
+                context_end = min(len(page_text), match_end + 100)
+                context = page_text[context_start:context_end]
+
+                # Check if this looks like a real caption (not just a reference)
+                # Look for words that typically precede figure references
+                preceding_text = page_text[max(0, match_start-20):match_start].lower()
+                if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
+                    LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
+                    continue
+
+                # Check if there's descriptive text after the figure number
+                remaining_text = page_text[match_end:match_end+100].strip()
+
+                # For actual captions, there should be substantial descriptive text
+                if len(remaining_text) < 20:
+                    LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
+                    continue
+
+                # Check if the remaining text looks like a caption (contains descriptive words)
+                first_words = remaining_text[:50].lower()
+                if not any(word in first_words for word in ['detailed', 'representative', 'shows', 'comparison',
+                                                            'illustrates', 'demonstrates', 'results', 'data',
+                                                            'chromatogram', 'spectra', 'analysis', 'site-directed',
+                                                            'mutagenesis', 'mutants']):
+                    LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
+                    continue
+
+                # Found actual figure caption, get its position
+                caption_text = match.group(0)
+                text_instances = page.search_for(caption_text, quads=False)
+                if text_instances:
+                    cap_rect = text_instances[0]
+                    caption_found = True
+                    LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
+                                ref, caption_text, remaining_text[:50])
+                    break
 
             if not caption_found:
                 # Debug: show what figure-related text is actually on this page
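How the single flexible pattern behaves on a genuine caption versus an inline reference (pattern as in the hunk; strings invented). References like "see Figure 2." would still match the raw pattern and are filtered out by the context checks that follow:

    import re

    main_figure_num = "2"
    flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
    re.findall(flexible_pattern, "Figure 2. Site-directed mutagenesis results")  # ['Figure 2.']
    re.findall(flexible_pattern, "as shown in figure 2b")                        # [] - '2b' ends none of the allowed ways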
@@ -1258,6 +1337,11 @@ class ReactionExtractor:
             page = doc.load_page(page_number)
             page_text = page.get_text()
 
+            # Skip Table of Contents pages
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
+                continue
+
             # Check for actual figure caption first
             if ref.lower().startswith('figure'):
                 figure_num = ref.replace('Figure ', '').replace('figure ', '')
debase-0.5.0.dist-info/RECORD
ADDED
@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=sJMwhIVyUE0G4qRHUUpEgw2beNe5jCSb9uQVOTV6krw,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
+debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
+debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
+debase/reaction_info_extractor.py,sha256=8ilu5o2FbXTV9R1Nhxd4m4TdgHOd6GsC3rxxHvqu9f4,165555
+debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.5.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.5.0.dist-info/METADATA,sha256=2Csgtf4gF8egVAvq8CsY4jpad2yWw_6c1iuOj55L5n8,4047
+debase-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.5.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.5.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.5.0.dist-info/RECORD,,
debase-0.4.5.dist-info/RECORD
DELETED
@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
-debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
-debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
-debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
-debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
-debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.4.5.dist-info/RECORD,,
{debase-0.4.5.dist-info → debase-0.5.0.dist-info}/WHEEL: file without changes
{debase-0.4.5.dist-info → debase-0.5.0.dist-info}/entry_points.txt: file without changes
{debase-0.4.5.dist-info → debase-0.5.0.dist-info}/licenses/LICENSE: file without changes
{debase-0.4.5.dist-info → debase-0.5.0.dist-info}/top_level.txt: file without changes