debase 0.4.5__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.4.5"
+__version__ = "0.5.0"
debase/cleanup_sequence.py CHANGED
@@ -30,6 +30,27 @@ except ImportError:  # pragma: no cover
 # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
 
 VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*")  # Include * for stop codons
+VALID_DNA_BASES = set("ACGT")
+
+# Genetic code table for DNA to amino acid translation
+GENETIC_CODE = {
+    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
+    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
+    'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+    'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+    'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+    'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+    'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+    'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+    'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+    'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
+}
 
 # Gemini API configuration
 GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
@@ -182,6 +203,44 @@ class SequenceManipulator:
         """Validate that a sequence contains only valid amino acids."""
         return all(aa in VALID_AMINO_ACIDS for aa in seq.upper())
 
+    @staticmethod
+    def is_dna_sequence(seq: str) -> bool:
+        """Check if a sequence is DNA (contains only ACGT)."""
+        seq_upper = seq.upper().replace(" ", "").replace("\n", "")
+        return all(base in VALID_DNA_BASES for base in seq_upper) and len(seq_upper) > 0
+
+    @staticmethod
+    def translate_dna_to_protein(dna_seq: str) -> str:
+        """Translate DNA sequence to protein sequence.
+
+        Args:
+            dna_seq: DNA sequence string
+
+        Returns:
+            Protein sequence string
+        """
+        # Clean the DNA sequence
+        dna_seq = dna_seq.upper().replace(" ", "").replace("\n", "")
+
+        # Check if sequence length is multiple of 3
+        if len(dna_seq) % 3 != 0:
+            log.warning(f"DNA sequence length ({len(dna_seq)}) is not a multiple of 3. Truncating to nearest codon.")
+            dna_seq = dna_seq[:-(len(dna_seq) % 3)]
+
+        protein_seq = []
+        for i in range(0, len(dna_seq), 3):
+            codon = dna_seq[i:i+3]
+            if len(codon) == 3:
+                # Handle unknown codons (with N or other non-standard bases)
+                if codon in GENETIC_CODE:
+                    protein_seq.append(GENETIC_CODE[codon])
+                else:
+                    # If codon contains non-standard bases, add 'X' for unknown amino acid
+                    protein_seq.append('X')
+                    log.debug(f"Unknown codon '{codon}' at position {i}, using 'X' for unknown amino acid")
+
+        return ''.join(protein_seq)
+
     @staticmethod
     def determine_indexing(parent_seq: str, mutations: List[Mutation]) -> int:
         """Determine whether mutations use 0-based or 1-based indexing."""
@@ -1141,6 +1200,9 @@ class SequenceProcessor:
         # Detect and handle column format automatically
         self._normalize_columns()
 
+        # Translate DNA sequences to protein sequences if needed
+        self._translate_dna_sequences()
+
         log.info(
             f"Loaded {len(self.df)} rows, "
             f"{sum(self.df['protein_sequence'].str.strip() == '')} empty sequences"
@@ -1153,6 +1215,67 @@ class SequenceProcessor:
         # Initialize generator
         self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
 
+    def _translate_dna_sequences(self) -> None:
+        """Translate DNA sequences to protein sequences if no amino acid sequences exist."""
+        manipulator = SequenceManipulator()
+
+        # First check if ANY sequences are amino acid sequences
+        has_amino_acid = False
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if not manipulator.is_dna_sequence(seq):
+                    has_amino_acid = True
+                    break
+
+        # If we found amino acid sequences, don't translate anything
+        if has_amino_acid:
+            log.info("Found amino acid sequences in data, skipping DNA translation")
+            return
+
+        # No amino acid sequences found, check for DNA sequences in dna_seq column
+        if "dna_seq" in self.df.columns:
+            dna_count = 0
+            for idx, row in self.df.iterrows():
+                protein_seq = str(row.get("protein_sequence", "")).strip()
+                dna_seq = str(row.get("dna_seq", "")).strip()
+
+                # If protein_sequence is empty but dna_seq has content, translate it
+                if (not protein_seq or protein_seq.lower() in ["nan", "none", ""]) and \
+                   (dna_seq and dna_seq.lower() not in ["nan", "none", ""]):
+                    if manipulator.is_dna_sequence(dna_seq):
+                        # Translate DNA to protein
+                        translated_seq = manipulator.translate_dna_to_protein(dna_seq)
+                        self.df.at[idx, "protein_sequence"] = translated_seq
+
+                        # Add flag to indicate this was translated from DNA
+                        if "flag" not in self.df.columns:
+                            self.df["flag"] = ""
+                        existing_flag = str(self.df.at[idx, "flag"]).strip()
+                        self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                        dna_count += 1
+
+            if dna_count > 0:
+                log.info(f"Translated {dna_count} DNA sequences from dna_seq column to protein sequences")
+
+        # Also check if DNA sequences are mistakenly in protein_sequence column
+        dna_count = 0
+        for idx, row in self.df.iterrows():
+            seq = str(row.get("protein_sequence", "")).strip()
+            if seq and seq.lower() not in ["nan", "none", ""]:
+                if manipulator.is_dna_sequence(seq):
+                    # Translate DNA to protein
+                    protein_seq = manipulator.translate_dna_to_protein(seq)
+                    self.df.at[idx, "protein_sequence"] = protein_seq
+
+                    # Add flag to indicate this was translated from DNA
+                    existing_flag = str(self.df.at[idx, "flag"]).strip()
+                    self.df.at[idx, "flag"] = f"{existing_flag} dna_translated".strip()
+                    dna_count += 1
+
+        if dna_count > 0:
+            log.info(f"Translated {dna_count} DNA sequences to protein sequences")
+
     def _normalize_columns(self) -> None:
         """Automatically detect and normalize column names from different formats."""
         # Check if this is enzyme_lineage_extractor format
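The pass above only fires when no row already holds an amino-acid sequence; a quick sketch of that gate (hypothetical two-row frame, with is_dna standing in for SequenceManipulator.is_dna_sequence):

    import pandas as pd

    VALID_DNA_BASES = set("ACGT")

    def is_dna(seq: str) -> bool:  # mirrors is_dna_sequence above
        s = seq.upper().replace(" ", "").replace("\n", "")
        return len(s) > 0 and set(s) <= VALID_DNA_BASES

    df = pd.DataFrame({"protein_sequence": ["ATGGCTTAA", "MKLV"]})
    has_amino_acid = any(
        not is_dna(s)
        for s in df["protein_sequence"].astype(str).str.strip()
        if s and s.lower() not in ("nan", "none")
    )
    print(has_amino_acid)  # True -> the translation pass is skipped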
debase/enzyme_lineage_extractor.py CHANGED
@@ -24,6 +24,7 @@ import pandas as pd
 import networkx as nx  # light dependency, used only for generation inference
 
 import os
+import fitz
 import re
 import json
 import time
@@ -460,8 +461,32 @@ def get_model():
     if not api_key:
         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
     _genai.configure(api_key=api_key)
-    # Positional constructor arg works for both SDK flavors
-    return _genai.GenerativeModel(MODEL_NAME)
+
+    # Create generation config to optimize performance and costs
+    generation_config = {
+        "temperature": 0.0,  # Deterministic: always pick the most likely token
+        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
+        "top_k": 1,  # Only consider the single most likely token
+        "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+    }
+
+    # For Gemini 2.5 Flash, disable thinking tokens to save costs
+    # thinking_budget=0 disables thinking, -1 enables dynamic thinking (default)
+    # Only add if SDK supports it to maintain compatibility
+    try:
+        # Test if thinking_budget is supported by making a minimal API call
+        test_config = {"thinking_budget": 0, "max_output_tokens": 10}
+        test_model = _genai.GenerativeModel(MODEL_NAME, generation_config=test_config)
+        # Actually test the API call to see if thinking_budget is supported
+        test_response = test_model.generate_content("Return 'OK'")
+        # If no error, add thinking_budget to main config
+        generation_config["thinking_budget"] = 0
+        log.debug("Disabled thinking tokens (thinking_budget=0)")
+    except Exception as e:
+        # SDK doesn't support thinking_budget, continue without it
+        log.debug(f"thinking_budget not supported: {e}")
+
+    return _genai.GenerativeModel(MODEL_NAME, generation_config=generation_config)
 
 # === 5.3 Unified call helper ----------------------------------------------
 
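The probe pattern above (try an optional config key on a throwaway call, keep it only if the SDK accepts it) generalises to any optional key; a hedged sketch, assuming the same google.generativeai surface used in this diff and a hypothetical probe_config_key helper:

    # Capability probe: keep an optional generation-config key only if a
    # minimal test call succeeds with it. `genai` and model_name as above.
    def probe_config_key(genai, model_name, base_config, key, value):
        trial = {key: value, "max_output_tokens": 10}
        try:
            genai.GenerativeModel(model_name, generation_config=trial) \
                 .generate_content("Return 'OK'")
            base_config[key] = value   # accepted by this SDK version
        except Exception:
            pass                       # unsupported: leave config unchanged
        return base_config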
@@ -728,22 +753,24 @@ Return a JSON object with:
 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
 {campaign_context}
-Given the following article text, list up to {max_results} *locations* (page
-numbers, figure/table IDs, or section headings) that you would review first to
-find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced){campaign_specific}.
+Given the following article text, list up to {max_results} *locations* (figure/table IDs
+or section headings) that you would review first to find the COMPLETE evolutionary
+lineage of enzyme variants (i.e. which variant came from which parent and what
+mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
+ensure the location you return are actually lineage location with variants and mutations.
 
 Respond with a JSON array of objects, each containing:
-- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
-- "type": one of "table", "figure", "text", "section"
+- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
 {campaign_field}
-IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
+IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
+NOT page numbers. Focus on the actual figure/table titles and numbers.
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than figure showing complete variant lineages.
-Text sections is used when no suitable tables/figurews exist.
+mutation lists should be ranked higher than figures showing complete variant lineages.
+Sections are used when no suitable tables/figures exist.
 
 Don't include oligonucleotide results or result from only one round.
 
@@ -1713,7 +1740,6 @@ def get_lineage(
     for pdf_path in pdf_paths:
         # Extract first few pages looking for TOC
         try:
-            import fitz  # PyMuPDF
             doc = fitz.open(pdf_path)
             toc_text = ""
             for page_num in range(min(5, doc.page_count)):  # First 5 pages
@@ -2011,7 +2037,7 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non
 
 # --- 7.2 Page-based extraction helper ---------------------------------------
 def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
-    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+    """Extract plain text sequence using Gemini with 6 attempts, returning most common result.
 
     Args:
         prompt: The prompt to send to Gemini
@@ -2019,12 +2045,12 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         context: Additional context for logging (e.g., "validation" or "extraction")
 
     Returns:
-        The validated sequence or None if no consensus
+        The most common sequence or None if all attempts failed
     """
     sequences = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6
 
-    # Try up to 5 times
+    # Try 6 times
    for attempt in range(max_attempts):
         try:
             response = model.generate_content(prompt)
@@ -2050,38 +2076,14 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         except Exception as e:
             log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
             sequences.append("ERROR")
-
-        # Check for early consensus after 2 attempts
-        if len(sequences) == 2:
-            # Clean sequences before comparison
-            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
-            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
-
-            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus reached after 2 attempts")
-                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
-            else:
-                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")
 
-    # After all attempts, find consensus
+    # After all attempts, find most common result
     valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]
 
     if not valid_sequences:
         log.error(f"All {max_attempts} {context} attempts failed")
         return None
 
-    # Find any matching pair
-    for i in range(len(sequences)):
-        for j in range(i + 1, len(sequences)):
-            # Clean sequences before comparison
-            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
-            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
-
-            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
-                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
-
-    # If no exact match, use adaptive validation
     # Count occurrences of each valid sequence
     sequence_counts = {}
     for seq in valid_sequences:
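The rewritten helper drops the pairwise early-consensus checks in favour of a simple plurality vote over the cleaned attempts; the counting step is equivalent to this sketch (sample data hypothetical):

    from collections import Counter

    attempts = ["MKL V", "MKLV", "ERROR", "MKLV", "INVALID", "MKLA"]
    cleaned = [s.replace(" ", "").replace("\n", "")
               for s in attempts if s not in ("INVALID", "ERROR")]
    winner, count = Counter(cleaned).most_common(1)[0]
    print(winner, f"{count}/{len(attempts)}")  # MKLV 3/6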
@@ -2090,80 +2092,16 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         seq_clean = seq.replace(" ", "").replace("\n", "")
         sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1
 
-    # Return the most common sequence if it appears at least twice
+    # Return the most common sequence
     if sequence_counts:
         most_common = max(sequence_counts.items(), key=lambda x: x[1])
-        if most_common[1] >= 2:
-            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
-            return most_common[0]
+        log.info(f"Gemini {context} most common: sequence appeared {most_common[1]}/{max_attempts} times")
+        return most_common[0]
 
-    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    log.warning(f"Gemini {context} no valid sequences after {max_attempts} attempts")
     return None
 
 
-def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
-    """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
-
-    # Extract mutations from variants
-    mutations = []
-    for variant in variants:
-        if variant.mutations:
-            mutations.extend(variant.mutations)
-
-    if not mutations:
-        return None
-
-    # Take a sample of mutations for validation
-    sample_mutations = mutations[:10]  # Check first 10 mutations
-
-    # First do a quick local check for obvious inconsistencies
-    local_issues = []
-    for mutation in sample_mutations:
-        if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
-            pos = mutation.position - 1  # Convert to 0-indexed
-            if 0 <= pos < len(sequence):
-                actual_aa = sequence[pos]
-                expected_aa = mutation.original
-                if actual_aa != expected_aa:
-                    local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
-
-    if not local_issues:
-        return None  # No obvious issues found
-
-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
-
-    prompt = f"""
-You are validating a protein sequence that was extracted from a scientific paper.
-The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
-
-Original sequence (length {len(sequence)}):
-{sequence}
-
-Known mutations that should be applicable to this sequence:
-{', '.join(str(m) for m in sample_mutations)}
-
-Potential issues detected:
-{chr(10).join(local_issues)}
-
-Please check if the sequence is consistent with these mutations:
-1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
-2. If you find inconsistencies, suggest the most likely correction
-3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
-4. Pay special attention to consecutive identical amino acids that might be OCR errors
-
-Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
-If you cannot determine the correct sequence, return "UNCERTAIN".
-"""
-
-    # Use triple validation
-    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
-
-    if result == "VALID" or result is None:
-        return None  # No changes needed
-    else:
-        log.info(f"Gemini suggested sequence correction (length {len(result)})")
-        return result
-
 
 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
     """Extract text from a specific page number in the PDFs.
@@ -2331,11 +2269,11 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
 
 SEQUENCE EXTRACTION RULES:
 - Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids, or nucleotides
 - Preserve the exact length and character sequence
 - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids are copied correctly
+- Double-check that consecutive identical amino acids or nucleotides are copied correctly
 
 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
@@ -2356,8 +2294,81 @@ TEXT (may be truncated):
 ```
 """.strip()
 
+def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
+    """
+    Check if two sequence extraction responses match.
+
+    Args:
+        resp1: First response (list of sequences or dict)
+        resp2: Second response (list of sequences or dict)
+
+    Returns:
+        True if responses match, False otherwise
+    """
+    # Handle None cases
+    if resp1 is None or resp2 is None:
+        return False
+
+    # Both should be the same type
+    if type(resp1) != type(resp2):
+        return False
+
+    # If both are lists
+    if isinstance(resp1, list) and isinstance(resp2, list):
+        # Must have same length
+        if len(resp1) != len(resp2):
+            return False
+
+        # Create normalized sequence sets for comparison
+        seq_set1 = set()
+        seq_set2 = set()
+
+        for seq in resp1:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set1.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        for seq in resp2:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set2.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        return seq_set1 == seq_set2
+
+    # If both are dicts, compare normalized content
+    if isinstance(resp1, dict) and isinstance(resp2, dict):
+        # Normalize and compare
+        return json.dumps(resp1, sort_keys=True) == json.dumps(resp2, sort_keys=True)
+
+    return False
+
+
 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+    """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+
+    Can exit early after 2 attempts if the responses match exactly.
 
     Args:
         model: The Gemini model instance
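For illustration, the matcher treats element order as irrelevant and normalises whitespace and case before comparing, so these two hypothetical responses compare equal (assuming the helper above is in scope):

    a = [{"variant_id": "WT", "aa_seq": "MKL V", "dna_seq": None},
         {"variant_id": "M1", "aa_seq": "makv", "dna_seq": None}]
    b = [{"variant_id": "M1", "aa_seq": "MAKV", "dna_seq": None},
         {"variant_id": "WT", "aa_seq": "mklv", "dna_seq": None}]
    print(_check_sequence_responses_match(a, b))  # True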
@@ -2366,12 +2377,12 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         debug_dir: Optional debug directory
 
     Returns:
-        The validated sequence JSON data or None if no consensus
+        The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6
 
-    # Try up to 5 times
+    # Try 6 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2443,167 +2454,69 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             else:
                 raise json.JSONDecodeError("No JSON structure found in response", raw, 0)
 
-            # Store both the original and normalized response
-            normalized_response = _normalize_sequence_response(parsed)
-            responses.append((parsed, normalized_response))
-
-            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+            # Store the response
+            responses.append(parsed)
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
+
+            # Early match detection after 2 attempts
+            if attempt >= 1:  # After 2nd attempt (0-indexed)
+                valid_responses_so_far = [r for r in responses if r is not None]
+                if len(valid_responses_so_far) >= 2:
+                    # Check if the last two valid responses match
+                    if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                        log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                        # Add the matching response 4 more times to simulate consensus
+                        for _ in range(max_attempts - attempt - 1):
+                            responses.append(valid_responses_so_far[-1])
+                        break
 
         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
             responses.append(None)
-
-        # Check for early consensus after 2 attempts
-        if len(responses) == 2:
-            if (responses[0] and responses[1] and
-                _sequences_match(responses[0][1], responses[1][1])):
-                log.info("Sequence extraction consensus reached after 2 attempts")
-                return responses[0][0]  # Return original parsed data
-            else:
-                log.info("Sequence extraction mismatch after 2 attempts - trying third")
 
-    # After all attempts, use adaptive validation
+    # After all attempts, find most common sequences
     valid_responses = [r for r in responses if r is not None]
 
     if not valid_responses:
         log.error(f"All {max_attempts} sequence extraction attempts failed")
         return None
 
-    # First, try to find exact consensus (any matching pair)
-    for i in range(len(valid_responses)):
-        for j in range(i + 1, len(valid_responses)):
-            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
-                log.info(f"Sequence extraction consensus found: attempts with matching content")
-                return valid_responses[i][0]  # Return original parsed data
-
-    # If no exact consensus, use adaptive validation
-    log.info("No exact consensus found, applying adaptive validation...")
-
-    # Find sequences that appear consistently across multiple attempts
-    consistent_sequences = _find_consistent_sequences(valid_responses)
-
-    if consistent_sequences:
-        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
-        return consistent_sequences
-
-    # If still no consensus, use the attempt with the most sequences
-    best_response = max(valid_responses,
-                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
-
-    if best_response and len(best_response[1]) > 0:
-        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
-        return best_response[0]
-
-    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
-    return None
-
-
-def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
-    """Find sequences that appear consistently across multiple extraction attempts.
-
-    Args:
-        valid_responses: List of (original_data, normalized_data) tuples
-
-    Returns:
-        List of consistent sequences with confidence scores, or None if none found
-    """
-    if not valid_responses:
-        return None
-
-    # Count how many times each sequence appears
+    # Count occurrences of each individual sequence across all attempts
     sequence_counts = {}
-    sequence_full_data = {}
-
-    for original, normalized in valid_responses:
-        if not isinstance(normalized, list):
-            continue
-
-        for seq in normalized:
-            variant_id = seq.get("variant_id", "")
-            aa_seq = seq.get("aa_seq", "")
-            # Clean sequence before using in key
-            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
-
-            # Create a unique key for this sequence
-            key = f"{variant_id}|{aa_seq_clean}"
-
-            if key not in sequence_counts:
-                sequence_counts[key] = 0
-                sequence_full_data[key] = []
-
-            sequence_counts[key] += 1
-
-            # Find the full data for this sequence from the original response
-            if isinstance(original, list):
-                for orig_seq in original:
-                    if (orig_seq.get("variant_id") == variant_id and
-                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
-                        sequence_full_data[key].append(orig_seq)
-                        break
-
-    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
-    min_appearances = max(2, len(valid_responses) // 2)
-    consistent_sequences = []
-
-    for key, count in sequence_counts.items():
-        if count >= min_appearances:
-            # Use the first occurrence of the full data
-            if sequence_full_data[key]:
-                seq_data = sequence_full_data[key][0].copy()
-                # Add confidence based on how many times it appeared
-                seq_data["confidence"] = count / len(valid_responses)
-                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
-                consistent_sequences.append(seq_data)
-
-    return consistent_sequences if consistent_sequences else None
-
-
-def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
-    """Normalize sequence response for comparison."""
-    if not isinstance(data, list):
-        return []
-
-    normalized = []
-    for item in data:
-        if isinstance(item, dict):
-            # Extract key fields for comparison
-            normalized_item = {
-                "variant_id": item.get("variant_id", ""),
-                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
-                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
-                "confidence": item.get("confidence", 0.0)
-            }
-            normalized.append(normalized_item)
+    for resp in valid_responses:
+        if isinstance(resp, list):
+            for seq in resp:
+                if isinstance(seq, dict) and "variant_id" in seq:
+                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    variant_id = seq.get("variant_id", "")
+                    aa_seq = seq.get("aa_seq", "")
+                    if aa_seq:
+                        aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                    key = f"{variant_id}|{aa_seq}"
+
+                    if key not in sequence_counts:
+                        sequence_counts[key] = {"count": 0, "data": seq}
+                    sequence_counts[key]["count"] += 1
+
+    # Build result with sequences that appear in at least 3 attempts
+    result = []
+    for key, info in sequence_counts.items():
+        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+            seq_data = info["data"].copy()
+            seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
+            result.append(seq_data)
+            log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
+
+    if result:
+        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        return result
 
-    # Sort by variant_id for consistent comparison
-    return sorted(normalized, key=lambda x: x["variant_id"])
+    # If no sequences appear twice, return the most complete attempt
+    best_attempt = max(valid_responses, key=lambda x: len(x) if isinstance(x, list) else 0)
+    log.warning(f"No consensus sequences found, returning best attempt with {len(best_attempt)} sequences")
+    return best_attempt
 
 
-def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
-    """Check if two sequence response lists match on key fields."""
-    if len(seq1) != len(seq2):
-        return False
-
-    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
-        # Compare variant IDs
-        if s1.get("variant_id") != s2.get("variant_id"):
-            return False
-
-        # Compare amino acid sequences (most critical)
-        aa1 = s1.get("aa_seq", "")
-        aa2 = s2.get("aa_seq", "")
-        if aa1 and aa2 and aa1 != aa2:
-            return False
-        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
-            return False
-
-        # Compare DNA sequences if present
-        dna1 = s1.get("dna_seq", "")
-        dna2 = s2.get("dna_seq", "")
-        if dna1 and dna2 and dna1 != dna2:
-            return False
-
-    return True
 
 
 def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
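The consensus step above counts each (variant_id, cleaned aa_seq) pair across all attempts and keeps pairs seen in at least 3 of the 6 tries; in isolation, with hypothetical attempts:

    from collections import defaultdict

    attempts = [
        [{"variant_id": "V1", "aa_seq": "MKLV"}],
        [{"variant_id": "V1", "aa_seq": "MKLV"}, {"variant_id": "V2", "aa_seq": "MALV"}],
        [{"variant_id": "V1", "aa_seq": "MKLV"}],
    ]
    counts = defaultdict(int)
    for resp in attempts:
        for seq in resp:
            counts[(seq["variant_id"], seq["aa_seq"].upper())] += 1
    print([k for k, n in counts.items() if n >= 3])  # [('V1', 'MKLV')]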
@@ -2624,18 +2537,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt
 
-    # Add mutation validation context if we have lineage variants with mutations
-    if lineage_variants:
-        mutation_context = _build_mutation_validation_context(lineage_variants)
-        if mutation_context:
-            prompt = f"""{prompt}
-
-CRITICAL MUTATION VALIDATION:
-{mutation_context}
-
-IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
-For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
-"""
+    # Skip mutation validation context
 
     # Save the complete prompt for debugging
     if debug_dir:
@@ -2662,11 +2564,7 @@ For example, if variant "III" has mutation "A100V" from parent "II", then positi
 
     extracted_sequences = _parse_sequences(data)
 
-    # Post-process: validate sequences against mutations if we have lineage info
-    if lineage_variants:
-        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
-        return validated_sequences
-
+    # Return extracted sequences without mutation validation
     return extracted_sequences
 
 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
@@ -2701,6 +2599,19 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         aa = _clean_seq(entry.get("aa_seq"), _VALID_AA)
         dna = _clean_seq(entry.get("dna_seq"), _VALID_DNA)
 
+        # Check minimum length requirements
+        # AA sequences should be > 50, DNA sequences should be > 150
+        if aa and len(aa) <= 50:
+            log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
+            aa = None
+        if dna and len(dna) <= 150:
+            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+            dna = None
+
+        # Skip if both sequences are too short or missing
+        if not aa and not dna:
+            continue
+
         conf: float | None = None
         if aa:
             conf = sum(c in _VALID_AA for c in aa) / len(aa)
@@ -3118,12 +3029,6 @@ If you cannot determine certain fields, set them to null.
             seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
-                # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
-                if validated_seq:
-                    seq = validated_seq
-                    log.info(f"Sequence validated and potentially corrected by Gemini")
-
                 # Map to the first variant or wild-type
                 wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
                 if wt_variant:
@@ -3427,7 +3332,7 @@ def _merge_lineage_and_sequences(
     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")
 
     # 3. If we have unmatched sequences and a model, use Gemini to match
-    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+    if model and len(df_seq) > 0 and (df['aa_seq'].isna().any() or df['dna_seq'].isna().any()):
         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
@@ -3442,14 +3347,9 @@ def _merge_lineage_and_sequences(
             log.info("Using Gemini to match variants")
 
             # Build prompt for Gemini
-            prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+            prompt = f"""Match enzyme variant IDs between two lists from the same paper using your best judgment.
 
-Papers often use different naming conventions for the same variant:
-- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
-- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
-
-Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
-use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
+These IDs come from different sections of the paper and may use different naming conventions for the same variant.
 
 Lineage variant IDs (need sequences):
 {json.dumps(unmatched_lineage_ids)}
@@ -3457,8 +3357,13 @@ Lineage variant IDs (need sequences):
 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
 
+IMPORTANT: A variant with mutations (indicated by mutation codes like letters and numbers after an underscore or space) is a DIFFERENT enzyme from its parent. Do not match mutation variants to their base sequences - they are distinct entities with different sequences due to the mutations.
+
+Only match variants that represent the SAME enzyme, accounting for different naming conventions between sections.
+
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
+Only include matches you are confident represent the same variant.
 """
 
             try:
@@ -3738,16 +3643,27 @@ def run_pipeline(
     # 4. Extract sequences (Section 7) ----------------------------------------
     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)
 
-    # 4a. Try PDB extraction if no sequences found -----------------------------
-    # Check if we need PDB sequences (no sequences or only partial sequences)
-    MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
-    needs_pdb = (not sequences or
-                 all(s.aa_seq is None or (s.aa_seq and len(s.aa_seq) < MIN_PROTEIN_LENGTH)
-                     for s in sequences))
+    # 4a. First try to merge extracted sequences with lineage using Gemini matching
+    # This allows fuzzy matching of complex variant IDs before external lookups
+    doi = extract_doi(manuscript)
+    df_merged = merge_and_score(lineage, sequences, doi, model)
+
+    # 4b. Check if ALL variants are missing sequences after merging
+    # Only try external sources if no sequences were successfully matched
+    all_missing_sequences = True
+    if 'aa_seq' in df_merged.columns or 'dna_seq' in df_merged.columns:
+        for _, row in df_merged.iterrows():
+            has_aa = pd.notna(row.get('aa_seq'))
+            has_dna = pd.notna(row.get('dna_seq'))
+            if has_aa or has_dna:
+                all_missing_sequences = False
+                break
 
-    if needs_pdb:
-        log.info("No full-length sequences found in paper (only partial sequences < %d AA), attempting PDB extraction...",
-                 MIN_PROTEIN_LENGTH)
+    if all_missing_sequences:
+        MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
+        MIN_DNA_LENGTH = 150  # DNA sequences should be >150 nt
+        log.info("No full-length sequences found in paper (only partial sequences < %d AA or < %d nt), attempting PDB extraction...",
+                 MIN_PROTEIN_LENGTH, MIN_DNA_LENGTH)
 
         # Extract PDB IDs from all PDFs
         pdb_ids = []
@@ -3785,7 +3701,13 @@ def run_pipeline(
                     log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
 
             if pdb_seq_blocks:
-                sequences = pdb_seq_blocks
+                # Update the dataframe with PDB sequences
+                for seq_block in pdb_seq_blocks:
+                    mask = df_merged['variant_id'] == seq_block.variant_id
+                    if mask.any():
+                        df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                        df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                        df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
                 log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                 break
             else:
@@ -3793,8 +3715,15 @@ def run_pipeline(
         else:
             log.warning("No PDB IDs found in paper")
 
-    # 4b. If still no sequences, try Gemini extraction as last resort
-    if not sequences or all(not s.aa_seq for s in sequences):
+    # 4c. If still no sequences after PDB, try Gemini extraction as last resort
+    # Re-check if all variants are still missing sequences
+    still_all_missing = True
+    for _, row in df_merged.iterrows():
+        if pd.notna(row.get('aa_seq')) or pd.notna(row.get('dna_seq')):
+            still_all_missing = False
+            break
+
+    if still_all_missing:
         log.info("No sequences from PDB, attempting Gemini-based extraction...")
 
         gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
@@ -3818,14 +3747,19 @@ def run_pipeline(
                 log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")
 
         if gemini_seq_blocks:
-            sequences = gemini_seq_blocks
+            # Update the dataframe with Gemini/UniProt sequences
+            for seq_block in gemini_seq_blocks:
+                mask = df_merged['variant_id'] == seq_block.variant_id
+                if mask.any():
+                    df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                    df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                    df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'Gemini/UniProt')
             log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
         else:
             log.warning("Failed to extract sequences via Gemini")
 
-    # 5. Merge & score (Section 8) --------------------------------------------
-    doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi, model)
+    # 5. Use the merged dataframe (already merged above)
+    df_final = df_merged
 
     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
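Both fallbacks now write into the already-merged frame with the same mask-and-assign pattern rather than replacing the sequence list wholesale; reduced to its core (columns and sample values hypothetical):

    import pandas as pd

    df_merged = pd.DataFrame({"variant_id": ["WT", "M1"], "aa_seq": [None, None]})
    fallback = {"variant_id": "WT", "aa_seq": "MKLV", "source": "PDB"}

    mask = df_merged["variant_id"] == fallback["variant_id"]
    if mask.any():  # only touch rows for variants the fallback actually matched
        df_merged.loc[mask, "aa_seq"] = fallback["aa_seq"]
        df_merged.loc[mask, "seq_source"] = fallback["source"]
    print(df_merged.to_string(index=False))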
debase/reaction_info_extractor.py CHANGED
@@ -54,11 +54,11 @@ class Config:
     """Centralised tunables so tests can override them easily."""
 
     model_name: str = "gemini-2.5-flash"
-    location_temperature: float = 0.2
+    location_temperature: float = 0.0
     extract_temperature: float = 0.0
     model_reaction_temperature: float = 0.0
     top_p: float = 1.0
-    max_tokens: int = 12288  # Increased 3x from 4096
+    max_tokens: int = 12288
     pdf_cache_size: int = 8
     retries: int = 2
 
@@ -778,50 +778,62 @@ class ReactionExtractor:
     # ------------------------------------------------------------------
 
     def _collect_captions_and_titles(self) -> str:
-        # Pattern to match Table or Figure with optional leading whitespace
+        # Pattern to match Table or Figure with optional leading whitespace and page numbers
         # This catches all variations including "Supplementary Table", "Table S 2", "Figure S1", etc.
-        # Also handles cases where there's whitespace before the caption
-        cap_pattern = re.compile(r"^\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
+        # Also handles cases where there's whitespace or page numbers before the caption
+        cap_pattern = re.compile(r"^[\s\d]*\s*(Supplementary\s+Table|Table|Figure).*", re.I | re.M)
         captions: List[str] = []
 
-        # Collect from all pages
-        all_text = "\n".join(self.all_pages)
-
-        # Find all figure/table captions with more context
-        for match in cap_pattern.finditer(all_text):
-            caption_start = match.start()
-
-            # Include some context before the caption (up to 200 chars)
-            context_start = max(0, caption_start - 200)
-            # Find the start of the sentence/paragraph before the caption
-            context_text = all_text[context_start:caption_start]
-            last_period = context_text.rfind('.')
-            if last_period != -1:
-                context_start = context_start + last_period + 1
-
-            # For tables, include much more content after the caption to show actual table data
-            # For figures, include more content to ensure complete captions
-            is_table = 'table' in match.group(1).lower()
-            max_chars = 8000 if is_table else 5000
-
-            # Get up to max_chars or until double newline (but ensure we get complete caption)
-            # First, try to find the end of the caption sentence
-            caption_end = caption_start
-            period_pos = all_text.find('. ', caption_start)
-            if period_pos != -1 and period_pos < caption_start + 1000:
-                # Include at least to the end of the caption sentence
-                caption_end = period_pos + 1
-
-            # Then extend to include more context or until double newline
-            double_newline_pos = all_text.find("\n\n", caption_end)
-            if double_newline_pos == -1 or double_newline_pos - caption_start > max_chars:
-                caption_end = caption_start + max_chars
-            else:
-                caption_end = double_newline_pos
-
-            # Include the context and full caption with table content
-            full_caption = all_text[context_start:caption_end].strip()
-            captions.append(full_caption)
+        # Process each page individually to avoid TOC entries
+        for page_idx, page_text in enumerate(self.all_pages):
+            # Skip if this looks like a TOC page
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d for caption collection", page_idx + 1)
+                continue
+
+            # Find all figure/table captions with more context
+            for match in cap_pattern.finditer(page_text):
+                caption_line = match.group(0).strip()
+
+                # Skip if this looks like a TOC entry (has page number at end or dots)
+                if re.search(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$', caption_line):
+                    LOGGER.debug("Skipping TOC-style entry: %s", caption_line[:50])
+                    continue
+
+                caption_start = match.start()
+
+                # For tables, include much more content after the caption to show actual table data
+                # For figures, include substantial content to show what the figure contains
+                is_table = 'table' in match.group(1).lower()
+                # Increase context for figures to ensure we capture descriptive text
+                max_chars = 8000 if is_table else 3000
+
+                # Get context including text before and after the caption
+                # Include some text before to help identify the location
+                context_before = max(0, caption_start - 200)
+                context_after = min(len(page_text), caption_start + max_chars)
+
+                # Extract the full context
+                full_context = page_text[context_before:context_after].strip()
+
+                # Find the actual caption text (not just the "Figure X" part)
+                # Look for text after the figure/table identifier that forms the caption
+                caption_text = page_text[caption_start:context_after]
+
+                # Try to find the end of the caption (usually ends with a period before next paragraph)
+                caption_end_match = re.search(r'^[^\n]+\.[^\n]*(?:\n\n|\n(?=[A-Z]))', caption_text)
+                if caption_end_match:
+                    actual_caption = caption_text[:caption_end_match.end()].strip()
+                else:
+                    # Fallback: take first few lines
+                    lines = caption_text.split('\n')
+                    actual_caption = '\n'.join(lines[:3]).strip()
+
+                # Ensure we have meaningful content, not just the figure number
+                if len(actual_caption) > 20:  # More than just "Figure S23."
+                    # For the prompt, include the full context to help identify what's in the figure
+                    caption_with_context = f"{actual_caption}\n\n[Context around figure/table:]\n{full_context}"
+                    captions.append(caption_with_context)
 
         # Also look for SI section titles
         si_titles = re.findall(r"^S\d+\s+[A-Z].{3,80}", "\n".join(self.si_pages), re.M)
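The per-line TOC filter above rejects dotted leaders and trailing page numbers; the same regex in isolation (sample captions hypothetical):

    import re

    toc_like = re.compile(r'\.{3,}|\.{2,}\s*\d+\s*$|\s+\d+\s*$')
    print(bool(toc_like.search("Table S1. Variant lineage ........ 12")))       # True
    print(bool(toc_like.search("Table S1. Lineage of all evolved variants.")))  # False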
@@ -1058,6 +1070,39 @@ class ReactionExtractor:
     # 6.2 Figure / Table context helpers
     # ------------------------------------------------------------------
 
+    def _is_toc_page(self, page_text: str) -> bool:
+        """Detect if a page is a Table of Contents page."""
+        # Look for common TOC indicators
+        toc_indicators = [
+            "table of contents",
+            "contents",
+            r"\.{5,}",  # Multiple dots (common in TOCs)
+            r"\d+\s*\n\s*\d+\s*\n\s*\d+",  # Multiple page numbers in sequence
+        ]
+
+        # Count how many TOC-like patterns we find
+        toc_score = 0
+        text_lower = page_text.lower()
+
+        # Check for explicit TOC title
+        if "table of contents" in text_lower or (
+            "contents" in text_lower and text_lower.index("contents") < 200
+        ):
+            toc_score += 3
+
+        # Check for multiple figure/table references with page numbers
+        figure_with_page = re.findall(r'figure\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+        table_with_page = re.findall(r'table\s+[sS]?\d+.*?\.{2,}.*?\d+', text_lower)
+
+        if len(figure_with_page) + len(table_with_page) > 5:
+            toc_score += 2
+
+        # Check for many dotted lines
+        if len(re.findall(r'\.{5,}', page_text)) > 3:
+            toc_score += 1
+
+        return toc_score >= 2
+
     def _page_with_reference(self, ref_id: str) -> Optional[str]:
         for page in self.all_pages:
             if ref_id.lower() in page.lower():
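As a rough standalone check of the scoring heuristic above (same rules, hypothetical page text, function name illustrative):

    import re

    def is_toc_page(page_text: str) -> bool:  # mirrors _is_toc_page above
        score = 0
        low = page_text.lower()
        if "table of contents" in low or (
            "contents" in low and low.index("contents") < 200
        ):
            score += 3  # explicit TOC title near the top
        entries = re.findall(r'(?:figure|table)\s+[sS]?\d+.*?\.{2,}.*?\d+', low)
        if len(entries) > 5:
            score += 2  # many "Figure Sn ... 12"-style entries
        if len(re.findall(r'\.{5,}', page_text)) > 3:
            score += 1  # dotted leader lines
        return score >= 2

    page = "Contents\n" + "\n".join(f"Figure S{i} ...... {i + 2}" for i in range(1, 8))
    print(is_toc_page(page))  # True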
@@ -1131,9 +1176,14 @@ class ReactionExtractor:
                 LOGGER.debug("Checking page %d of %s document (text length: %d chars)",
                              page_number + 1, doc_name, len(page_text))
 
-                # Look for figure caption pattern: "Figure X." or "Figure X:" or "Figure X " at start of line
-                # For subfigures like "Figure 1C", extract the main figure "Figure 1"
-                figure_num = ref.replace('Figure ', '').replace('figure ', '')
+                # Skip Table of Contents pages
+                if self._is_toc_page(page_text):
+                    LOGGER.debug("Skipping page %d - detected as Table of Contents", page_number + 1)
+                    continue
+
+                # Look for figure caption pattern more flexibly
+                # Normalize the reference to handle variations
+                figure_num = ref.replace('Figure', '').replace('figure', '').strip()
 
                 # Extract main figure number from subfigure (e.g., "1C" -> "1")
                 main_figure_num = re.match(r'^(\d+)', figure_num)
@@ -1142,33 +1192,62 @@ class ReactionExtractor:
                 else:
                     main_figure_num = figure_num
 
-                caption_patterns = [
-                    rf"^Figure\s+{re.escape(main_figure_num)}\.",  # "Figure 1."
-                    rf"^Figure\s+{re.escape(main_figure_num)}:",  # "Figure 1:"
-                    rf"^Figure\s+{re.escape(main_figure_num)}\s+[A-Z]",  # "Figure 1 Performance"
-                    rf"^Figure\s+{re.escape(main_figure_num)}\s*$",  # "Figure 1" at end of line
-                    rf"Figure\s+{re.escape(main_figure_num)}\s*\.",  # "Figure 1." anywhere in line
-                    rf"Figure\s+{re.escape(main_figure_num)}\s*:",  # "Figure 1:" anywhere in line
-                ]
+                # Create a flexible pattern that handles various spacing and formatting
+                # This pattern looks for "Figure" (case insensitive) followed by optional spaces
+                # then the figure number, then any of: period, colon, space+capital letter, or end of line
+                flexible_pattern = rf"(?i)figure\s*{re.escape(main_figure_num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
 
-                LOGGER.debug("Looking for main figure caption '%s' (from ref '%s') with patterns: %s",
-                             main_figure_num, ref, caption_patterns)
+                LOGGER.debug("Looking for figure caption '%s' with flexible pattern: %s",
+                             main_figure_num, flexible_pattern)
 
                 caption_found = False
                 cap_rect = None
 
-                for pattern in caption_patterns:
-                    matches = re.search(pattern, page_text, re.MULTILINE | re.IGNORECASE)
-                    if matches:
-                        LOGGER.debug("Found figure caption match with pattern '%s': %s", pattern, matches.group(0))
-                        # Found actual figure caption, get its position
-                        caption_text = matches.group(0)
-                        text_instances = page.search_for(caption_text, quads=False)
-                        if text_instances:
-                            cap_rect = text_instances[0]
-                            caption_found = True
-                            LOGGER.info("Found actual caption for %s: '%s'", ref, caption_text)
-                            break
+                # Search for all matches of the flexible pattern
+                for match in re.finditer(flexible_pattern, page_text, re.MULTILINE):
+                    LOGGER.debug("Found potential figure caption: %s at position %d", match.group(0), match.start())
+                    # Check if this is likely an actual caption (not just a reference)
+                    match_start = match.start()
+                    match_end = match.end()
+
+                    # Get surrounding context
+                    context_start = max(0, match_start - 50)
+                    context_end = min(len(page_text), match_end + 100)
+                    context = page_text[context_start:context_end]
+
+                    # Check if this looks like a real caption (not just a reference)
+                    # Look for words that typically precede figure references
+                    preceding_text = page_text[max(0, match_start-20):match_start].lower()
+                    if any(word in preceding_text for word in ['see ', 'in ', 'from ', 'shown in ', 'refer to ']):
+                        LOGGER.debug("Skipping reference preceded by: %s", preceding_text.strip())
+                        continue
+
+                    # Check if there's descriptive text after the figure number
+                    remaining_text = page_text[match_end:match_end+100].strip()
+
+                    # For actual captions, there should be substantial descriptive text
+                    if len(remaining_text) < 20:
+                        LOGGER.debug("Skipping potential reference: insufficient text after (%d chars)", len(remaining_text))
+                        continue
+
+                    # Check if the remaining text looks like a caption (contains descriptive words)
+                    first_words = remaining_text[:50].lower()
+                    if not any(word in first_words for word in ['detailed', 'representative', 'shows', 'comparison',
+                                                                'illustrates', 'demonstrates', 'results', 'data',
+                                                                'chromatogram', 'spectra', 'analysis', 'site-directed',
+                                                                'mutagenesis', 'mutants']):
+                        LOGGER.debug("Skipping: doesn't look like caption text: %s", first_words)
+                        continue
+
+                    # Found actual figure caption, get its position
+                    caption_text = match.group(0)
+                    text_instances = page.search_for(caption_text, quads=False)
+                    if text_instances:
+                        cap_rect = text_instances[0]
+                        caption_found = True
+                        LOGGER.info("Found actual caption for %s: '%s' with following text: '%s...'",
+                                    ref, caption_text, remaining_text[:50])
+                        break
 
                 if not caption_found:
                     # Debug: show what figure-related text is actually on this page
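The single flexible pattern above replaces the six fixed alternatives; a quick demonstration of what it does and does not match (sample lines hypothetical):

    import re

    num = "2"
    pattern = rf"(?i)figure\s*{re.escape(num)}(?:\.|:|(?=\s+[A-Z])|\s*$)"
    for line in ["Figure 2. Evolution of activity",
                 "figure 2: lineage overview",
                 "FIGURE 2 Performance summary",
                 "see Figure 2B for details"]:
        print(bool(re.search(pattern, line)), line)
    # True, True, True, False - the subfigure reference on the last line
    # fails every alternative that must follow the number.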
@@ -1258,6 +1337,11 @@ class ReactionExtractor:
             page = doc.load_page(page_number)
             page_text = page.get_text()
 
+            # Skip Table of Contents pages
+            if self._is_toc_page(page_text):
+                LOGGER.debug("Skipping TOC page %d in _find_pages_with_reference", page_number + 1)
+                continue
+
             # Check for actual figure caption first
             if ref.lower().startswith('figure'):
                 figure_num = ref.replace('Figure ', '').replace('figure ', '')
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.4.5
+Version: 0.5.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team
@@ -0,0 +1,16 @@
+debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
+debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
+debase/_version.py,sha256=sJMwhIVyUE0G4qRHUUpEgw2beNe5jCSb9uQVOTV6krw,49
+debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
+debase/cleanup_sequence.py,sha256=qKAou871Eri4SDQMz-XCfD3D2BuuINxSxzJZMACJ7p4,73313
+debase/enzyme_lineage_extractor.py,sha256=C2rVFyM84TvDy7hvk_xIeVSdh1F6WSe4QQB8B8QrPC4,168026
+debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
+debase/reaction_info_extractor.py,sha256=8ilu5o2FbXTV9R1Nhxd4m4TdgHOd6GsC3rxxHvqu9f4,165555
+debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
+debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
+debase-0.5.0.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
+debase-0.5.0.dist-info/METADATA,sha256=2Csgtf4gF8egVAvq8CsY4jpad2yWw_6c1iuOj55L5n8,4047
+debase-0.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+debase-0.5.0.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
+debase-0.5.0.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
+debase-0.5.0.dist-info/RECORD,,
@@ -1,16 +0,0 @@
-debase/__init__.py,sha256=YeKveGj_8fwuu5ozoK2mUU86so_FjiCwsvg1d_lYVZU,586
-debase/__main__.py,sha256=LbxYt2x9TG5Ced7LpzzX_8gkWyXeZSlVHzqHfqAiPwQ,160
-debase/_version.py,sha256=aQmjMn3LxbvC1lgsl7QAKTZYk9rZlRbUZ72_LxKEuIM,49
-debase/build_db.py,sha256=bW574GxsL1BJtDwM19urLbciPcejLzfraXZPpzm09FQ,7167
-debase/cleanup_sequence.py,sha256=zwRZky7vIKmyphThF_hlhQScF0OV9GOPziQvHG0mTnI,67516
-debase/enzyme_lineage_extractor.py,sha256=hPA3r9kEQ0vy4ia9t4lj5m63jJtkslAM-ySsW4WgIVs,170770
-debase/lineage_format.py,sha256=Omb3oug0oEfQLcC_5XsbACvTDV7PFIIlGRtOhxC7Nwo,57844
-debase/reaction_info_extractor.py,sha256=bnAbPtVr52H_GZg0NVdCksHZfAtYuh4WD3RCAhRgU7Y,160833
-debase/substrate_scope_extractor.py,sha256=ydU6iZVRw3fLyQ8kIQs6ZuruBMvM4mMXIeGuPgCUOn4,115956
-debase/wrapper.py,sha256=0z1BRvs3pzuPV_sgJxrBVmX_IXqwX3tB4u0GXdSgR3c,24568
-debase-0.4.5.dist-info/licenses/LICENSE,sha256=5sk9_tcNmr1r2iMIUAiioBo7wo38u8BrPlO7f0seqgE,1075
-debase-0.4.5.dist-info/METADATA,sha256=PaDILdF_IA8qJAF4WHVu0sz1V9ihL_6pJUdoMFa9nRg,4047
-debase-0.4.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-debase-0.4.5.dist-info/entry_points.txt,sha256=hUcxA1b4xORu-HHBFTe9u2KTdbxPzt0dwz95_6JNe9M,48
-debase-0.4.5.dist-info/top_level.txt,sha256=2BUeq-4kmQr0Rhl06AnRzmmZNs8WzBRK9OcJehkcdk8,7
-debase-0.4.5.dist-info/RECORD,,