debase 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,6 +24,7 @@ import pandas as pd
 import networkx as nx  # light dependency, used only for generation inference

 import os
+import fitz
 import re
 import json
 import time
@@ -460,8 +461,32 @@ def get_model():
     if not api_key:
         raise EnvironmentError("Set the GEMINI_API_KEY environment variable.")
     _genai.configure(api_key=api_key)
-    # Positional constructor arg works for both SDK flavors
-    return _genai.GenerativeModel(MODEL_NAME)
+
+    # Create generation config to optimize performance and costs
+    generation_config = {
+        "temperature": 0.0,  # Deterministic: always pick the most likely token
+        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
+        "top_k": 1,  # Only consider the single most likely token
+        "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+    }
+
+    # For Gemini 2.5 Flash, disable thinking tokens to save costs
+    # thinking_budget=0 disables thinking, -1 enables dynamic thinking (default)
+    # Only add if SDK supports it to maintain compatibility
+    try:
+        # Test if thinking_budget is supported by making a minimal API call
+        test_config = {"thinking_budget": 0, "max_output_tokens": 10}
+        test_model = _genai.GenerativeModel(MODEL_NAME, generation_config=test_config)
+        # Actually test the API call to see if thinking_budget is supported
+        test_response = test_model.generate_content("Return 'OK'")
+        # If no error, add thinking_budget to main config
+        generation_config["thinking_budget"] = 0
+        log.debug("Disabled thinking tokens (thinking_budget=0)")
+    except Exception as e:
+        # SDK doesn't support thinking_budget, continue without it
+        log.debug(f"thinking_budget not supported: {e}")
+
+    return _genai.GenerativeModel(MODEL_NAME, generation_config=generation_config)

 # === 5.3 Unified call helper ----------------------------------------------

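Note on the new get_model(): the thinking_budget probe above issues a real generate_content call every time get_model() runs. A minimal sketch of memoizing that probe so the extra request is paid at most once per process; the helper name and caching scheme are illustrative, not part of debase:

    import functools

    @functools.lru_cache(maxsize=1)
    def _probe_thinking_support(model_name: str) -> bool:
        # Returns True if the installed SDK accepts thinking_budget (assumption:
        # unsupported configs raise on construction or on generate_content).
        try:
            probe = _genai.GenerativeModel(
                model_name,
                generation_config={"thinking_budget": 0, "max_output_tokens": 10},
            )
            probe.generate_content("Return 'OK'")
            return True
        except Exception:
            return False

get_model() could then consult _probe_thinking_support(MODEL_NAME) before adding the key, preserving the compatibility fallback while avoiding one probe call per model construction.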
@@ -728,22 +753,24 @@ Return a JSON object with:
 _LINEAGE_LOC_PROMPT = """
 You are an expert reader of protein engineering manuscripts.
 {campaign_context}
-Given the following article text, list up to {max_results} *locations* (page
-numbers, figure/table IDs, or section headings) that you would review first to
-find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
-came from which parent and what mutations were introduced){campaign_specific}.
+Given the following article text, list up to {max_results} *locations* (figure/table IDs
+or section headings) that you would review first to find the COMPLETE evolutionary
+lineage of enzyme variants (i.e. which variant came from which parent and what
+mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
+ensure the location you return are actually lineage location with variants and mutations.

 Respond with a JSON array of objects, each containing:
-- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
-- "type": one of "table", "figure", "text", "section"
+- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
 {campaign_field}
-IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
+IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
+NOT page numbers. Focus on the actual figure/table titles and numbers.

 Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than figure showing complete variant lineages.
-Text sections is used when no suitable tables/figurews exist.
+mutation lists should be ranked higher than figures showing complete variant lineages.
+Sections are used when no suitable tables/figures exist.

 Don't include oligonucleotide results or result from only one round.

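For reference, a reply satisfying the revised prompt would be shaped like this (hypothetical values, written as a Python literal):

    expected_locations = [
        {"location": "Table S1", "type": "table", "confidence": 95,
         "reason": "Lists each variant with its parent and mutations"},
        {"location": "Figure 2B", "type": "figure", "confidence": 70,
         "reason": "Directed-evolution lineage tree"},
    ]

Note that 0.5.0 drops "text" from the allowed types and no longer accepts bare page numbers as locations.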
@@ -1713,7 +1740,6 @@ def get_lineage(
     for pdf_path in pdf_paths:
         # Extract first few pages looking for TOC
         try:
-            import fitz  # PyMuPDF
            doc = fitz.open(pdf_path)
            toc_text = ""
            for page_num in range(min(5, doc.page_count)):  # First 5 pages
@@ -2011,7 +2037,7 @@ def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | Non

 # --- 7.2 Page-based extraction helper ---------------------------------------
 def _extract_plain_sequence_with_triple_validation(prompt: str, model, context: str = "") -> Optional[str]:
-    """Extract plain text sequence using Gemini with adaptive validation (up to 5 attempts).
+    """Extract plain text sequence using Gemini with 6 attempts, returning most common result.

     Args:
         prompt: The prompt to send to Gemini
@@ -2019,12 +2045,12 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         context: Additional context for logging (e.g., "validation" or "extraction")

     Returns:
-        The validated sequence or None if no consensus
+        The most common sequence or None if all attempts failed
     """
     sequences = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6

-    # Try up to 5 times
+    # Try 6 times
     for attempt in range(max_attempts):
         try:
             response = model.generate_content(prompt)
@@ -2050,38 +2076,14 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         except Exception as e:
             log.warning(f"Gemini {context} attempt {attempt + 1} failed: {e}")
             sequences.append("ERROR")
-
-        # Check for early consensus after 2 attempts
-        if len(sequences) == 2:
-            # Clean sequences before comparison
-            seq0_clean = sequences[0].replace(" ", "").replace("\n", "") if sequences[0] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[0]
-            seq1_clean = sequences[1].replace(" ", "").replace("\n", "") if sequences[1] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[1]
-
-            if seq0_clean == seq1_clean and sequences[0] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus reached after 2 attempts")
-                return seq0_clean if seq0_clean not in ["VALID", "UNCERTAIN"] else None
-            else:
-                log.info(f"Gemini {context} mismatch after 2 attempts: {seq0_clean[:20]}... vs {seq1_clean[:20]}... - trying third")

-    # After all attempts, find consensus
+    # After all attempts, find most common result
     valid_sequences = [s for s in sequences if s not in ["INVALID", "ERROR"]]

     if not valid_sequences:
         log.error(f"All {max_attempts} {context} attempts failed")
         return None

-    # Find any matching pair
-    for i in range(len(sequences)):
-        for j in range(i + 1, len(sequences)):
-            # Clean sequences before comparison
-            seq_i_clean = sequences[i].replace(" ", "").replace("\n", "") if sequences[i] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[i]
-            seq_j_clean = sequences[j].replace(" ", "").replace("\n", "") if sequences[j] not in ["INVALID", "ERROR", "VALID", "UNCERTAIN"] else sequences[j]
-
-            if seq_i_clean == seq_j_clean and sequences[i] not in ["INVALID", "ERROR"]:
-                log.info(f"Gemini {context} consensus found: attempts {i+1} and {j+1} match")
-                return seq_i_clean if seq_i_clean not in ["VALID", "UNCERTAIN"] else None
-
-    # If no exact match, use adaptive validation
     # Count occurrences of each valid sequence
     sequence_counts = {}
     for seq in valid_sequences:
@@ -2090,80 +2092,16 @@ def _extract_plain_sequence_with_triple_validation(prompt: str, model, context:
         seq_clean = seq.replace(" ", "").replace("\n", "")
         sequence_counts[seq_clean] = sequence_counts.get(seq_clean, 0) + 1

-    # Return the most common sequence if it appears at least twice
+    # Return the most common sequence
     if sequence_counts:
         most_common = max(sequence_counts.items(), key=lambda x: x[1])
-        if most_common[1] >= 2:
-            log.info(f"Gemini {context} adaptive consensus: sequence appeared {most_common[1]}/{len(sequences)} times")
-            return most_common[0]
+        log.info(f"Gemini {context} most common: sequence appeared {most_common[1]}/{max_attempts} times")
+        return most_common[0]

-    log.warning(f"Gemini {context} no consensus after {max_attempts} attempts")
+    log.warning(f"Gemini {context} no valid sequences after {max_attempts} attempts")
     return None


-def _validate_sequence_against_mutations(sequence: str, variants: List[Variant], lineage_text: str, model) -> Optional[str]:
-    """Validate and potentially correct a sequence using Gemini by checking against known mutations."""
-
-    # Extract mutations from variants
-    mutations = []
-    for variant in variants:
-        if variant.mutations:
-            mutations.extend(variant.mutations)
-
-    if not mutations:
-        return None
-
-    # Take a sample of mutations for validation
-    sample_mutations = mutations[:10]  # Check first 10 mutations
-
-    # First do a quick local check for obvious inconsistencies
-    local_issues = []
-    for mutation in sample_mutations:
-        if hasattr(mutation, 'original') and hasattr(mutation, 'position'):
-            pos = mutation.position - 1  # Convert to 0-indexed
-            if 0 <= pos < len(sequence):
-                actual_aa = sequence[pos]
-                expected_aa = mutation.original
-                if actual_aa != expected_aa:
-                    local_issues.append(f"Position {mutation.position}: expected {expected_aa}, found {actual_aa}")
-
-    if not local_issues:
-        return None  # No obvious issues found
-
-    log.info(f"Found {len(local_issues)} potential sequence issues, asking Gemini for validation with triple-check")
-
-    prompt = f"""
-You are validating a protein sequence that was extracted from a scientific paper.
-The sequence may have OCR errors like duplicated letters (e.g., "II" becoming "III").
-
-Original sequence (length {len(sequence)}):
-{sequence}
-
-Known mutations that should be applicable to this sequence:
-{', '.join(str(m) for m in sample_mutations)}
-
-Potential issues detected:
-{chr(10).join(local_issues)}
-
-Please check if the sequence is consistent with these mutations:
-1. For each mutation (e.g., M263T), check if position 263 (1-indexed) actually has M
-2. If you find inconsistencies, suggest the most likely correction
-3. Common errors include: duplicated letters, missing letters, OCR confusion (like II vs III)
-4. Pay special attention to consecutive identical amino acids that might be OCR errors
-
-Return ONLY the corrected sequence if changes are needed, or "VALID" if no changes are needed.
-If you cannot determine the correct sequence, return "UNCERTAIN".
-"""
-
-    # Use triple validation
-    result = _extract_plain_sequence_with_triple_validation(prompt, model, "validation")
-
-    if result == "VALID" or result is None:
-        return None  # No changes needed
-    else:
-        log.info(f"Gemini suggested sequence correction (length {len(result)})")
-        return result
-

 def _extract_text_from_page(pdf_paths: List[Path], page_num: Union[str, int], skip_si_toc: bool = True) -> str:
     """Extract text from a specific page number in the PDFs.
@@ -2331,11 +2269,11 @@ CRITICAL: Use the EXACT variant identifier as it appears with each sequence:

 SEQUENCE EXTRACTION RULES:
 - Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids
+- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
+- Do NOT add, remove, or modify any amino acids, or nucleotides
 - Preserve the exact length and character sequence
 - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids are copied correctly
+- Double-check that consecutive identical amino acids or nucleotides are copied correctly

 For each variant return:
   * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
@@ -2356,8 +2294,81 @@ TEXT (may be truncated):
 ```
 """.strip()

+def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
+    """
+    Check if two sequence extraction responses match.
+
+    Args:
+        resp1: First response (list of sequences or dict)
+        resp2: Second response (list of sequences or dict)
+
+    Returns:
+        True if responses match, False otherwise
+    """
+    # Handle None cases
+    if resp1 is None or resp2 is None:
+        return False
+
+    # Both should be the same type
+    if type(resp1) != type(resp2):
+        return False
+
+    # If both are lists
+    if isinstance(resp1, list) and isinstance(resp2, list):
+        # Must have same length
+        if len(resp1) != len(resp2):
+            return False
+
+        # Create normalized sequence sets for comparison
+        seq_set1 = set()
+        seq_set2 = set()
+
+        for seq in resp1:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set1.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        for seq in resp2:
+            if isinstance(seq, dict):
+                variant_id = seq.get("variant_id", "")
+                aa_seq = seq.get("aa_seq")
+                dna_seq = seq.get("dna_seq")
+                # Handle None/null values - convert to empty string for comparison
+                if aa_seq is None:
+                    aa_seq = ""
+                else:
+                    aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                if dna_seq is None:
+                    dna_seq = ""
+                else:
+                    dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+                seq_set2.add(f"{variant_id}|{aa_seq}|{dna_seq}")
+
+        return seq_set1 == seq_set2
+
+    # If both are dicts, compare normalized content
+    if isinstance(resp1, dict) and isinstance(resp2, dict):
+        # Normalize and compare
+        return json.dumps(resp1, sort_keys=True) == json.dumps(resp2, sort_keys=True)
+
+    return False
+
+
 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with adaptive validation (up to 5 attempts).
+    """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+
+    Can exit early after 2 attempts if the responses match exactly.

     Args:
         model: The Gemini model instance
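The two normalization loops in _check_sequence_responses_match are byte-for-byte identical. The comparison could be factored through one helper; an illustrative refactor, not the shipped code:

    def _normalize_for_match(resp: list) -> set[str]:
        def clean(s):
            # None-safe whitespace and case normalization
            return (s or "").replace(" ", "").replace("\n", "").upper()
        return {
            f'{seq.get("variant_id", "")}|{clean(seq.get("aa_seq"))}|{clean(seq.get("dna_seq"))}'
            for seq in resp
            if isinstance(seq, dict)
        }

    # The list branch then reduces to:
    # len(resp1) == len(resp2) and _normalize_for_match(resp1) == _normalize_for_match(resp2)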
@@ -2366,12 +2377,12 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         debug_dir: Optional debug directory

     Returns:
-        The validated sequence JSON data or None if no consensus
+        The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 5  # Increased from 3 to 5
+    max_attempts = 6

-    # Try up to 5 times
+    # Try 6 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2443,167 +2454,69 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             else:
                 raise json.JSONDecodeError("No JSON structure found in response", raw, 0)

-            # Store both the original and normalized response
-            normalized_response = _normalize_sequence_response(parsed)
-            responses.append((parsed, normalized_response))
-
-            log.info(f"Sequence extraction attempt {attempt + 1}: {len(normalized_response) if isinstance(normalized_response, list) else 'invalid'} sequences")
+            # Store the response
+            responses.append(parsed)
+            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
+
+            # Early match detection after 2 attempts
+            if attempt >= 1:  # After 2nd attempt (0-indexed)
+                valid_responses_so_far = [r for r in responses if r is not None]
+                if len(valid_responses_so_far) >= 2:
+                    # Check if the last two valid responses match
+                    if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                        log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                        # Add the matching response 4 more times to simulate consensus
+                        for _ in range(max_attempts - attempt - 1):
+                            responses.append(valid_responses_so_far[-1])
+                        break

         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
             responses.append(None)
-
-        # Check for early consensus after 2 attempts
-        if len(responses) == 2:
-            if (responses[0] and responses[1] and
-                _sequences_match(responses[0][1], responses[1][1])):
-                log.info("Sequence extraction consensus reached after 2 attempts")
-                return responses[0][0]  # Return original parsed data
-            else:
-                log.info("Sequence extraction mismatch after 2 attempts - trying third")

-    # After all attempts, use adaptive validation
+    # After all attempts, find most common sequences
     valid_responses = [r for r in responses if r is not None]

     if not valid_responses:
         log.error(f"All {max_attempts} sequence extraction attempts failed")
         return None

-    # First, try to find exact consensus (any matching pair)
-    for i in range(len(valid_responses)):
-        for j in range(i + 1, len(valid_responses)):
-            if _sequences_match(valid_responses[i][1], valid_responses[j][1]):
-                log.info(f"Sequence extraction consensus found: attempts with matching content")
-                return valid_responses[i][0]  # Return original parsed data
-
-    # If no exact consensus, use adaptive validation
-    log.info("No exact consensus found, applying adaptive validation...")
-
-    # Find sequences that appear consistently across multiple attempts
-    consistent_sequences = _find_consistent_sequences(valid_responses)
-
-    if consistent_sequences:
-        log.info(f"Found {len(consistent_sequences)} consistent sequences using adaptive validation")
-        return consistent_sequences
-
-    # If still no consensus, use the attempt with the most sequences
-    best_response = max(valid_responses,
-                        key=lambda r: len(r[1]) if isinstance(r[1], list) else 0)
-
-    if best_response and len(best_response[1]) > 0:
-        log.warning(f"No consensus after {max_attempts} attempts, using best effort with {len(best_response[1])} sequences")
-        return best_response[0]
-
-    log.warning(f"Sequence extraction failed to find any valid sequences after {max_attempts} attempts")
-    return None
-
-
-def _find_consistent_sequences(valid_responses: List[Tuple[Any, List[Dict[str, Any]]]]) -> Optional[List[Dict[str, Any]]]:
-    """Find sequences that appear consistently across multiple extraction attempts.
-
-    Args:
-        valid_responses: List of (original_data, normalized_data) tuples
-
-    Returns:
-        List of consistent sequences with confidence scores, or None if none found
-    """
-    if not valid_responses:
-        return None
-
-    # Count how many times each sequence appears
+    # Count occurrences of each individual sequence across all attempts
     sequence_counts = {}
-    sequence_full_data = {}
-
-    for original, normalized in valid_responses:
-        if not isinstance(normalized, list):
-            continue
-
-        for seq in normalized:
-            variant_id = seq.get("variant_id", "")
-            aa_seq = seq.get("aa_seq", "")
-            # Clean sequence before using in key
-            aa_seq_clean = aa_seq.replace(" ", "").replace("\n", "").upper() if aa_seq else ""
-
-            # Create a unique key for this sequence
-            key = f"{variant_id}|{aa_seq_clean}"
-
-            if key not in sequence_counts:
-                sequence_counts[key] = 0
-                sequence_full_data[key] = []
-
-            sequence_counts[key] += 1
-
-            # Find the full data for this sequence from the original response
-            if isinstance(original, list):
-                for orig_seq in original:
-                    if (orig_seq.get("variant_id") == variant_id and
-                        orig_seq.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() == aa_seq_clean):
-                        sequence_full_data[key].append(orig_seq)
-                        break
-
-    # Filter sequences that appear in at least 2 attempts (40% of 5 attempts)
-    min_appearances = max(2, len(valid_responses) // 2)
-    consistent_sequences = []
-
-    for key, count in sequence_counts.items():
-        if count >= min_appearances:
-            # Use the first occurrence of the full data
-            if sequence_full_data[key]:
-                seq_data = sequence_full_data[key][0].copy()
-                # Add confidence based on how many times it appeared
-                seq_data["confidence"] = count / len(valid_responses)
-                seq_data["extraction_consistency"] = f"{count}/{len(valid_responses)} attempts"
-                consistent_sequences.append(seq_data)
+    for resp in valid_responses:
+        if isinstance(resp, list):
+            for seq in resp:
+                if isinstance(seq, dict) and "variant_id" in seq:
+                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    variant_id = seq.get("variant_id", "")
+                    aa_seq = seq.get("aa_seq", "")
+                    if aa_seq:
+                        aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
+                    key = f"{variant_id}|{aa_seq}"
+
+                    if key not in sequence_counts:
+                        sequence_counts[key] = {"count": 0, "data": seq}
+                    sequence_counts[key]["count"] += 1
+
+    # Build result with sequences that appear in at least 3 attempts
+    result = []
+    for key, info in sequence_counts.items():
+        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+            seq_data = info["data"].copy()
+            seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
+            result.append(seq_data)
+            log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
+
+    if result:
+        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        return result

-    return consistent_sequences if consistent_sequences else None
+    # If no sequences appear twice, return the most complete attempt
+    best_attempt = max(valid_responses, key=lambda x: len(x) if isinstance(x, list) else 0)
+    log.warning(f"No consensus sequences found, returning best attempt with {len(best_attempt)} sequences")
+    return best_attempt


-def _normalize_sequence_response(data: Any) -> List[Dict[str, Any]]:
-    """Normalize sequence response for comparison."""
-    if not isinstance(data, list):
-        return []
-
-    normalized = []
-    for item in data:
-        if isinstance(item, dict):
-            # Extract key fields for comparison
-            normalized_item = {
-                "variant_id": item.get("variant_id", ""),
-                "aa_seq": item.get("aa_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("aa_seq") else "",
-                "dna_seq": item.get("dna_seq", "").replace(" ", "").replace("\n", "").upper() if item.get("dna_seq") else "",
-                "confidence": item.get("confidence", 0.0)
-            }
-            normalized.append(normalized_item)
-
-    # Sort by variant_id for consistent comparison
-    return sorted(normalized, key=lambda x: x["variant_id"])
-
-
-def _sequences_match(seq1: List[Dict[str, Any]], seq2: List[Dict[str, Any]]) -> bool:
-    """Check if two sequence response lists match on key fields."""
-    if len(seq1) != len(seq2):
-        return False
-
-    for i, (s1, s2) in enumerate(zip(seq1, seq2)):
-        # Compare variant IDs
-        if s1.get("variant_id") != s2.get("variant_id"):
-            return False
-
-        # Compare amino acid sequences (most critical)
-        aa1 = s1.get("aa_seq", "")
-        aa2 = s2.get("aa_seq", "")
-        if aa1 and aa2 and aa1 != aa2:
-            return False
-        elif bool(aa1) != bool(aa2):  # One has sequence, other doesn't
-            return False
-
-        # Compare DNA sequences if present
-        dna1 = s1.get("dna_seq", "")
-        dna2 = s2.get("dna_seq", "")
-        if dna1 and dna2 and dna1 != dna2:
-            return False
-
-    return True


 def extract_sequences(text: str, model, *, debug_dir: str | Path | None = None, lineage_context: str = None, lineage_variants: List[Variant] = None) -> list[SequenceBlock]:
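One subtlety in the new control flow: on an early match, the loop pads responses with copies of the matching payload before breaking, which is what lets the shared counting stage below clear its count >= 3 cutoff without a separate early-return path. A tiny illustration of the arithmetic:

    responses = ["A", "A"]                 # two agreeing attempts (stand-in payloads)
    attempt, max_attempts = 1, 6           # match detected on the 2nd attempt (0-indexed)
    responses += [responses[-1]] * (max_attempts - attempt - 1)
    assert len(responses) == 6             # counted 6 times downstream, so >= 3 holds

The trade-off is that extraction_confidence then reports 6/6 even though only two real model calls agreed.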
@@ -2624,18 +2537,7 @@ Match sequences to these known variants when possible. Variants may be labeled d
     else:
         prompt = base_prompt

-    # Add mutation validation context if we have lineage variants with mutations
-    if lineage_variants:
-        mutation_context = _build_mutation_validation_context(lineage_variants)
-        if mutation_context:
-            prompt = f"""{prompt}
-
-CRITICAL MUTATION VALIDATION:
-{mutation_context}
-
-IMPORTANT: Double-check your sequence assignments by verifying mutations match the lineage relationships.
-For example, if variant "III" has mutation "A100V" from parent "II", then position 100 in sequence "III" must be V, and position 100 in sequence "II" must be A.
-"""
+    # Skip mutation validation context

     # Save the complete prompt for debugging
     if debug_dir:
@@ -2662,11 +2564,7 @@ For example, if variant "III" has mutation "A100V" from parent "II", then positi

     extracted_sequences = _parse_sequences(data)

-    # Post-process: validate sequences against mutations if we have lineage info
-    if lineage_variants:
-        validated_sequences = _validate_sequences_against_mutations(extracted_sequences, lineage_variants, model, debug_dir)
-        return validated_sequences
-
+    # Return extracted sequences without mutation validation
     return extracted_sequences

 # --- 7.4 JSON -> dataclass helpers -------------------------------------------
@@ -2701,6 +2599,19 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         aa = _clean_seq(entry.get("aa_seq"), _VALID_AA)
         dna = _clean_seq(entry.get("dna_seq"), _VALID_DNA)

+        # Check minimum length requirements
+        # AA sequences should be > 50, DNA sequences should be > 150
+        if aa and len(aa) <= 50:
+            log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
+            aa = None
+        if dna and len(dna) <= 150:
+            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+            dna = None
+
+        # Skip if both sequences are too short or missing
+        if not aa and not dna:
+            continue
+
         conf: float | None = None
         if aa:
             conf = sum(c in _VALID_AA for c in aa) / len(aa)
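The new thresholds treat anything at or below 50 aa or 150 nt as a fragment rather than a usable sequence (the comparison is <=, so the boundary values themselves are dropped). A toy check of the boundary behavior, with hypothetical values:

    aa = "M" * 50
    dna = "ATG" * 50                         # 150 nt
    keep_aa = bool(aa) and len(aa) > 50      # False: exactly 50 aa is dropped
    keep_dna = bool(dna) and len(dna) > 150  # False: exactly 150 nt is dropped

Variants failing both checks are now skipped outright instead of being emitted as empty SequenceBlocks.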
@@ -2943,12 +2854,15 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                    validate_sequences=True
                )

-            if focused_text and len(focused_text) < len(text):
-                log.info("Reduced text from %d to %d chars using validated location",
-                         len(text), len(focused_text))
-            else:
-                log.warning("Failed to reduce text size - focused_text length: %d, full text length: %d",
-                            len(focused_text) if focused_text else 0, len(text))
+            # Use focused text if we got any content, regardless of size
+            if focused_text:
+                if len(focused_text) < len(text):
+                    log.info("Reduced text from %d to %d chars using validated location",
+                             len(text), len(focused_text))
+                else:
+                    log.info("Extracted focused text (%d chars) from validated location (full text: %d chars)",
+                             len(focused_text), len(text))
+
                # Build lineage context if available
                lineage_context = None
                if lineage_variants:
@@ -2961,6 +2875,8 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                    lineage_context = "\n".join(variant_info)

                return extract_sequences(focused_text, model, debug_dir=debug_dir, lineage_context=lineage_context, lineage_variants=lineage_variants)
+            else:
+                log.warning("Failed to extract focused text from validated location, will use full text")
        else:
            log.warning("Location validation failed or returned invalid location: %s",
                        validation.get("reason", "Unknown"))
@@ -3113,12 +3029,6 @@ If you cannot determine certain fields, set them to null.
             seq = enzyme_info["wild_type_sequence"].upper().replace(" ", "").replace("\n", "")
             # Validate it looks like a protein sequence
             if seq and all(c in "ACDEFGHIKLMNPQRSTVWY*" for c in seq) and len(seq) > 50:
-                # Sanity check the sequence against known mutations
-                validated_seq = _validate_sequence_against_mutations(seq, variants, text, model)
-                if validated_seq:
-                    seq = validated_seq
-                    log.info(f"Sequence validated and potentially corrected by Gemini")
-
                 # Map to the first variant or wild-type
                 wt_variant = next((v for v in variants if "WT" in v.variant_id.upper() or v.generation == 0), None)
                 if wt_variant:
@@ -3422,7 +3332,7 @@ def _merge_lineage_and_sequences(
     log.info(f"After direct merge: {merged_aa} variants with aa_seq, {merged_dna} with dna_seq")

     # 3. If we have unmatched sequences and a model, use Gemini to match
-    if model and len(df_seq) > 0 and df['aa_seq'].isna().any():
+    if model and len(df_seq) > 0 and (df['aa_seq'].isna().any() or df['dna_seq'].isna().any()):
         # Find unmatched entries - consider entries missing if they lack BOTH aa_seq and dna_seq
         missing_seq = df['aa_seq'].isna() & df['dna_seq'].isna()
         unmatched_lineage_ids = df[missing_seq]['variant_id'].tolist()
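A minimal illustration of the widened trigger with a toy frame (hypothetical data): under 0.4.4, a frame whose aa_seq column was fully populated never started Gemini matching, even if DNA was missing; 0.5.0 also fires on the DNA side:

    import pandas as pd

    df = pd.DataFrame({"variant_id": ["v1", "v2"],
                       "aa_seq": ["MSK...", "MTE..."],   # toy placeholders
                       "dna_seq": [None, "ATG..."]})
    df['aa_seq'].isna().any()                                  # False: 0.4.4 skipped matching
    df['aa_seq'].isna().any() or df['dna_seq'].isna().any()    # True:  0.5.0 proceeds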
@@ -3437,14 +3347,9 @@ def _merge_lineage_and_sequences(
         log.info("Using Gemini to match variants")

         # Build prompt for Gemini
-        prompt = f"""Match enzyme variant IDs between two lists from the same paper.
+        prompt = f"""Match enzyme variant IDs between two lists from the same paper using your best judgment.

-Papers often use different naming conventions for the same variant:
-- Lineage sections may use numeric IDs (e.g., "5295") or IDs with parenthetical numbers (e.g., "ᴅ-G0 (5308)")
-- Sequence sections may use descriptive names (e.g., "ʟ-ApPgb-αEsA-G0", "ᴅ-ApPgb-αEsA-G0")
-
-Match variants by analyzing generation numbers, prefixes, and patterns. Some variant id are clearly mutations from a parent,
-use your best judgement to not match mutations to a parent even though they might share a substring in the variant id.
+These IDs come from different sections of the paper and may use different naming conventions for the same variant.

 Lineage variant IDs (need sequences):
 {json.dumps(unmatched_lineage_ids)}
@@ -3452,8 +3357,13 @@ Lineage variant IDs (need sequences):

 Sequence variant IDs (have sequences):
 {json.dumps(unmatched_seqs['variant_id'].tolist())}
+IMPORTANT: A variant with mutations (indicated by mutation codes like letters and numbers after an underscore or space) is a DIFFERENT enzyme from its parent. Do not match mutation variants to their base sequences - they are distinct entities with different sequences due to the mutations.
+
+Only match variants that represent the SAME enzyme, accounting for different naming conventions between sections.
+
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
+Only include matches you are confident represent the same variant.
 """

         try:
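Assuming the model's reply parses to the requested {"lineage_id": "sequence_id"} object, applying the mapping could look like the following sketch. Variable names mirror the surrounding merge code, but the parsing and assignment details here are illustrative:

    matches = json.loads(match_reply_text)  # hypothetical: the model's JSON reply
    for lineage_id, seq_id in matches.items():
        src = df_seq[df_seq["variant_id"] == seq_id]
        if not src.empty:
            mask = df["variant_id"] == lineage_id
            df.loc[mask, "aa_seq"] = src.iloc[0]["aa_seq"]
            df.loc[mask, "dna_seq"] = src.iloc[0]["dna_seq"]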
@@ -3733,16 +3643,27 @@ def run_pipeline(
     # 4. Extract sequences (Section 7) ----------------------------------------
     sequences = get_sequences(full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir, lineage_variants=lineage)

-    # 4a. Try PDB extraction if no sequences found -----------------------------
-    # Check if we need PDB sequences (no sequences or only partial sequences)
-    MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
-    needs_pdb = (not sequences or
-                 all(s.aa_seq is None or (s.aa_seq and len(s.aa_seq) < MIN_PROTEIN_LENGTH)
-                     for s in sequences))
+    # 4a. First try to merge extracted sequences with lineage using Gemini matching
+    # This allows fuzzy matching of complex variant IDs before external lookups
+    doi = extract_doi(manuscript)
+    df_merged = merge_and_score(lineage, sequences, doi, model)
+
+    # 4b. Check if ALL variants are missing sequences after merging
+    # Only try external sources if no sequences were successfully matched
+    all_missing_sequences = True
+    if 'aa_seq' in df_merged.columns or 'dna_seq' in df_merged.columns:
+        for _, row in df_merged.iterrows():
+            has_aa = pd.notna(row.get('aa_seq'))
+            has_dna = pd.notna(row.get('dna_seq'))
+            if has_aa or has_dna:
+                all_missing_sequences = False
+                break

-    if needs_pdb:
-        log.info("No full-length sequences found in paper (only partial sequences < %d AA), attempting PDB extraction...",
-                 MIN_PROTEIN_LENGTH)
+    if all_missing_sequences:
+        MIN_PROTEIN_LENGTH = 50  # Most proteins are >50 AA
+        MIN_DNA_LENGTH = 150  # DNA sequences should be >150 nt
+        log.info("No full-length sequences found in paper (only partial sequences < %d AA or < %d nt), attempting PDB extraction...",
+                 MIN_PROTEIN_LENGTH, MIN_DNA_LENGTH)

         # Extract PDB IDs from all PDFs
         pdb_ids = []
@@ -3780,7 +3701,13 @@ def run_pipeline(
                         log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")

                 if pdb_seq_blocks:
-                    sequences = pdb_seq_blocks
+                    # Update the dataframe with PDB sequences
+                    for seq_block in pdb_seq_blocks:
+                        mask = df_merged['variant_id'] == seq_block.variant_id
+                        if mask.any():
+                            df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                            df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                            df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
                     log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                     break
                 else:
@@ -3788,8 +3715,15 @@ def run_pipeline(
         else:
             log.warning("No PDB IDs found in paper")

-    # 4b. If still no sequences, try Gemini extraction as last resort
-    if not sequences or all(not s.aa_seq for s in sequences):
+    # 4c. If still no sequences after PDB, try Gemini extraction as last resort
+    # Re-check if all variants are still missing sequences
+    still_all_missing = True
+    for _, row in df_merged.iterrows():
+        if pd.notna(row.get('aa_seq')) or pd.notna(row.get('dna_seq')):
+            still_all_missing = False
+            break
+
+    if still_all_missing:
         log.info("No sequences from PDB, attempting Gemini-based extraction...")

         gemini_sequences = extract_enzyme_info_with_gemini(full_text, lineage, model)
@@ -3813,14 +3747,19 @@ def run_pipeline(
                     log.info(f"Added sequence for {variant.variant_id} via Gemini/UniProt: {len(seq)} residues")

         if gemini_seq_blocks:
-            sequences = gemini_seq_blocks
+            # Update the dataframe with Gemini/UniProt sequences
+            for seq_block in gemini_seq_blocks:
+                mask = df_merged['variant_id'] == seq_block.variant_id
+                if mask.any():
+                    df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
+                    df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
+                    df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'Gemini/UniProt')
             log.info(f"Successfully extracted {len(gemini_seq_blocks)} sequences via Gemini")
         else:
             log.warning("Failed to extract sequences via Gemini")

-    # 5. Merge & score (Section 8) --------------------------------------------
-    doi = extract_doi(manuscript)
-    df_final = merge_and_score(lineage, sequences, doi, model)
+    # 5. Use the merged dataframe (already merged above)
+    df_final = df_merged

     # 6. Write FINAL CSV -------------------------------------------------------
     if output_csv:
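Taken together, the run_pipeline hunks reorder the sequence stage: merge first, fall back to external sources only when nothing at all matched, and patch the merged frame in place instead of swapping out the sequences list. Condensed, with hypothetical helper names standing in for the inline row scans and update loops shown above:

    sequences = get_sequences(full_text, model, pdf_paths=pdf_paths,
                              debug_dir=debug_dir, lineage_variants=lineage)
    df_merged = merge_and_score(lineage, sequences, extract_doi(manuscript), model)
    if all_rows_lack_sequences(df_merged):       # hypothetical predicate
        fill_from_pdb(df_merged)                 # writes aa_seq/seq_confidence/seq_source in place
    if all_rows_lack_sequences(df_merged):
        fill_from_gemini_uniprot(df_merged)
    df_final = df_merged                         # step 5 no longer re-runs merge_and_score

One consequence worth flagging: because both fallbacks trigger only when every variant is missing both sequences, a paper where even a single variant matched will skip the PDB and Gemini/UniProt lookups entirely.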