debase 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -465,7 +465,7 @@ def get_model():
     "temperature": 0.0,  # Deterministic: always pick the most likely token
     "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
     "top_k": 1,  # Only consider the single most likely token
-    "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+    "max_output_tokens": 65536,  # Doubled to handle larger lineage tables and sequences
 }

 # For Gemini 2.5 Flash, disable thinking tokens to save costs
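For context, this dict is the generation_config handed to the Gemini client when the model is constructed; a minimal sketch assuming the google-generativeai package (the model name and API-key handling are illustrative, not taken from debase):

```python
# Minimal sketch, assuming the google-generativeai client; the model name and
# API-key handling are illustrative assumptions, not debase's actual code.
import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")
model = genai.GenerativeModel(
    "gemini-2.5-flash",
    generation_config={
        "temperature": 0.0,   # deterministic decoding
        "top_p": 1.0,
        "top_k": 1,
        "max_output_tokens": 65536,  # the new 0.6.2 limit
    },
)
```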
@@ -758,13 +758,24 @@ mutations were introduced){campaign_specific}. Pay attention to the provided con
 ensure the locations you return are actually lineage locations with variants and mutations.

 Respond with a JSON array of objects, each containing:
-- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
 - "type": one of "table", "figure", "section"
 - "confidence": your confidence score (0-100) that this location contains lineage data
 - "reason": brief explanation of why this location likely contains lineage
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
 {campaign_field}
-IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
-NOT page numbers. Focus on the actual figure/table titles and numbers.
+CRITICAL INSTRUCTIONS:
+1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+   - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+   - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+   - This should be the complete caption as it appears in the document
+   - Include at least 200-300 characters to ensure unique matching
+3. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Items like "Table S1", "Figure S2", etc. are typically in the SI
+   - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+   - If uncertain, use context clues from the text

 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figures showing complete variant lineages.
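A sketch of how a caller might consume the JSON contract above; the helper name and sample payload are invented for illustration:

```python
# Hypothetical consumer of the location-identification response; the field
# names follow the prompt's contract, everything else is made up.
import json

def rank_locations(raw_response: str) -> list:
    locations = json.loads(raw_response)
    # Highest-confidence locations first, mirroring the ordering the prompt requests
    return sorted(locations, key=lambda loc: loc.get("confidence", 0), reverse=True)

sample = ('[{"location": "Table S1.", "type": "table", "confidence": 95, '
          '"reason": "Variant lineage table", "source": "si", '
          '"caption": "Table S1. Summary of mutations..."}]')
print(rank_locations(sample)[0]["location"])  # -> Table S1.
```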
@@ -774,9 +785,9 @@ Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+  {{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
+  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
 ]
 """.strip()

@@ -1461,10 +1472,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

-def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
-    """Extract text from a specific location (table, section, etc.) in the full text."""
+def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text.
+
+    Args:
+        full_text: The full text to search in
+        location: The location identifier (e.g., "Table S1")
+        location_type: Type of location ("table", "figure", "section")
+        caption_hint: Optional full caption text for fuzzy matching
+    """
     import re

+    # If a caption hint is provided, try fuzzy matching first
+    if caption_hint and len(caption_hint) > 20:
+        log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+        # Normalize texts for better matching (similar to reaction_info_extractor)
+        def normalize_for_matching(text):
+            # Remove extra whitespace, normalize spaces around punctuation
+            text = ' '.join(text.split())
+            # Normalize different dash types
+            text = text.replace('–', '-').replace('—', '-')
+            return text
+
+        normalized_hint = normalize_for_matching(caption_hint[:150])  # Use first 150 chars
+        normalized_text = normalize_for_matching(full_text)
+
+        # Try to find ALL caption matches using character-based fuzzy matching
+        all_matches = []
+
+        # Slide through the text looking for all matches above threshold
+        hint_len = len(normalized_hint)
+        for i in range(len(normalized_text) - hint_len + 1):
+            snippet = normalized_text[i:i + hint_len]
+            # Simple character-based similarity
+            matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+            score = matches / hint_len
+
+            if score > 0.7:  # 70% similarity threshold
+                all_matches.append({
+                    'norm_pos': i,
+                    'score': score
+                })
+
+        # If we found matches, extract from all of them
+        if all_matches:
+            log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+            # Collect all occurrences from fuzzy matches
+            all_occurrences = []
+            seen_positions = set()
+
+            for match_info in all_matches:
+                # Get the matched text from the normalized version
+                matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+                # Find where this appears in the original text
+                best_original_pos = -1
+
+                # Search in the original text for this specific match
+                for i in range(len(full_text) - len(caption_hint) + 1):
+                    if i in seen_positions:
+                        continue
+
+                    original_snippet = full_text[i:i + len(caption_hint)]
+                    # Normalize and compare
+                    normalized_snippet = normalize_for_matching(original_snippet)
+                    if normalized_snippet[:hint_len] == matched_normalized:
+                        # Found exact match after normalization
+                        best_original_pos = i
+                        seen_positions.add(i)
+                        break
+
+                if best_original_pos >= 0:
+                    # Extract generous context from this match position
+                    start = max(0, best_original_pos - 1000)
+                    end = min(len(full_text), best_original_pos + 10000)
+                    context = full_text[start:end]
+
+                    all_occurrences.append({
+                        'position': best_original_pos,
+                        'context': context,
+                        'score': match_info['score']
+                    })
+                    log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+            if all_occurrences:
+                # Sort by position to maintain document order
+                all_occurrences.sort(key=lambda x: x['position'])
+
+                # Combine all occurrences
+                combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+                for i, occurrence in enumerate(all_occurrences, 1):
+                    combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+                    combined_text += occurrence['context']
+                    combined_text += "\n\n"
+
+                # Apply the same limit as table extraction
+                if len(combined_text) > 150000:
+                    combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+                log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+                return combined_text
+            else:
+                log.warning("Could not map any fuzzy matches back to original text")
+        else:
+            log.warning("No fuzzy matches found for caption above 70% threshold")
+
     if location_type == 'table':
         # Find ALL mentions of this table and combine them
         location_clean = location.strip()
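For intuition, the character-overlap score in the caption_hint branch above is just a sliding window over the normalized text; a self-contained toy run (caption and body text invented):

```python
# Toy demonstration of the sliding-window character similarity used in the
# caption_hint branch above; the strings are invented for illustration.
def normalize(text: str) -> str:
    text = " ".join(text.split())
    return text.replace("\u2013", "-").replace("\u2014", "-")

hint = normalize("Table S1. Summary of mutations introduced during evolution")
body = normalize("...as shown in Table S1.  Summary of mutations introduced "
                 "during evolution of the enzyme, activity improved...")

best_score, best_pos = max(
    (sum(a == b for a, b in zip(hint, body[i:i + len(hint)])) / len(hint), i)
    for i in range(len(body) - len(hint) + 1)
)
print(f"best similarity {best_score:.2f} at offset {best_pos}")  # clears the 0.7 threshold
```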
@@ -1506,6 +1621,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->

         log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")

+        # Sort occurrences by position to maintain document order
+        all_occurrences.sort(key=lambda x: x['position'])
+
         # Combine all occurrences into one text for Gemini to analyze
         combined_text = f"=== All occurrences of {location_clean} ===\n\n"

@@ -1515,8 +1633,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
             combined_text += "\n\n"

         # Limit total length to avoid overwhelming the model
-        if len(combined_text) > 50000:
-            combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+        # Increased limit to ensure actual table content is included
+        if len(combined_text) > 150000:
+            combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"

         return combined_text

@@ -1600,6 +1719,8 @@ def get_lineage(
     *,
     pdf_paths: Optional[List[Path]] = None,
     debug_dir: str | Path | None = None,
+    manuscript_text: Optional[str] = None,
+    si_text: Optional[str] = None,
 ) -> Tuple[List[Variant], List[Campaign]]:
     """
     High-level wrapper used by the pipeline.
@@ -1713,8 +1834,21 @@
         if location_type in ['table', 'text', 'section'] and not extracted_variants:
             log.info(f"Attempting text extraction for {location_type}: {location_str}")

-            # Extract the specific section/table from full text
-            section_text = _extract_location_text(full_text, location_str, location_type)
+            # Determine which text to use based on source
+            location_source = location.get('source', 'manuscript')
+            if location_source == 'si' and si_text:
+                text_to_search = si_text
+                log.info(f"Using SI text for location {location_str}")
+            elif location_source == 'manuscript' and manuscript_text:
+                text_to_search = manuscript_text
+                log.info(f"Using manuscript text for location {location_str}")
+            else:
+                text_to_search = full_text
+                log.info(f"Using combined text for location {location_str} (fallback)")
+
+            # Extract the specific section/table from the appropriate text
+            caption_hint = location.get('caption', '')
+            section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
             if section_text:
                 log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
                 # Save extracted section if debug enabled
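The location dicts consumed here come from the identification prompt shown earlier; an illustrative shape (values invented) and the resulting routing decision:

```python
# Illustrative location dict; values are made up. The 'source' field routes
# extraction to si_text or manuscript_text, falling back to the combined text.
si_text, full_text = "...SI text...", "...combined text..."  # stand-ins

location = {
    "location": "Table S1.",
    "type": "table",
    "confidence": 95,
    "source": "si",
    "caption": "Table S1. Summary of mutations introduced during directed evolution...",
}
text_to_search = si_text if location.get("source") == "si" and si_text else full_text
```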
@@ -2028,17 +2162,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.

 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
-- "section": the section heading or description
+- "section": the section heading or description EXACTLY as it appears
 - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)

 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences

-CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
-- Correct: "53", "S12", "147"
-- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+CRITICAL:
+1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+   - Correct: "53", "S12", "147"
+   - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+2. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Pages with "S" prefix (e.g., "S53") are typically in the SI
+   - Regular page numbers (e.g., "53") are typically in the main manuscript
+   - Use context clues from the document structure

 Return [] if no sequence sections are found.
 Absolutely don't include nucleotide or primer sequences; it is better to return nothing than an incomplete sequence. Use your best judgement.
@@ -2278,44 +2419,34 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:

 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract EVERY distinct enzyme-variant sequence you can find in the text.
-
-IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
-- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
-- Only extract dna_seq if NO amino acid sequence is available for that variant
-- This reduces redundancy since protein sequences are usually more relevant
-
-CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
-- Papers often use different naming conventions in different sections
-- DO NOT normalize or simplify variant IDs
-- Extract the variant_id exactly as written where the sequence appears
-- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
-
-SEQUENCE EXTRACTION RULES:
-- Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids, or nucleotides
-- Preserve the exact length and character sequence
-- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids or nucleotides are copied correctly
-
-For each variant return:
-  * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
-  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
-
-Respond ONLY with **minified JSON** that matches the schema below.
-NO markdown, no code fences, no commentary.
-
-Schema:
-```json
-{schema}
-```
+Extract ALL enzyme variant sequences from the text.
+
+Rules:
+1. Use EXACT variant IDs as they appear with each sequence
+2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
+3. For each variant:
+   - If amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
+   - If ONLY DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
+   - NEVER include both aa_seq and dna_seq for the same variant
+   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
+4. Return ONLY minified JSON, no markdown or commentary
+
+CRITICAL SEQUENCE PRIORITY RULE:
+- If you find BOTH amino acid sequence AND DNA sequence for the same variant, ONLY return the amino acid sequence
+- Set dna_seq to null when aa_seq is available, even if DNA sequence is present in the text
+- Only return dna_seq when NO amino acid sequence exists for that variant
+
+CRITICAL ACCURACY REQUIREMENTS:
+- Extract ONLY sequences that are explicitly present in the provided text
+- DO NOT generate, infer, or hallucinate any sequences
+- Every character in the sequence must be directly copied from the text
+- If a sequence appears truncated or incomplete in the text, extract only what is shown
+- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+
+Schema: {schema}

-TEXT (may be truncated):
-```
+TEXT:
 {text}
-```
 """.strip()

  def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
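For reference, a response shaped the way this prompt demands, minified JSON with one of aa_seq/dna_seq per variant; the variant IDs and sequences below are invented:

```python
# Illustrative only: variant IDs and sequences are invented. Shows the
# aa_seq-XOR-dna_seq shape and the minified JSON the prompt requires.
import json

example = [
    {"variant_id": "PA-G8", "aa_seq": "MTAYKLVINGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE", "dna_seq": None},
    {"variant_id": "Round4-12", "aa_seq": None, "dna_seq": "ATGACCGCATACAAACTGGTCATTAACGGCAAA"},
]
print(json.dumps(example, separators=(",", ":")))  # null where no sequence of that type exists
```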
@@ -2390,7 +2521,7 @@ _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list,


 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+    """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.

     Can exit early after 2 attempts if the responses match exactly.

@@ -2404,9 +2535,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 6
+    max_attempts = 3  # Reduced from 6 to 3 for performance

-    # Try 6 times with early match detection
+    # Try 3 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2432,8 +2563,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s

             # Try to parse as JSON
             try:
-                parsed = json.loads(raw)
-            except json.JSONDecodeError:
+                # First clean the response - remove any BOM or invisible characters
+                raw_clean = raw.strip()
+                if raw_clean.startswith('\ufeff'):  # Remove BOM if present
+                    raw_clean = raw_clean[1:]
+                parsed = json.loads(raw_clean)
+            except json.JSONDecodeError as e:
+                log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
                 # Look for JSON array or object in the response
                 json_start = -1
                 json_end = -1
@@ -2482,17 +2618,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             responses.append(parsed)
             log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")

-            # Early match detection after 2 attempts
-            if attempt >= 1:  # After 2nd attempt (0-indexed)
-                valid_responses_so_far = [r for r in responses if r is not None]
-                if len(valid_responses_so_far) >= 2:
-                    # Check if the last two valid responses match
-                    if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
-                        log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
-                        # Add the matching response 4 more times to simulate consensus
-                        for _ in range(max_attempts - attempt - 1):
-                            responses.append(valid_responses_so_far[-1])
-                        break
+            # If we got a good response with sequences, we can check for early termination
+            if isinstance(parsed, list) and len(parsed) > 0:
+                # Early match detection after 2 attempts
+                if attempt >= 1:  # After 2nd attempt (0-indexed)
+                    valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+                    if len(valid_responses_so_far) >= 2:
+                        # Check if the last two valid responses match
+                        if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                            log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                            # Add the matching response to fill remaining attempts
+                            for _ in range(max_attempts - attempt - 1):
+                                responses.append(valid_responses_so_far[-1])
+                            break
+                # If this is the first attempt and we got sequences, continue to validate with at least one more
+                elif attempt == 0 and len(parsed) > 5:  # Got substantial sequences on first try
+                    log.info("Got substantial sequences on first attempt, will validate with one more")

         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
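The function above keeps collecting responses and ultimately picks the most common one; a rough standalone sketch of that consensus step (helper name is ours, not debase's exact code):

```python
# Rough sketch of majority-vote consensus over repeated JSON extractions,
# assuming each response is JSON-serializable; not debase's exact code.
import json
from collections import Counter

def most_common_response(responses):
    keyed = [json.dumps(r, sort_keys=True) for r in responses if r is not None]
    if not keyed:
        return None
    winner, _ = Counter(keyed).most_common(1)[0]
    return json.loads(winner)

print(most_common_response([[{"id": "A"}], [{"id": "A"}], [{"id": "B"}]]))  # -> [{'id': 'A'}]
```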
@@ -2852,9 +2993,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
     focused_text = ""
     if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
         page_num = best_location['page']
-        # Extract current page plus next 15 pages
+        # Extract current page plus next 5 pages (6 total) to prevent hallucination
         all_pages = []
-        for i in range(16):  # Current + next 15
+        for i in range(6):  # Current + next 5 (6 pages total)
             if isinstance(page_num, str) and page_num.upper().startswith('S'):
                 next_page = f"S{int(page_num[1:]) + i}"
             else:
@@ -2866,7 +3007,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                 break
         if all_pages:
             focused_text = "\n".join(all_pages)
-            log.info("Extracted %d chars from pages %s through %d more pages",
+            log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
                      len(focused_text), page_num, len(all_pages) - 1)

     # Fallback to text search if page extraction didn't work
@@ -3152,6 +3293,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
     return {}


+def _match_variant_ids_with_gemini(
+    lineage_variant_ids: List[str],
+    pdb_variant_ids: List[str],
+    model
+) -> Dict[str, str]:
+    """Use Gemini to match variant IDs that may have slight formatting differences.
+
+    Args:
+        lineage_variant_ids: List of variant IDs from the lineage
+        pdb_variant_ids: List of variant IDs from PDB matching
+        model: Gemini model for matching
+
+    Returns:
+        Dictionary mapping lineage_variant_id -> pdb_variant_id
+    """
+    if not lineage_variant_ids or not pdb_variant_ids or not model:
+        return {}
+
+    # If the lists are identical, return a direct mapping
+    if set(lineage_variant_ids) == set(pdb_variant_ids):
+        return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+    # Use Gemini to match variant IDs that may have formatting differences
+    prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+These represent the same enzyme variants but may be formatted differently.
+
+Lineage variant IDs:
+{json.dumps(lineage_variant_ids, indent=2)}
+
+PDB variant IDs:
+{json.dumps(pdb_variant_ids, indent=2)}
+
+Match variants that represent the SAME enzyme variant, accounting for:
+- Whitespace differences (extra spaces, tabs)
+- Character encoding differences
+- Minor formatting variations
+
+Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+Format: {{"lineage_id": "pdb_id", ...}}
+Only include matches you are confident represent the same variant.
+Return an empty object {{}} if no matches can be confidently made.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text.startswith("```"):
+            text = text.split("```")[1].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        # Clean up the text
+        text = text.strip()
+        if not text or text == "{}":
+            return {}
+
+        matches = json.loads(text)
+        log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+        # Validate matches
+        valid_matches = {}
+        for lineage_id, pdb_id in matches.items():
+            if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+                valid_matches[lineage_id] = pdb_id
+                log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+            else:
+                log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+        return valid_matches
+
+    except Exception as e:
+        log.warning(f"Failed to match variant IDs with Gemini: {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
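A sketch of how the new helper would be invoked; the IDs differ only in whitespace, which is exactly the mismatch it is meant to reconcile (values invented, model assumed configured):

```python
# Illustrative call with invented IDs; 'model' is assumed to be a configured
# Gemini model object as used elsewhere in the pipeline.
mapping = _match_variant_ids_with_gemini(
    lineage_variant_ids=["ApePgb GLVRSQL", "Round4-12"],
    pdb_variant_ids=["ApePgb  GLVRSQL"],  # note the double space
    model=model,
)
# Expected: {"ApePgb GLVRSQL": "ApePgb  GLVRSQL"}
```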
@@ -3235,24 +3453,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
         text = _extract_text(response).strip()

         # Parse JSON response (expecting a single string)
-        if text.startswith("```"):
+        # Look for JSON code blocks first
+        if "```json" in text:
+            # Extract content between ```json and ```
+            import re
+            json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+            if json_match:
+                json_content = json_match.group(1).strip()
+                try:
+                    # Parse as JSON and extract the string value
+                    parsed = json.loads(json_content)
+                    matched_variant = str(parsed).strip('"\'')
+                except:
+                    # If JSON parsing fails, try to extract the quoted string
+                    quoted_match = re.search(r'"([^"]+)"', json_content)
+                    if quoted_match:
+                        matched_variant = quoted_match.group(1)
+                    else:
+                        matched_variant = json_content.strip('"\'')
+            else:
+                matched_variant = text.strip('"\'')
+        elif text.startswith("```"):
+            # Handle other code blocks
             text = text.split("```")[1].strip()
             if text.startswith("json"):
                 text = text[4:].strip()
+            matched_variant = text.strip('"\'')
+        else:
+            # Look for quoted strings in the response
+            import re
+            quoted_match = re.search(r'"([^"]+)"', text)
+            if quoted_match:
+                matched_variant = quoted_match.group(1)
+            else:
+                # Remove quotes if present
+                matched_variant = text.strip('"\'')

-        # Remove quotes if present
-        text = text.strip('"\'')
-
-        matched_variant = text
+        log.info(f"Extracted variant name: '{matched_variant}' from response")
         log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")

         # Return mapping with all chains pointing to the same variant
         mapping = {}
-        if matched_variant and any(v.variant_id == matched_variant for v in variants):
-            for chain_id in pdb_sequences:
-                mapping[matched_variant] = chain_id
-                break  # Only use the first chain
+        if matched_variant:
+            # Debug logging
+            variant_ids = [v.variant_id for v in variants]
+            log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+            # Check if the matched variant exists in the lineage
+            found_variant = any(v.variant_id == matched_variant for v in variants)
+            log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+            if found_variant:
+                for chain_id in pdb_sequences:
+                    mapping[matched_variant] = chain_id
+                    log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+                    break  # Only use the first chain
+            else:
+                log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+                # Try fuzzy matching
+                for variant in variants:
+                    if variant.variant_id.strip() == matched_variant.strip():
+                        log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+                        for chain_id in pdb_sequences:
+                            mapping[variant.variant_id] = chain_id
+                            log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+                            break
+                        break
+        else:
+            log.warning("No matched variant extracted from response")

+        log.info(f"Final mapping result: {mapping}")
         return mapping

     except Exception as e:
@@ -3634,14 +3904,28 @@ def run_pipeline(
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)

+    # Also load separate texts for manuscript and SI
+    manuscript_text = limited_concat(manuscript) if manuscript else None
+    si_text = limited_concat(si_path) if si_path else None
+
     log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
              len(caption_text), len(full_text))
+    if manuscript_text:
+        log.info("Loaded %d chars from manuscript", len(manuscript_text))
+    if si_text:
+        log.info("Loaded %d chars from SI", len(si_text))

     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()

     # 3. Extract lineage (Section 6) ------------------------------------------
-    lineage, campaigns = get_lineage(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+    lineage, campaigns = get_lineage(
+        caption_text, full_text, model,
+        pdf_paths=pdf_paths,
+        debug_dir=debug_dir,
+        manuscript_text=manuscript_text,
+        si_text=si_text
+    )

     if not lineage:
         raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3721,12 +4005,40 @@ def run_pipeline(
                     pdb_sequences, lineage, full_text, model, pdb_id
                 )

+                log.info(f"PDB matching result: {variant_to_chain}")
+                log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+                log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
                 # Convert to SequenceBlock objects
                 pdb_seq_blocks = []
-                for variant in lineage:
-                    if variant.variant_id in variant_to_chain:
-                        chain_id = variant_to_chain[variant.variant_id]
-                        if chain_id in pdb_sequences:
+
+                # Use Gemini-based matching for robust variant ID comparison
+                if variant_to_chain and model:
+                    # Create a mapping using Gemini for robust string matching
+                    gemini_mapping = _match_variant_ids_with_gemini(
+                        lineage_variant_ids=[v.variant_id for v in lineage],
+                        pdb_variant_ids=list(variant_to_chain.keys()),
+                        model=model
+                    )
+
+                    for variant in lineage:
+                        log.info(f"Processing variant: {variant.variant_id}")
+
+                        # Try a direct match first
+                        chain_id = variant_to_chain.get(variant.variant_id)
+                        log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+                        # If no direct match, try Gemini-based matching
+                        if not chain_id:
+                            matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+                            log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+                            if matched_pdb_variant:
+                                chain_id = variant_to_chain.get(matched_pdb_variant)
+                                log.info(f"Chain ID from Gemini match: {chain_id}")
+
+                        if chain_id and chain_id in pdb_sequences:
+                            seq_length = len(pdb_sequences[chain_id])
+                            log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
                             seq_block = SequenceBlock(
                                 variant_id=variant.variant_id,
                                 aa_seq=pdb_sequences[chain_id],
@@ -3737,6 +4049,26 @@ def run_pipeline(
                             )
                             pdb_seq_blocks.append(seq_block)
                             log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+                        else:
+                            log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+                else:
+                    # Fallback to direct matching if no model or no matches
+                    for variant in lineage:
+                        if variant.variant_id in variant_to_chain:
+                            chain_id = variant_to_chain[variant.variant_id]
+                            if chain_id in pdb_sequences:
+                                seq_block = SequenceBlock(
+                                    variant_id=variant.variant_id,
+                                    aa_seq=pdb_sequences[chain_id],
+                                    dna_seq=None,
+                                    confidence=1.0,  # High confidence for PDB sequences
+                                    truncated=False,
+                                    metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+                                )
+                                pdb_seq_blocks.append(seq_block)
+                                log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+                log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")

                 if pdb_seq_blocks:
                     # Update the dataframe with PDB sequences
@@ -3746,8 +4078,13 @@ def run_pipeline(
                             df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
                             df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
                             df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+                            log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+                        else:
+                            log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
                     log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                     break
+                else:
+                    log.warning(f"No PDB sequence blocks were created for {pdb_id}")
             else:
                 log.warning(f"No sequences found in PDB {pdb_id}")
         else: