debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,13 @@ import fitz
 import re
 import json
 import time
+
+# Import universal caption pattern
+try:
+    from .caption_pattern import get_universal_caption_pattern
+except ImportError:
+    # Fallback if running as standalone script
+    from caption_pattern import get_universal_caption_pattern
 import logging
 from pathlib import Path
 from dataclasses import dataclass, field
@@ -113,17 +120,8 @@ _DOI_REGEX = re.compile(r"10\.[0-9]{4,9}/[-._;()/:A-Z0-9]+", re.I)
 # PDB ID regex - matches 4-character PDB codes
 _PDB_REGEX = re.compile(r"\b[1-9][A-Z0-9]{3}\b")
 
-# Improved caption prefix regex - captures most journal variants
-_CAPTION_PREFIX_RE = re.compile(
-    r"""
-    ^\s*
-    (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
-       Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table))  # label part
-    \s*(?:S?\d+[A-Za-z]?|[IVX]+)                               # figure number
-    [.:]?\s*                                                   # trailing punctuation/space
-    """,
-    re.I | re.X,
-)
+# Use universal caption pattern
+_CAPTION_PREFIX_RE = get_universal_caption_pattern()
 
 
 def _open_doc(pdf_path: str | Path | bytes):
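
The caption_pattern module itself is not included in this diff, so its exact contents are unknown. As a hedged sketch, a universal pattern consolidating the journal-specific regex removed above might look like the following (the function name comes from the import; the pattern body is illustrative, not the shipped code):

# caption_pattern.py - hypothetical sketch, not the module shipped in the wheel
import re

def get_universal_caption_pattern() -> re.Pattern:
    """One compiled, case-insensitive regex for figure/table/scheme caption prefixes."""
    return re.compile(
        r"""
        ^\s*
        (?:Fig(?:ure)?|Extended\s+Data\s+Fig|ED\s+Fig|Scheme|Chart|
           Table|Supp(?:lementary|l|\.?)\s+(?:Fig(?:ure)?|Table))   # label
        \s*(?:S?\d+[A-Za-z]?|[IVX]+)                                # number
        [.:|]?\s*                                                   # trailing punctuation
        """,
        re.I | re.X,
    )

Centralizing the pattern keeps location identification and the new caption fuzzy matching (below) agreeing on what counts as a caption prefix.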
@@ -467,7 +465,7 @@ def get_model():
        "temperature": 0.0,  # Deterministic: always pick the most likely token
        "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
        "top_k": 1,  # Only consider the single most likely token
-       "max_output_tokens": 32768,  # Increased from 8192 to handle larger sequence extractions
+       "max_output_tokens": 65536,  # Doubled to handle larger lineage tables and sequences
    }
 
    # For Gemini 2.5 Flash, disable thinking tokens to save costs
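
For reference, a config like this plugs into the google-generativeai SDK roughly as below (a minimal sketch; the model name and API-key handling are placeholders, not taken from this diff):

import os
import google.generativeai as genai

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# temperature=0 plus top_k=1 makes reruns as deterministic as the API allows,
# and the doubled max_output_tokens leaves room for long lineage tables.
model = genai.GenerativeModel(
    "gemini-2.5-flash",  # placeholder model name
    generation_config={
        "temperature": 0.0,
        "top_p": 1.0,
        "top_k": 1,
        "max_output_tokens": 65536,
    },
)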
@@ -760,13 +758,24 @@ mutations were introduced){campaign_specific}. Pay attention to the provided con
 ensure the locations you return are actually lineage locations with variants and mutations.
 
 Respond with a JSON array of objects, each containing:
-- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
- "type": one of "table", "figure", "section"
- "confidence": your confidence score (0-100) that this location contains lineage data
- "reason": brief explanation of why this location likely contains lineage data
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or the supplementary information
+- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
 {campaign_field}
-IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
-NOT page numbers. Focus on the actual figure/table titles and numbers.
+CRITICAL INSTRUCTIONS:
+1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+   - Copy the exact characters, including all punctuation (periods, colons, pipes, etc.), up to the first space after the identifier
+   - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+   - This should be the complete caption as it appears in the document
+   - Include at least 200-300 characters to ensure unique matching
+3. For each location, specify whether it is in the main manuscript or the supplementary information (SI):
+   - Items like "Table S1", "Figure S2", etc. are typically in the SI
+   - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+   - If uncertain, use context clues from the text
 
 Order by confidence score (highest first). Tables showing complete variant lineages or
 mutation lists should be ranked higher than figures showing complete variant lineages.
@@ -776,9 +785,9 @@ Don't include oligonucleotide results or results from only one round.
 
 Example output:
 [
-{{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
-{{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
-{{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+{{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
+{{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
+{{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
 ]
 """.strip()
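
Each element of this array is consumed downstream as a plain dict; a minimal sketch of the new fields and how later code in this diff reads them (values shortened for illustration):

# Illustrative parsed element from the location-identification response
location = {
    "location": "Table S1.",   # verbatim identifier, punctuation preserved
    "type": "table",
    "confidence": 95,
    "reason": "Variant lineage table",
    "source": "si",            # new in 0.6.2: selects SI vs. manuscript text
    "caption": "Table S1. Summary of mutations introduced during...",
}

# Mirrors the location.get(...) calls added later in this diff:
use_si_text = location.get("source", "manuscript") == "si"
caption_hint = location.get("caption", "")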
 
@@ -956,6 +965,9 @@ def identify_evolution_locations(
        campaign_context = f"\nYou are looking for lineage data for a SPECIFIC campaign:\n- Campaign: {camp.campaign_name}\n- Description: {camp.description}\n"
        if hasattr(camp, 'notes') and camp.notes:
            campaign_context += f"- Key identifiers: {camp.notes}\n"
+       if hasattr(camp, 'data_locations') and camp.data_locations:
+           campaign_context += f"- KNOWN DATA LOCATIONS: {', '.join(camp.data_locations)}\n"
+           campaign_context += " IMPORTANT: Prioritize these known locations highly!\n"
        campaign_specific = f" for the '{camp.campaign_name}' campaign"
        campaign_field = '\n- "campaign_id": "{}" (optional - include if this location is specific to one campaign)'.format(camp.campaign_id)
        campaign_example = f', "campaign_id": "{camp.campaign_id}"'
@@ -964,7 +976,10 @@ def identify_evolution_locations(
        campaign_context = "\nThis manuscript contains multiple directed evolution campaigns:\n"
        for camp in campaigns:
            campaign_context += f"- {camp.campaign_id}: {camp.campaign_name} - {camp.description}\n"
+           if hasattr(camp, 'data_locations') and camp.data_locations:
+               campaign_context += f"  Known locations: {', '.join(camp.data_locations)}\n"
        campaign_context += "\nFind locations that contain lineage data for ANY of these campaigns.\n"
+       campaign_context += "IMPORTANT: Prioritize the known locations listed above!\n"
        campaign_specific = " for any of the identified campaigns"
        campaign_field = '\n- "campaign_id": "string" (optional - include if this location is specific to one campaign)'
        campaign_example = ', "campaign_id": "campaign_id_here"'
@@ -1041,6 +1056,7 @@ def extract_complete_lineage(
    campaign_id: Optional[str] = None,
    campaign_info: Optional[Campaign] = None,
    pdf_paths: Optional[List[Path]] = None,
+   location_str: Optional[str] = None,
 ) -> List[Variant]:
    """Prompt Gemini for the full lineage and return a list[Variant]."""
    # Build campaign context
@@ -1060,6 +1076,21 @@ IMPORTANT:
 2. Include "campaign_id": "{campaign_info.campaign_id}" for each variant in your response.
 3. Use the lineage hint pattern above to identify which variants belong to this campaign.
 4. Include parent variants only if they are direct ancestors in this campaign's lineage.
+"""
+
+    # Add location context if provided
+    location_context = ""
+    if location_str:
+        location_context = f"""
+
+LOCATION CONTEXT:
+You are extracting data SPECIFICALLY from: {location_str}
+
+CRITICAL INSTRUCTIONS:
+- ONLY extract enzyme variants that appear in {location_str}
+- DO NOT include variants from other figures, tables, or sections
+- If {location_str} references variants from other locations, DO NOT include those unless they are explicitly shown in {location_str}
+- Focus strictly on the data presented within the boundaries of {location_str}
 """
 
    # Extract table of contents from PDFs if available
@@ -1096,8 +1127,11 @@ IMPORTANT:
    # Include TOC in the prompt text
    combined_text = toc_text + text if toc_text else text
 
+   # Combine campaign and location context
+   full_context = campaign_context + location_context
+
    prompt = _LINEAGE_EXTRACT_PROMPT.format(
-       campaign_context=campaign_context,
+       campaign_context=full_context,
        schema=_LINEAGE_SCHEMA_HINT,
        text=combined_text[:MAX_CHARS],
    )
@@ -1438,10 +1472,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con
 
 # ---- 6.4 Public API -------------------------------------------------------
 
-def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
-    """Extract text from a specific location (table, section, etc.) in the full text."""
+def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text.
+
+    Args:
+        full_text: The full text to search in
+        location: The location identifier (e.g., "Table S1")
+        location_type: Type of location ("table", "figure", "section")
+        caption_hint: Optional full caption text for fuzzy matching
+    """
    import re
 
+    # If a caption hint is provided, try fuzzy matching first
+    if caption_hint and len(caption_hint) > 20:
+        log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+        # Normalize texts for better matching (similar to reaction_info_extractor)
+        def normalize_for_matching(text):
+            # Remove extra whitespace, normalize spaces around punctuation
+            text = ' '.join(text.split())
+            # Normalize different dash types
+            text = text.replace('–', '-').replace('—', '-')
+            return text
+
+        normalized_hint = normalize_for_matching(caption_hint[:150])  # Use first 150 chars
+        normalized_text = normalize_for_matching(full_text)
+
+        # Try to find ALL caption matches using character-based fuzzy matching
+        all_matches = []
+
+        # Slide through the text looking for all matches above the threshold
+        hint_len = len(normalized_hint)
+        for i in range(len(normalized_text) - hint_len + 1):
+            snippet = normalized_text[i:i + hint_len]
+            # Simple character-based similarity
+            matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+            score = matches / hint_len
+
+            if score > 0.7:  # 70% similarity threshold
+                all_matches.append({
+                    'norm_pos': i,
+                    'score': score
+                })
+
+        # If we found matches, extract from all of them
+        if all_matches:
+            log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+            # Collect all occurrences from fuzzy matches
+            all_occurrences = []
+            seen_positions = set()
+
+            for match_info in all_matches:
+                # Get the matched text from the normalized version
+                matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+                # Find where this appears in the original text
+                best_original_pos = -1
+
+                # Search the original text for this specific match
+                for i in range(len(full_text) - len(caption_hint) + 1):
+                    if i in seen_positions:
+                        continue
+
+                    original_snippet = full_text[i:i + len(caption_hint)]
+                    # Normalize and compare
+                    normalized_snippet = normalize_for_matching(original_snippet)
+                    if normalized_snippet[:hint_len] == matched_normalized:
+                        # Found an exact match after normalization
+                        best_original_pos = i
+                        seen_positions.add(i)
+                        break
+
+                if best_original_pos >= 0:
+                    # Extract generous context around this match position
+                    start = max(0, best_original_pos - 1000)
+                    end = min(len(full_text), best_original_pos + 10000)
+                    context = full_text[start:end]
+
+                    all_occurrences.append({
+                        'position': best_original_pos,
+                        'context': context,
+                        'score': match_info['score']
+                    })
+                    log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+            if all_occurrences:
+                # Sort by position to maintain document order
+                all_occurrences.sort(key=lambda x: x['position'])
+
+                # Combine all occurrences
+                combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+                for i, occurrence in enumerate(all_occurrences, 1):
+                    combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+                    combined_text += occurrence['context']
+                    combined_text += "\n\n"
+
+                # Apply the same limit as table extraction
+                if len(combined_text) > 150000:
+                    combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+                log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+                return combined_text
+            else:
+                log.warning("Could not map any fuzzy matches back to the original text")
+        else:
+            log.warning("No fuzzy matches found for caption above the 70% threshold")
+
    if location_type == 'table':
        # Find ALL mentions of this table and combine them
        location_clean = location.strip()
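
The sliding-window score above is a Hamming-style ratio: it only credits characters at identical offsets, so it tolerates substitutions but not insertions, and it costs O(len(text) × len(hint)) in the worst case. A standalone sketch of the same scoring, with stdlib difflib shown as an insertion-tolerant alternative (illustrative, not part of the diff):

from difflib import SequenceMatcher

def hamming_ratio(a: str, b: str) -> float:
    """Fraction of aligned positions where two equal-length strings agree."""
    return sum(x == y for x, y in zip(a, b)) / len(a)

hint = "Table S1. Summary of mutations introduced during directed evolution"
text = "... Table  S1.  Summary of mutations introduced during directed evolution of PA-G8 ..."

def normalize(s: str) -> str:
    return ' '.join(s.split())

needle, haystack = normalize(hint), normalize(text)

score, pos = max(
    (hamming_ratio(needle, haystack[i:i + len(needle)]), i)
    for i in range(len(haystack) - len(needle) + 1)
)
print(score, pos)  # windows scoring above 0.7 count as caption hits

# difflib also survives inserted/deleted characters, not just substitutions:
m = SequenceMatcher(None, needle, haystack).find_longest_match(0, len(needle), 0, len(haystack))
print(haystack[m.b:m.b + m.size])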
@@ -1483,6 +1621,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
 
        log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")
 
+       # Sort occurrences by position to maintain document order
+       all_occurrences.sort(key=lambda x: x['position'])
+
        # Combine all occurrences into one text for Gemini to analyze
        combined_text = f"=== All occurrences of {location_clean} ===\n\n"
 
@@ -1492,8 +1633,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
            combined_text += "\n\n"
 
        # Limit total length to avoid overwhelming the model
-       if len(combined_text) > 50000:
-           combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+       # Increased limit to ensure actual table content is included
+       if len(combined_text) > 150000:
+           combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
 
        return combined_text
 
@@ -1577,6 +1719,8 @@ def get_lineage(
    *,
    pdf_paths: Optional[List[Path]] = None,
    debug_dir: str | Path | None = None,
+   manuscript_text: Optional[str] = None,
+   si_text: Optional[str] = None,
 ) -> Tuple[List[Variant], List[Campaign]]:
    """
    High-level wrapper used by the pipeline.
@@ -1690,8 +1834,21 @@ def get_lineage(
        if location_type in ['table', 'text', 'section'] and not extracted_variants:
            log.info(f"Attempting text extraction for {location_type}: {location_str}")
 
-           # Extract the specific section/table from full text
-           section_text = _extract_location_text(full_text, location_str, location_type)
+           # Determine which text to use based on source
+           location_source = location.get('source', 'manuscript')
+           if location_source == 'si' and si_text:
+               text_to_search = si_text
+               log.info(f"Using SI text for location {location_str}")
+           elif location_source == 'manuscript' and manuscript_text:
+               text_to_search = manuscript_text
+               log.info(f"Using manuscript text for location {location_str}")
+           else:
+               text_to_search = full_text
+               log.info(f"Using combined text for location {location_str} (fallback)")
+
+           # Extract the specific section/table from the appropriate text
+           caption_hint = location.get('caption', '')
+           section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
            if section_text:
                log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
                # Save extracted section if debug enabled
@@ -1705,7 +1862,8 @@ def get_lineage(
                    debug_dir=debug_dir,
                    campaign_id=campaign.campaign_id,
                    campaign_info=campaign,
-                   pdf_paths=pdf_paths
+                   pdf_paths=pdf_paths,
+                   location_str=location_str
                )
                if variants:
                    log.info(f"Extracted {len(variants)} variants from {location_type}")
@@ -2004,17 +2162,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
 
 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
-- "section": the section heading or description
+- "section": the section heading or description EXACTLY as it appears
 - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or the supplementary information
+- "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)
 
 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences
 
-CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
-- Correct: "53", "S12", "147"
-- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+CRITICAL:
+1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+   - Correct: "53", "S12", "147"
+   - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+2. For each location, specify whether it is in the main manuscript or the supplementary information (SI):
+   - Pages with an "S" prefix (e.g., "S53") are typically in the SI
+   - Regular page numbers (e.g., "53") are typically in the main manuscript
+   - Use context clues from the document structure
 
 Return [] if no sequence sections are found.
 Absolutely don't include nucleotide or primer sequences; it is better to return nothing than an incomplete sequence. Use your best judgement.
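
A hedged sketch of one element this prompt should yield, and why the bare page-number format matters (the S-prefix arithmetic below mirrors the page-walking code later in this diff; the values are illustrative):

# Illustrative parsed element from the sequence-location response
seq_location = {
    "section": "Nucleotide and amino acid sequences",
    "page": "S53",     # plain or S-prefixed number only - no "p."/"page"
    "source": "si",
    "caption": "Appendix 4. Nucleotide and amino acid sequences of evolved variants...",
}

# Page walking relies on the bare format:
page = seq_location["page"]
if page.upper().startswith("S"):
    next_page = f"S{int(page[1:]) + 1}"   # "S53" -> "S54"
else:
    next_page = str(int(page) + 1)        # "53"  -> "54"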
@@ -2254,44 +2419,34 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
 
 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract EVERY distinct enzyme-variant sequence you can find in the text.
-
-IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
-- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
-- Only extract dna_seq if NO amino acid sequence is available for that variant
-- This reduces redundancy since protein sequences are usually more relevant
-
-CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
-- Papers often use different naming conventions in different sections
-- DO NOT normalize or simplify variant IDs
-- Extract the variant_id exactly as written where the sequence appears
-- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
-
-SEQUENCE EXTRACTION RULES:
-- Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids or nucleotides
-- Preserve the exact length and character sequence
-- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids or nucleotides are copied correctly
-
-For each variant return:
-  * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq     - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
-  * dna_seq    - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
-
-Respond ONLY with **minified JSON** that matches the schema below.
-NO markdown, no code fences, no commentary.
-
-Schema:
-```json
-{schema}
-```
+Extract ALL enzyme variant sequences from the text.
+
+Rules:
+1. Use EXACT variant IDs as they appear with each sequence
+2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
+3. For each variant:
+   - If an amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
+   - If ONLY a DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
+   - NEVER include both aa_seq and dna_seq for the same variant
+   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
+4. Return ONLY minified JSON, no markdown or commentary
+
+CRITICAL SEQUENCE PRIORITY RULE:
+- If you find BOTH an amino acid sequence AND a DNA sequence for the same variant, ONLY return the amino acid sequence
+- Set dna_seq to null when aa_seq is available, even if a DNA sequence is present in the text
+- Only return dna_seq when NO amino acid sequence exists for that variant
+
+CRITICAL ACCURACY REQUIREMENTS:
+- Extract ONLY sequences that are explicitly present in the provided text
+- DO NOT generate, infer, or hallucinate any sequences
+- Every character in the sequence must be directly copied from the text
+- If a sequence appears truncated or incomplete in the text, extract only what is shown
+- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+
+Schema: {schema}
 
-TEXT (may be truncated):
-```
+TEXT:
 {text}
-```
 """.strip()
 
 def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
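
The {schema} placeholder is filled from a schema-hint constant defined elsewhere in the module and not shown in this diff. As an illustration only, a response conforming to the rules above would be minified JSON like the following (the variant IDs and sequences are placeholders, not real data):

import json

raw = ('[{"variant_id":"PA-G8 R3","aa_seq":"MTAQESKLH","dna_seq":null},'
       '{"variant_id":"WT","aa_seq":null,"dna_seq":"ATGACTGCT"}]')

for entry in json.loads(raw):
    # Exactly one of aa_seq / dna_seq is set, per the priority rule above
    assert (entry["aa_seq"] is None) != (entry["dna_seq"] is None)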
2366
2521
 
2367
2522
 
2368
2523
  def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
2369
- """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
2524
+ """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.
2370
2525
 
2371
2526
  Can exit early after 2 attempts if the responses match exactly.
2372
2527
 
@@ -2380,9 +2535,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
        The most common sequence JSON data or None if all attempts failed
    """
    responses = []
-   max_attempts = 6
+   max_attempts = 3  # Reduced from 6 to 3 for performance
 
-   # Try 6 times with early match detection
+   # Try 3 times with early match detection
    for attempt in range(max_attempts):
        try:
            log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2408,8 +2563,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
 
            # Try to parse as JSON
            try:
-               parsed = json.loads(raw)
-           except json.JSONDecodeError:
+               # First clean the response - remove any BOM or invisible characters
+               raw_clean = raw.strip()
+               if raw_clean.startswith('\ufeff'):  # Remove BOM if present
+                   raw_clean = raw_clean[1:]
+               parsed = json.loads(raw_clean)
+           except json.JSONDecodeError as e:
+               log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
                # Look for JSON array or object in the response
                json_start = -1
                json_end = -1
@@ -2458,17 +2618,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
            responses.append(parsed)
            log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")
 
-           # Early match detection after 2 attempts
-           if attempt >= 1:  # After 2nd attempt (0-indexed)
-               valid_responses_so_far = [r for r in responses if r is not None]
-               if len(valid_responses_so_far) >= 2:
-                   # Check if the last two valid responses match
-                   if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
-                       log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
-                       # Add the matching response 4 more times to simulate consensus
-                       for _ in range(max_attempts - attempt - 1):
-                           responses.append(valid_responses_so_far[-1])
-                       break
+           # If we got a good response with sequences, we can check for early termination
+           if isinstance(parsed, list) and len(parsed) > 0:
+               # Early match detection after 2 attempts
+               if attempt >= 1:  # After 2nd attempt (0-indexed)
+                   valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+                   if len(valid_responses_so_far) >= 2:
+                       # Check if the last two valid responses match
+                       if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                           log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                           # Add the matching response to fill the remaining attempts
+                           for _ in range(max_attempts - attempt - 1):
+                               responses.append(valid_responses_so_far[-1])
+                           break
+               # If this is the first attempt and we got sequences, continue to validate with at least one more
+               elif attempt == 0 and len(parsed) > 5:  # Got substantial sequences on first try
+                   log.info("Got substantial sequences on first attempt, will validate with one more")
 
        except Exception as e:
            log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
@@ -2828,9 +2993,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
    focused_text = ""
    if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
        page_num = best_location['page']
-       # Extract current page plus next 15 pages
+       # Extract current page plus next 5 pages (6 total) to prevent hallucination
        all_pages = []
-       for i in range(16):  # Current + next 15
+       for i in range(6):  # Current + next 5 (6 pages total)
            if isinstance(page_num, str) and page_num.upper().startswith('S'):
                next_page = f"S{int(page_num[1:]) + i}"
            else:
@@ -2842,7 +3007,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                break
        if all_pages:
            focused_text = "\n".join(all_pages)
-           log.info("Extracted %d chars from pages %s through %d more pages",
+           log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
                     len(focused_text), page_num, len(all_pages) - 1)
 
    # Fallback to text search if page extraction didn't work
@@ -3128,6 +3293,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
    return {}
 
 
+def _match_variant_ids_with_gemini(
+    lineage_variant_ids: List[str],
+    pdb_variant_ids: List[str],
+    model
+) -> Dict[str, str]:
+    """Use Gemini to match variant IDs that may have slight formatting differences.
+
+    Args:
+        lineage_variant_ids: List of variant IDs from the lineage
+        pdb_variant_ids: List of variant IDs from PDB matching
+        model: Gemini model for matching
+
+    Returns:
+        Dictionary mapping lineage_variant_id -> pdb_variant_id
+    """
+    if not lineage_variant_ids or not pdb_variant_ids or not model:
+        return {}
+
+    # If the lists are identical, return a direct mapping
+    if set(lineage_variant_ids) == set(pdb_variant_ids):
+        return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+    # Use Gemini to match variant IDs that may have formatting differences
+    prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+These represent the same enzyme variants but may be formatted differently.
+
+Lineage variant IDs:
+{json.dumps(lineage_variant_ids, indent=2)}
+
+PDB variant IDs:
+{json.dumps(pdb_variant_ids, indent=2)}
+
+Match variants that represent the SAME enzyme variant, accounting for:
+- Whitespace differences (extra spaces, tabs)
+- Character encoding differences
+- Minor formatting variations
+
+Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+Format: {{"lineage_id": "pdb_id", ...}}
+Only include matches you are confident represent the same variant.
+Return an empty object {{}} if no matches can be confidently made.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text.startswith("```"):
+            text = text.split("```")[1].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        # Clean up the text
+        text = text.strip()
+        if not text or text == "{}":
+            return {}
+
+        matches = json.loads(text)
+        log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+        # Validate matches
+        valid_matches = {}
+        for lineage_id, pdb_id in matches.items():
+            if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+                valid_matches[lineage_id] = pdb_id
+                log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+            else:
+                log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+        return valid_matches
+
+    except Exception as e:
+        log.warning(f"Failed to match variant IDs with Gemini: {e}")
+        return {}
+
+
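A usage sketch for the new helper (the IDs below are hypothetical; in the pipeline it is called with the lineage's variant IDs and the keys of variant_to_chain):

# Hypothetical inputs: same variants, different whitespace
lineage_ids = ["ApePgb GLVRSQL", "PA-G8 R3"]
pdb_ids = ["ApePgb  GLVRSQL", "PA-G8 R3"]   # note the doubled space

mapping = _match_variant_ids_with_gemini(lineage_ids, pdb_ids, model)
# -> {"ApePgb GLVRSQL": "ApePgb  GLVRSQL", "PA-G8 R3": "PA-G8 R3"}
# Identical ID sets short-circuit to a direct mapping without an API call.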
 def match_pdb_to_variants(
    pdb_sequences: Dict[str, str],
    variants: List[Variant],
@@ -3211,24 +3453,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
        text = _extract_text(response).strip()
 
        # Parse JSON response (expecting a single string)
-       if text.startswith("```"):
+       # Look for JSON code blocks first
+       if "```json" in text:
+           # Extract content between ```json and ```
+           import re
+           json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+           if json_match:
+               json_content = json_match.group(1).strip()
+               try:
+                   # Parse as JSON and extract the string value
+                   parsed = json.loads(json_content)
+                   matched_variant = str(parsed).strip('"\'')
+               except:
+                   # If JSON parsing fails, try to extract the quoted string
+                   quoted_match = re.search(r'"([^"]+)"', json_content)
+                   if quoted_match:
+                       matched_variant = quoted_match.group(1)
+                   else:
+                       matched_variant = json_content.strip('"\'')
+           else:
+               matched_variant = text.strip('"\'')
+       elif text.startswith("```"):
+           # Handle other code blocks
            text = text.split("```")[1].strip()
            if text.startswith("json"):
                text = text[4:].strip()
+           matched_variant = text.strip('"\'')
+       else:
+           # Look for quoted strings in the response
+           import re
+           quoted_match = re.search(r'"([^"]+)"', text)
+           if quoted_match:
+               matched_variant = quoted_match.group(1)
+           else:
+               # Remove quotes if present
+               matched_variant = text.strip('"\'')
 
-       # Remove quotes if present
-       text = text.strip('"\'')
-
-       matched_variant = text
+       log.info(f"Extracted variant name: '{matched_variant}' from response")
        log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")
 
        # Return mapping with all chains pointing to the same variant
        mapping = {}
-       if matched_variant and any(v.variant_id == matched_variant for v in variants):
-           for chain_id in pdb_sequences:
-               mapping[matched_variant] = chain_id
-               break  # Only use the first chain
+       if matched_variant:
+           # Debug logging
+           variant_ids = [v.variant_id for v in variants]
+           log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+           # Check if the matched variant exists in the lineage
+           found_variant = any(v.variant_id == matched_variant for v in variants)
+           log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+           if found_variant:
+               for chain_id in pdb_sequences:
+                   mapping[matched_variant] = chain_id
+                   log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+                   break  # Only use the first chain
+           else:
+               log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+               # Try fuzzy matching
+               for variant in variants:
+                   if variant.variant_id.strip() == matched_variant.strip():
+                       log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+                       for chain_id in pdb_sequences:
+                           mapping[variant.variant_id] = chain_id
+                           log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+                           break
+                       break
+       else:
+           log.warning("No matched variant extracted from response")
 
+       log.info(f"Final mapping result: {mapping}")
        return mapping
 
    except Exception as e:
@@ -3364,6 +3658,9 @@ Only match variants that represent the SAME enzyme, accounting for different nam
 Return ONLY a JSON object mapping lineage IDs to sequence IDs.
 Format: {{"lineage_id": "sequence_id", ...}}
 Only include matches you are confident represent the same variant.
+
+DO NOT include any explanation, reasoning, or text other than the JSON object.
+Response must be valid JSON that starts with {{ and ends with }}
 """
 
    try:
@@ -3406,17 +3703,28 @@ Only include matches you are confident represent the same variant.
            log.error(f"Full cleaned text: {text}")
            # Try to extract JSON from within the response
            import re
-           json_match = re.search(r'\{.*\}', text, re.DOTALL)
-           if json_match:
+           # First try to find JSON in code blocks
+           code_block_match = re.search(r'```json\s*(\{[^`]*\})\s*```', text, re.DOTALL)
+           if code_block_match:
                try:
-                   matches = json.loads(json_match.group(0))
-                   log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
+                   matches = json.loads(code_block_match.group(1))
+                   log.info(f"Successfully extracted JSON from code block: {len(matches)} matches")
                except json.JSONDecodeError:
-                   log.error("Failed to extract JSON from response")
+                   log.error("Failed to parse JSON from code block")
                    matches = {}
            else:
-               log.error("No JSON object found in response")
-               matches = {}
+               # Try to find standalone JSON object (non-greedy, looking for balanced braces)
+               json_match = re.search(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})', text)
+               if json_match:
+                   try:
+                       matches = json.loads(json_match.group(1))
+                       log.info(f"Successfully extracted JSON from response: {len(matches)} matches")
+                   except json.JSONDecodeError:
+                       log.error("Failed to extract JSON from response")
+                       matches = {}
+               else:
+                   log.error("No JSON object found in response")
+                   matches = {}
 
    # Create a mapping of sequence IDs to their data for efficient lookup
    seq_data_map = {row['variant_id']: row for idx, row in unmatched_seqs.iterrows()}
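
The fallback pattern \{[^{}]*(?:\{[^{}]*\}[^{}]*)*\} matches a JSON object with at most one level of nesting - enough for the flat mapping this prompt requests, without over-capturing surrounding prose the way the old greedy \{.*\} could. A standalone demonstration (illustrative, not from the diff):

import json
import re

PATTERN = re.compile(r'(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})')

reply = 'Sure! Here is the mapping: {"G8": "seq_G8", "R3": "seq_R3"} Hope that helps.'
print(json.loads(PATTERN.search(reply).group(1)))   # {'G8': 'seq_G8', 'R3': 'seq_R3'}

# One nested level still matches; deeper nesting falls back to the inner object:
assert PATTERN.search('{"a": {"b": 1}}').group(1) == '{"a": {"b": 1}}'
assert PATTERN.search('{"a": {"b": {"c": 1}}}').group(1) == '{"b": {"c": 1}}'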
@@ -3596,14 +3904,28 @@ def run_pipeline(
    caption_text = limited_caption_concat(*pdf_paths)
    full_text = limited_concat(*pdf_paths)
 
+   # Also load separate texts for manuscript and SI
+   manuscript_text = limited_concat(manuscript) if manuscript else None
+   si_text = limited_concat(si_path) if si_path else None
+
    log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
             len(caption_text), len(full_text))
+   if manuscript_text:
+       log.info("Loaded %d chars from manuscript", len(manuscript_text))
+   if si_text:
+       log.info("Loaded %d chars from SI", len(si_text))
 
    # 2. Connect to Gemini -----------------------------------------------------
    model = get_model()
 
    # 3. Extract lineage (Section 6) ------------------------------------------
-   lineage, campaigns = get_lineage(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+   lineage, campaigns = get_lineage(
+       caption_text, full_text, model,
+       pdf_paths=pdf_paths,
+       debug_dir=debug_dir,
+       manuscript_text=manuscript_text,
+       si_text=si_text
+   )
 
    if not lineage:
        raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3683,12 +4005,40 @@ def run_pipeline(
                pdb_sequences, lineage, full_text, model, pdb_id
            )
 
+           log.info(f"PDB matching result: {variant_to_chain}")
+           log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+           log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
            # Convert to SequenceBlock objects
            pdb_seq_blocks = []
-           for variant in lineage:
-               if variant.variant_id in variant_to_chain:
-                   chain_id = variant_to_chain[variant.variant_id]
-                   if chain_id in pdb_sequences:
+
+           # Use Gemini-based matching for robust variant ID comparison
+           if variant_to_chain and model:
+               # Create a mapping using Gemini for robust string matching
+               gemini_mapping = _match_variant_ids_with_gemini(
+                   lineage_variant_ids=[v.variant_id for v in lineage],
+                   pdb_variant_ids=list(variant_to_chain.keys()),
+                   model=model
+               )
+
+               for variant in lineage:
+                   log.info(f"Processing variant: {variant.variant_id}")
+
+                   # Try direct match first
+                   chain_id = variant_to_chain.get(variant.variant_id)
+                   log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+                   # If no direct match, try Gemini-based matching
+                   if not chain_id:
+                       matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+                       log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+                       if matched_pdb_variant:
+                           chain_id = variant_to_chain.get(matched_pdb_variant)
+                           log.info(f"Chain ID from Gemini match: {chain_id}")
+
+                   if chain_id and chain_id in pdb_sequences:
+                       seq_length = len(pdb_sequences[chain_id])
+                       log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
                        seq_block = SequenceBlock(
                            variant_id=variant.variant_id,
                            aa_seq=pdb_sequences[chain_id],
@@ -3699,6 +4049,26 @@ def run_pipeline(
                        )
                        pdb_seq_blocks.append(seq_block)
                        log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+                   else:
+                       log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+           else:
+               # Fallback to direct matching if no model or no matches
+               for variant in lineage:
+                   if variant.variant_id in variant_to_chain:
+                       chain_id = variant_to_chain[variant.variant_id]
+                       if chain_id in pdb_sequences:
+                           seq_block = SequenceBlock(
+                               variant_id=variant.variant_id,
+                               aa_seq=pdb_sequences[chain_id],
+                               dna_seq=None,
+                               confidence=1.0,  # High confidence for PDB sequences
+                               truncated=False,
+                               metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+                           )
+                           pdb_seq_blocks.append(seq_block)
+                           log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+           log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")
 
            if pdb_seq_blocks:
                # Update the dataframe with PDB sequences
@@ -3708,8 +4078,13 @@ def run_pipeline(
                        df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
                        df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
                        df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+                       log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+                   else:
+                       log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
                log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                break
+           else:
+               log.warning(f"No PDB sequence blocks were created for {pdb_id}")
        else:
            log.warning(f"No sequences found in PDB {pdb_id}")
    else: