debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -336,7 +336,7 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
  return "\n".join(chunks)


- def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None) -> Optional[bytes]:
+ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None, caption_text: str = "") -> Optional[bytes]:
  """Extract a specific figure from a PDF by finding its caption.

  Returns the figure as PNG bytes if found, None otherwise.
@@ -345,64 +345,49 @@ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Option
  figure_bytes = None

  try:
- # Search for the exact figure caption text
- search_text = figure_id.strip()
+ # Use caption text if provided, otherwise use figure_id
+ if caption_text:
+ # Use first 50 chars of caption for searching (enough to be unique)
+ search_text = caption_text[:50].strip()
+ log.info(f"Searching for figure using caption: '{search_text}...'")
+ else:
+ search_text = figure_id.strip()
+ log.info(f"Searching for figure using ID: '{search_text}'")

  for page_num, page in enumerate(doc):
- # Search for the caption text on this page
- text_instances = page.search_for(search_text)
+ page_text = page.get_text()

- if text_instances:
- log.info(f"Found caption '{figure_id}' on page {page_num + 1}")
+ # Check if caption text appears on this page
+ if search_text in page_text:
+ log.info(f"Found caption on page {page_num + 1}")

- # Get the position of the first instance
- caption_rect = text_instances[0]
+ # Search for the exact text position
+ text_instances = page.search_for(search_text)

- # Get all images on this page
- image_list = page.get_images()
+ if text_instances:
+ # Get the position of the caption
+ caption_rect = text_instances[0]

- if image_list:
- # Find the image closest to and above the caption
- best_img = None
- best_distance = float('inf')
-
- for img_index, img in enumerate(image_list):
- # Get image position
- xref = img[0]
- img_rects = page.get_image_rects(xref)
-
- if img_rects:
- img_rect = img_rects[0]
-
- # Check if image is above the caption and calculate distance
- if img_rect.y1 <= caption_rect.y0: # Image bottom is above caption top
- distance = caption_rect.y0 - img_rect.y1
- if distance < best_distance and distance < 100: # Within reasonable distance
- best_distance = distance
- best_img = xref
-
- if best_img is not None:
- # Extract the identified image
- pix = fitz.Pixmap(doc, best_img)
-
- if pix.n - pix.alpha < 4: # GRAY or RGB
- figure_bytes = pix.tobytes("png")
- else: # Convert CMYK to RGB
- pix2 = fitz.Pixmap(fitz.csRGB, pix)
- figure_bytes = pix2.tobytes("png")
- pix2 = None
- pix = None
-
- # Save to debug directory if provided
- if debug_dir and figure_bytes:
- debug_path = Path(debug_dir)
- debug_path.mkdir(parents=True, exist_ok=True)
- fig_file = debug_path / f"figure_{figure_id.replace(' ', '_').replace('.', '')}_{int(time.time())}.png"
- with open(fig_file, 'wb') as f:
- f.write(figure_bytes)
- log.info(f"Saved figure to: {fig_file}")
-
- break
+ # Instead of trying to extract individual images,
+ # extract the ENTIRE PAGE as an image
+ # This ensures we get the complete figure with all panels
+ log.info(f"Extracting entire page {page_num + 1} containing figure {figure_id}")
+
+ # Use high resolution for clarity
+ mat = fitz.Matrix(3.0, 3.0) # 3x zoom
+ pix = page.get_pixmap(matrix=mat)
+ figure_bytes = pix.tobytes("png")
+
+ # Save the extracted figure if debug is enabled
+ if debug_dir and figure_bytes:
+ debug_path = Path(debug_dir)
+ debug_path.mkdir(parents=True, exist_ok=True)
+ figure_file = debug_path / f"figure_{figure_id.replace(' ', '_')}_{int(time.time())}.png"
+ with open(figure_file, 'wb') as f:
+ f.write(figure_bytes)
+ log.info(f"Saved figure to: {figure_file}")
+
+ break # Found the figure, no need to continue

  finally:
  doc.close()
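
Note: a minimal, self-contained sketch of the whole-page strategy this hunk introduces, assuming PyMuPDF (fitz); the helper name render_page_containing is hypothetical and not part of debase:

    import fitz  # PyMuPDF

    def render_page_containing(pdf_path: str, search_text: str) -> bytes | None:
        """Render the first page whose text contains search_text as a PNG."""
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                if search_text in page.get_text():
                    # 3x zoom, matching the hunk above; higher zoom = sharper PNG
                    pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))
                    return pix.tobytes("png")
            return None
        finally:
            doc.close()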
@@ -465,7 +450,7 @@ def get_model():
  "temperature": 0.0, # Deterministic: always pick the most likely token
  "top_p": 1.0, # Consider all tokens (but temperature=0 will pick the best)
  "top_k": 1, # Only consider the single most likely token
- "max_output_tokens": 32768, # Increased from 8192 to handle larger sequence extractions
+ "max_output_tokens": 65536, # Increased to 2x for handling larger lineage tables and sequences
  }

  # For Gemini 2.5 Flash, disable thinking tokens to save costs
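
Note: for context, a hypothetical sketch of where such a deterministic config lands in google-generativeai; the model name is an assumption for illustration, not pinned by this diff:

    import google.generativeai as genai

    generation_config = {
        "temperature": 0.0,          # deterministic decoding
        "top_p": 1.0,
        "top_k": 1,
        "max_output_tokens": 65536,  # the new, doubled ceiling from this hunk
    }
    # model name assumed for illustration only
    model = genai.GenerativeModel("gemini-2.5-flash", generation_config=generation_config)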
@@ -685,39 +670,39 @@ from typing import List, Dict, Any
  # ---- 6.0 Campaign identification prompts -----------------------------------

  _CAMPAIGN_IDENTIFICATION_PROMPT = """
- You are an expert reader of protein engineering manuscripts.
- Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.
-
- Each campaign represents a separate evolutionary lineage targeting different:
- - Model reactions (e.g., different chemical transformations)
- - Substrate scopes
- - Activities (e.g., different enzymatic reactions)
+ Identify directed evolution LINEAGE campaigns in this manuscript.

+ A campaign is a multi-round directed evolution effort that creates a FAMILY of variants through iterative cycles.
  Look for:
- 1. Different model substrates/products mentioned (e.g., different substrate/product pairs)
- 2. Distinct enzyme lineage names (e.g., different variant naming patterns)
- 3. Separate evolution trees or lineage tables
- 4. Different reaction schemes or transformations
+ - Multiple rounds/generations of evolution (e.g., "8 rounds of evolution", "5 generations")
+ - Lineage trees or variant families (e.g., "L1→L2→L3→L4", "WT→M1→M2→M3")
+ - Progressive improvement through iterations
+ - Parent-child relationships across multiple variants
+
+ Do NOT include:
+ - Single-point mutation studies or individual variant characterization
+ - Simple site-saturation mutagenesis at one position
+
+ IMPORTANT: Include previously evolved lineages IF they are the main focus of THIS paper (e.g., characterizing a previously evolved enzyme lineage with new substrates/conditions)
+
+ Key phrases: "rounds of directed evolution", "iterative evolution", "evolutionary lineage", "variant lineage", "generations of evolution"

  Return a JSON array of campaigns:
  [
  {{
  "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
  "campaign_name": "descriptive name",
- "description": "what this campaign evolved for",
+ "description": "what THIS STUDY evolved for",
  "model_substrate": "substrate name/id",
  "model_product": "product name/id",
  "substrate_id": "id from paper (e.g., 1a)",
  "product_id": "id from paper (e.g., 2a)",
  "data_locations": ["Table S1", "Figure 1"],
  "lineage_hint": "enzyme name pattern",
- "notes": "additional context"
+ "notes": "evidence this was evolved in THIS study"
  }}
  ]

- IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
- Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
-
  TEXT:
  {text}
  """.strip()
@@ -757,26 +742,46 @@ lineage of enzyme variants (i.e. which variant came from which parent and what
  mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
  ensure the location you return are actually lineage location with variants and mutations.

+ IMPORTANT SCORING CRITERIA:
+ - Locations that explicitly mention "lineage" should be scored MUCH HIGHER (90-100)
+ - Locations mentioning "evolutionary tree", "phylogenetic", "genealogy", or "ancestry" should also score high (85-95)
+ - Locations that only mention "variants" without lineage context should score lower (60-80)
+ - Generic tables of variants without parent-child relationships should score lowest (40-60)
+
  Respond with a JSON array of objects, each containing:
- - "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+ - "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
  - "type": one of "table", "figure", "section"
- - "confidence": your confidence score (0-100) that this location contains lineage data
+ - "confidence": your confidence score (0-100) that this location contains lineage data (PRIORITIZE "lineage" mentions!)
  - "reason": brief explanation of why this location likely contains lineage
+ - "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+ - "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
  {campaign_field}
- IMPORTANT: Return ONLY figure/table identifiers like "Figure 2" or "Table S1",
- NOT page numbers. Focus on the actual figure/table titles and numbers.
-
- Order by confidence score (highest first). Tables showing complete variant lineages or
- mutation lists should be ranked higher than figures showing complete variant lineages.
- Sections are used when no suitable tables/figures exist.
+ CRITICAL INSTRUCTIONS:
+ 1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+ - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+ - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+ 2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+ - This should be the complete caption as it appears in the document
+ - Include at least 200-300 characters to ensure unique matching
+ 3. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+ - Items like "Table S1", "Figure S2", etc. are typically in the SI
+ - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+ - If uncertain, use context clues from the text
+
+ Order by confidence score (highest first), with special priority for:
+ 1. Tables/figures explicitly mentioning "lineage" or "evolutionary tree" (score 90-100)
+ 2. Tables showing complete parent-child relationships with mutations (score 80-95)
+ 3. Figures showing evolutionary/phylogenetic trees (score 75-90)
+ 4. Tables listing variants with parent information (score 70-85)
+ 5. Generic variant tables without clear lineage information (score 40-70)

  Don't include oligonucleotide results or result from only one round.

  Example output:
  [
- {{"location": "Table S1", "type": "table", "confidence": 95, "reason": "Variant lineage table"{campaign_example}}},
- {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram"{campaign_example}}},
- {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description"{campaign_example}}}
+ {{"location": "Table S1.", "type": "table", "confidence": 98, "reason": "Complete enzyme lineage table with parent-child relationships", "source": "si", "caption": "Table S1. Complete lineage of enzyme variants showing the evolutionary progression from wild-type through eight rounds of directed evolution. Each variant is listed with its parent and accumulated mutations..."{campaign_example}}},
+ {{"location": "Figure 2B", "type": "figure", "confidence": 92, "reason": "Evolutionary tree explicitly showing lineage", "source": "manuscript", "caption": "Figure 2B Evolutionary lineage tree depicting the complete genealogy of engineered variants. Branches show parent-child relationships with mutations annotated..."{campaign_example}}},
+ {{"location": "Table 2", "type": "table", "confidence": 75, "reason": "Variant table with parent information", "source": "manuscript", "caption": "Table 2. Summary of enzyme variants generated in this study. Parent templates and mutations are indicated for each variant..."{campaign_example}}}
  ]
  """.strip()

@@ -908,6 +913,9 @@ def identify_evolution_locations(
  pdf_paths: Optional[List[Path]] = None,
  ) -> List[dict]:
  """Ask Gemini where in the paper the lineage is probably described."""
+ # Extract manuscript pages as images (in addition to text)
+ manuscript_images = []
+
  # Extract table of contents from PDFs if available
  toc_text = ""
  if pdf_paths:
@@ -938,6 +946,27 @@ def identify_evolution_locations(

  if toc_sections:
  toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+ # Extract manuscript pages as images
+ if len(pdf_paths) >= 1:
+ manuscript_pdf = pdf_paths[0]
+ log.info(f"Extracting manuscript pages as images from: {manuscript_pdf.name}")
+
+ doc = _open_doc(manuscript_pdf)
+ try:
+ # Extract up to 10 pages as images
+ for page_num in range(min(10, len(doc))):
+ page = doc[page_num]
+ # Render page as image
+ mat = fitz.Matrix(2, 2) # 2x zoom for better quality
+ pix = page.get_pixmap(matrix=mat)
+ img_bytes = pix.tobytes("png")
+ manuscript_images.append(img_bytes)
+ log.debug(f"Extracted page {page_num + 1} as image ({len(img_bytes)} bytes)")
+ finally:
+ doc.close()
+
+ log.info(f"Extracted {len(manuscript_images)} manuscript pages as images")

  # Include TOC before the main text
  combined_text = toc_text + text if toc_text else text
@@ -979,15 +1008,80 @@ def identify_evolution_locations(
  campaign_specific=campaign_specific,
  campaign_field=campaign_field,
  campaign_example=campaign_example
- ) + "\n\nTEXT:\n" + combined_text
+ )
+
  locs: List[dict] = []
  try:
- locs = generate_json_with_retry(
- model,
- prompt,
- debug_dir=debug_dir,
- tag="locate",
- )
+ if manuscript_images:
+ # Use vision API with manuscript images and SI text
+ log.info("Using vision API with %d manuscript page images and SI text", len(manuscript_images))
+
+ # Convert images to PIL format for Gemini
+ import PIL.Image
+ import io
+
+ pil_images = []
+ for img_bytes in manuscript_images:
+ image = PIL.Image.open(io.BytesIO(img_bytes))
+ pil_images.append(image)
+
+ # Build multimodal prompt with caption text AND manuscript images
+ multimodal_prompt = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
+
+ # Add manuscript page images
+ multimodal_prompt.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
+ multimodal_prompt.extend(pil_images)
+
+ # Save debug info if requested
+ if debug_dir:
+ debug_path = Path(debug_dir)
+ debug_path.mkdir(parents=True, exist_ok=True)
+
+ # Save prompt
+ prompt_file = debug_path / f"locate_vision_prompt_{int(time.time())}.txt"
+ _dump(f"=== VISION PROMPT FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nManuscript pages: {len(manuscript_images)}\nText length: {len(combined_text)} chars\n{'='*80}\n\n{prompt}\n\nTEXT (Captions and sections):\n{combined_text[:2000]}...(truncated)\n\n[{len(manuscript_images)} manuscript page images]",
+ prompt_file)
+
+ # Save manuscript page samples
+ for i, img_bytes in enumerate(manuscript_images[:3]): # Save first 3 pages
+ img_file = debug_path / f"locate_manuscript_page_{i+1}_{int(time.time())}.png"
+ _dump(img_bytes, img_file)
+
+ # Generate content with vision
+ response = model.generate_content(multimodal_prompt)
+ raw = response.text
+
+ # Parse JSON from response
+ try:
+ # Save raw response if debug enabled
+ if debug_dir:
+ response_file = Path(debug_dir) / f"locate_vision_response_{int(time.time())}.txt"
+ _dump(f"=== VISION RESPONSE FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}", response_file)
+
+ # Try to parse JSON
+ try:
+ locs = json.loads(raw)
+ except json.JSONDecodeError:
+ # Try to extract JSON from response
+ json_match = re.search(r'\[.*\]', raw, re.DOTALL)
+ if json_match:
+ locs = json.loads(json_match.group(0))
+ else:
+ log.warning("Could not parse JSON from vision response")
+ locs = []
+ except Exception as e:
+ log.warning(f"Error parsing vision response: {e}")
+ locs = []
+
+ else:
+ # Fall back to text-only mode
+ prompt += "\n\nTEXT:\n" + combined_text
+ locs = generate_json_with_retry(
+ model,
+ prompt,
+ debug_dir=debug_dir,
+ tag="locate",
+ )
  except Exception as exc: # pragma: no cover
  log.warning("identify_evolution_locations(): %s", exc)

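Note: the core of the new vision path, distilled. This assumes (as the hunk does) that google-generativeai accepts a mixed list of strings and PIL images as a single prompt; the helper name locate_with_vision is hypothetical:

    import io
    import PIL.Image

    def locate_with_vision(model, prompt: str, combined_text: str, manuscript_images: list) -> str:
        """Send caption text plus rendered manuscript pages in one multimodal call."""
        parts = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
        parts.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
        parts.extend(PIL.Image.open(io.BytesIO(b)) for b in manuscript_images)
        return model.generate_content(parts).text
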
@@ -1288,7 +1382,7 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:

  return False

- def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
+ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 50000, validate_sequences: bool = False) -> str:
  """Extract text around identified locations."""
  if not locations:
  return text
@@ -1461,10 +1555,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

  # ---- 6.4 Public API -------------------------------------------------------

- def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
- """Extract text from a specific location (table, section, etc.) in the full text."""
+ def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+ """Extract text from a specific location (table, section, etc.) in the full text.
+
+ Args:
+ full_text: The full text to search in
+ location: The location identifier (e.g., "Table S1")
+ location_type: Type of location ("table", "figure", "section")
+ caption_hint: Optional full caption text for fuzzy matching
+ """
  import re

+ # If caption hint is provided, try fuzzy matching first
+ if caption_hint and len(caption_hint) > 20:
+ log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+ # Normalize texts for better matching (similar to reaction_info_extractor)
+ def normalize_for_matching(text):
+ # Remove extra whitespace, normalize spaces around punctuation
+ text = ' '.join(text.split())
+ # Normalize different dash types
+ text = text.replace('–', '-').replace('—', '-')
+ return text
+
+ normalized_hint = normalize_for_matching(caption_hint[:150]) # Use first 150 chars
+ normalized_text = normalize_for_matching(full_text)
+
+ # Try to find ALL caption matches using character-based fuzzy matching
+ all_matches = []
+
+ # Slide through the text looking for all matches above threshold
+ hint_len = len(normalized_hint)
+ for i in range(len(normalized_text) - hint_len + 1):
+ snippet = normalized_text[i:i + hint_len]
+ # Simple character-based similarity
+ matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+ score = matches / hint_len
+
+ if score > 0.7: # 70% similarity threshold
+ all_matches.append({
+ 'norm_pos': i,
+ 'score': score
+ })
+
+ # If we found matches, extract from all of them
+ if all_matches:
+ log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+ # Collect all occurrences from fuzzy matches
+ all_occurrences = []
+ seen_positions = set()
+
+ for match_info in all_matches:
+ # Get the matched text from normalized version
+ matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+ # Find where this appears in the original text
+ best_original_pos = -1
+
+ # Search in the original text for this specific match
+ for i in range(len(full_text) - len(caption_hint) + 1):
+ if i in seen_positions:
+ continue
+
+ original_snippet = full_text[i:i + len(caption_hint)]
+ # Normalize and compare
+ normalized_snippet = normalize_for_matching(original_snippet)
+ if normalized_snippet[:hint_len] == matched_normalized:
+ # Found exact match after normalization
+ best_original_pos = i
+ seen_positions.add(i)
+ break
+
+ if best_original_pos >= 0:
+ # Extract generous context from this match position
+ start = max(0, best_original_pos - 1000)
+ end = min(len(full_text), best_original_pos + 10000)
+ context = full_text[start:end]
+
+ all_occurrences.append({
+ 'position': best_original_pos,
+ 'context': context,
+ 'score': match_info['score']
+ })
+ log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+ if all_occurrences:
+ # Sort by position to maintain document order
+ all_occurrences.sort(key=lambda x: x['position'])
+
+ # Combine all occurrences
+ combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+ for i, occurrence in enumerate(all_occurrences, 1):
+ combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+ combined_text += occurrence['context']
+ combined_text += "\n\n"
+
+ # Apply same limit as table extraction
+ if len(combined_text) > 150000:
+ combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+ log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+ return combined_text
+ else:
+ log.warning(f"Could not map any fuzzy matches back to original text")
+ else:
+ log.warning(f"No fuzzy matches found for caption above 70% threshold")
+
  if location_type == 'table':
  # Find ALL mentions of this table and combine them
  location_clean = location.strip()
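
Note: the fuzzy caption matcher above, distilled to its scoring core. The position-by-position character overlap is intentionally simple; difflib.SequenceMatcher.ratio() would be a reasonable drop-in if insertions and deletions also need to be tolerated:

    def find_fuzzy_matches(hint: str, text: str, threshold: float = 0.7) -> list:
        """Slide the normalized hint across the text; score by per-character agreement."""
        n = len(hint)
        matches = []
        for i in range(len(text) - n + 1):
            window = text[i:i + n]
            score = sum(a == b for a, b in zip(hint, window)) / n
            if score > threshold:
                matches.append((i, score))
        return matches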
@@ -1506,6 +1704,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->

  log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")

+ # Sort occurrences by position to maintain document order
+ all_occurrences.sort(key=lambda x: x['position'])
+
  # Combine all occurrences into one text for Gemini to analyze
  combined_text = f"=== All occurrences of {location_clean} ===\n\n"

@@ -1515,8 +1716,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
  combined_text += "\n\n"

  # Limit total length to avoid overwhelming the model
- if len(combined_text) > 50000:
- combined_text = combined_text[:50000] + "\n\n[Truncated due to length...]"
+ # Increased limit to ensure actual table content is included
+ if len(combined_text) > 150000:
+ combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"

  return combined_text

@@ -1600,6 +1802,8 @@ def get_lineage(
  *,
  pdf_paths: Optional[List[Path]] = None,
  debug_dir: str | Path | None = None,
+ manuscript_text: Optional[str] = None,
+ si_text: Optional[str] = None,
  ) -> Tuple[List[Variant], List[Campaign]]:
  """
  High-level wrapper used by the pipeline.
@@ -1667,54 +1871,72 @@ def get_lineage(
  for loc in locations:
  log.info(f" - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")

- # Try to extract from the best location
+ # Sort locations by confidence and use the highest confidence one
+ locations_sorted = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+ log.info(f"Using highest confidence location: {locations_sorted[0]['location']} (confidence: {locations_sorted[0]['confidence']})")
+
+ # Use the highest confidence location as primary location
+ primary_location = locations_sorted[0]
+
+ # Extract location details
+ location_str = primary_location.get('location', '')
+ location_type = primary_location.get('type', '')
+ confidence = primary_location.get('confidence', 0)
+ caption_text = primary_location.get('caption', '')
+
+ # Initialize extracted variants list
  extracted_variants = []
- for location in locations:
- if extracted_variants:
- break # Already got variants
-
- location_str = location.get('location', '')
- location_type = location.get('type', '')
- confidence = location.get('confidence', 0)
+
+ # Try figure extraction for high-confidence figures
+ if location_type == 'figure' and confidence >= 70 and pdf_paths:
+ log.info(f"Attempting to extract figure: {location_str}")

- # Try figure extraction for high-confidence figures
- if location_type == 'figure' and confidence >= 70 and pdf_paths:
- log.info(f"Attempting to extract figure: {location_str}")
-
- figure_bytes = None
- for pdf_path in pdf_paths:
- figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
- if figure_bytes:
- log.info(f"Successfully extracted figure from {pdf_path.name}")
- break
-
+ figure_bytes = None
+ for pdf_path in pdf_paths:
+ figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text=caption_text)
  if figure_bytes:
- # Save figure if debug enabled
- if debug_dir:
- debug_path = Path(debug_dir)
- debug_path.mkdir(parents=True, exist_ok=True)
- figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
- _dump(figure_bytes, figure_file)
- log.info(f"Saved figure to: {figure_file}")
-
- # Extract lineage from figure
- variants = extract_lineage_from_figure(
- figure_bytes, model,
- debug_dir=debug_dir,
- campaign_id=campaign.campaign_id,
- campaign_info=campaign
- )
- if variants:
- log.info(f"Extracted {len(variants)} variants from figure")
- extracted_variants = variants
- continue
+ log.info(f"Successfully extracted figure from {pdf_path.name}")
+ break

- # Try table/text extraction
- if location_type in ['table', 'text', 'section'] and not extracted_variants:
+ if figure_bytes:
+ # Save figure if debug enabled
+ if debug_dir:
+ debug_path = Path(debug_dir)
+ debug_path.mkdir(parents=True, exist_ok=True)
+ figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+ _dump(figure_bytes, figure_file)
+ log.info(f"Saved figure to: {figure_file}")
+
+ # Extract lineage from figure
+ variants = extract_lineage_from_figure(
+ figure_bytes, model,
+ debug_dir=debug_dir,
+ campaign_id=campaign.campaign_id,
+ campaign_info=campaign
+ )
+ if variants:
+ log.info(f"Extracted {len(variants)} variants from figure")
+ extracted_variants = variants
+
+ # Try table/text extraction if no figure extraction or if not a figure
+ if not extracted_variants and location_type in ['table', 'text', 'section']:
  log.info(f"Attempting text extraction for {location_type}: {location_str}")

- # Extract the specific section/table from full text
- section_text = _extract_location_text(full_text, location_str, location_type)
+ # Determine which text to use based on source
+ location_source = location.get('source', 'manuscript')
+ if location_source == 'si' and si_text:
+ text_to_search = si_text
+ log.info(f"Using SI text for location {location_str}")
+ elif location_source == 'manuscript' and manuscript_text:
+ text_to_search = manuscript_text
+ log.info(f"Using manuscript text for location {location_str}")
+ else:
+ text_to_search = full_text
+ log.info(f"Using combined text for location {location_str} (fallback)")
+
+ # Extract the specific section/table from appropriate text
+ caption_hint = location.get('caption', '')
+ section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
  if section_text:
  log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
  # Save extracted section if debug enabled
@@ -1940,8 +2162,9 @@ def get_lineage(

  # Try to extract the figure from available PDFs
  figure_bytes = None
+ # Note: This fallback path doesn't have the caption text
  for pdf_path in pdf_paths:
- figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+ figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text="")
  if figure_bytes:
  log.info("Successfully extracted figure from %s", pdf_path.name)
  break
@@ -1980,7 +2203,7 @@ def get_lineage(
  # Use text-based extraction (works for tables and text sections)
  # Extract from full text, not caption text - use only primary location
  # Use more context for tables since they often span multiple pages
- context_size = 15000 if location_type == 'table' else 5000
+ context_size = 75000 if location_type == 'table' else 50000
  focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
  log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
  len(full_text), len(focused_text),
@@ -2028,17 +2251,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.

  Look for table of contents entries or section listings that mention sequences.
  Return a JSON array where each element has:
- - "section": the section heading or description
+ - "section": the section heading or description EXACTLY as it appears
  - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+ - "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+ - "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)

  Focus on:
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
  - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
  - Prioritize sections that mention "protein" or "amino acid" sequences

- CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
- - Correct: "53", "S12", "147"
- - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+ CRITICAL:
+ 1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+ - Correct: "53", "S12", "147"
+ - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+ 2. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+ - Pages with "S" prefix (e.g., "S53") are typically in the SI
+ - Regular page numbers (e.g., "53") are typically in the main manuscript
+ - Use context clues from the document structure

  Return [] if no sequence sections are found.
  Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -2236,7 +2466,7 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
  # Fallback to text search if page extraction didn't work
  if not sample_text:
  sample_text = _extract_text_at_locations(
- text, [location], context_chars=2000, validate_sequences=False
+ text, [location], context_chars=20000, validate_sequences=False
  )

  samples.append({
@@ -2278,44 +2508,30 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:

  # --- 7.3 Main extraction prompt ---------------------------------------------
  _SEQ_EXTRACTION_PROMPT = """
- Extract EVERY distinct enzyme-variant sequence you can find in the text.
-
- IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
- - If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
- - Only extract dna_seq if NO amino acid sequence is available for that variant
- - This reduces redundancy since protein sequences are usually more relevant
-
- CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
- - Papers often use different naming conventions in different sections
- - DO NOT normalize or simplify variant IDs
- - Extract the variant_id exactly as written where the sequence appears
- - Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
-
- SEQUENCE EXTRACTION RULES:
- - Copy sequences EXACTLY as they appear in the text
- - Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
- - Do NOT add, remove, or modify any amino acids, or nucleotides
- - Preserve the exact length and character sequence
- - If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
- - Double-check that consecutive identical amino acids or nucleotides are copied correctly
-
- For each variant return:
- * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
- * aa_seq - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
- * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
-
- Respond ONLY with **minified JSON** that matches the schema below.
- NO markdown, no code fences, no commentary.
+ Extract ALL enzyme variant sequences from the text. Copy sequences EXACTLY as they appear - character by character.

- Schema:
- ```json
- {schema}
- ```
+ KEY RULES:
+ 1. EXHAUSTIVE SEARCH: If a variant appears multiple times, check ALL occurrences and extract the LONGEST sequence
+ 2. MULTI-PAGE: Sequences span pages. Skip page numbers (66, 67, etc.) that interrupt sequences
+ 3. MERGE IF NEEDED: If sequence continues after page break, combine the parts
+ 4. NO MODIFICATIONS: Copy exactly - no edits or improvements

- TEXT (may be truncated):
- ```
+ IMPORTANT: The same variant may appear multiple times with different sequence lengths. Always use the longest one.
+
+ SEQUENCE PRIORITY:
+ - If BOTH amino acid AND DNA exist → use amino acid ONLY
+ - For DNA: If mixed case, extract UPPERCASE only (lowercase=backbone)
+ - Return minified JSON only
+
+ ACCURACY:
+ - Extract ONLY what's written
+ - Never hallucinate
+ - Check entire document - complete sequences often appear later
+
+ Schema: {schema}
+
+ TEXT:
  {text}
- ```
  """.strip()

  def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
@@ -2390,7 +2606,7 @@ def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list,


  def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
- """Extract sequence JSON using Gemini with up to 6 attempts, returning most common result.
+ """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.

  Can exit early after 2 attempts if the responses match exactly.

@@ -2404,9 +2620,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
  The most common sequence JSON data or None if all attempts failed
  """
  responses = []
- max_attempts = 6
+ max_attempts = 5 # 5 attempts for better consensus

- # Try 6 times with early match detection
+ # Try 5 times with early match detection
  for attempt in range(max_attempts):
  try:
  log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2432,8 +2648,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s

  # Try to parse as JSON
  try:
- parsed = json.loads(raw)
- except json.JSONDecodeError:
+ # First clean the response - remove any BOM or invisible characters
+ raw_clean = raw.strip()
+ if raw_clean.startswith('\ufeff'): # Remove BOM if present
+ raw_clean = raw_clean[1:]
+ parsed = json.loads(raw_clean)
+ except json.JSONDecodeError as e:
+ log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
  # Look for JSON array or object in the response
  json_start = -1
  json_end = -1
@@ -2482,17 +2703,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
  responses.append(parsed)
  log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")

- # Early match detection after 2 attempts
- if attempt >= 1: # After 2nd attempt (0-indexed)
- valid_responses_so_far = [r for r in responses if r is not None]
- if len(valid_responses_so_far) >= 2:
- # Check if the last two valid responses match
- if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
- log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
- # Add the matching response 4 more times to simulate consensus
- for _ in range(max_attempts - attempt - 1):
- responses.append(valid_responses_so_far[-1])
- break
+ # If we got a good response with sequences, we can check for early termination
+ if isinstance(parsed, list) and len(parsed) > 0:
+ # Early match detection after 2 attempts
+ if attempt >= 1: # After 2nd attempt (0-indexed)
+ valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+ if len(valid_responses_so_far) >= 2:
+ # Check if the last two valid responses match
+ if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+ log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+ # Add the matching response to fill remaining attempts
+ for _ in range(max_attempts - attempt - 1):
+ responses.append(valid_responses_so_far[-1])
+ break
+ # If this is the first attempt and we got sequences, continue to validate with at least one more
+ elif attempt == 0 and len(parsed) > 5: # Got substantial sequences on first try
+ log.info("Got substantial sequences on first attempt, will validate with one more")

  except Exception as e:
  log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
@@ -2511,28 +2737,39 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
  if isinstance(resp, list):
  for seq in resp:
  if isinstance(seq, dict) and "variant_id" in seq:
- # Create a key for this sequence (variant_id + cleaned aa_seq)
+ # Create a key for this sequence (variant_id + cleaned sequence)
  variant_id = seq.get("variant_id", "")
  aa_seq = seq.get("aa_seq", "")
+ dna_seq = seq.get("dna_seq", "")
+
+ # Clean sequences for comparison
  if aa_seq:
  aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
- key = f"{variant_id}|{aa_seq}"
+ if dna_seq:
+ dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+
+ # Use whichever sequence is present for the key
+ seq_for_key = aa_seq if aa_seq else (dna_seq if dna_seq else "")
+ key = f"{variant_id}|{seq_for_key}"

  if key not in sequence_counts:
  sequence_counts[key] = {"count": 0, "data": seq}
  sequence_counts[key]["count"] += 1

- # Build result with sequences that appear in at least 3 attempts
+ # Build result with sequences that appear in at least 2 attempts
+ # Sort by count (descending) to prioritize sequences with higher consensus
  result = []
- for key, info in sequence_counts.items():
- if info["count"] >= 3: # Appears in at least 3/6 attempts
+ sorted_sequences = sorted(sequence_counts.items(), key=lambda x: x[1]["count"], reverse=True)
+
+ for key, info in sorted_sequences:
+ if info["count"] >= 2: # Appears in at least 2/5 attempts
  seq_data = info["data"].copy()
  seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
  result.append(seq_data)
  log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")

  if result:
- log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+ log.info(f"Extracted {len(result)} sequences with at least 2/{max_attempts} consensus")
  return result

  # If no sequences appear twice, return the most complete attempt
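
Note: the consensus rule above, distilled: key each extracted record by (variant_id, cleaned sequence) and keep anything seen in at least 2 of the 5 attempts, highest counts first. A minimal sketch under those assumptions, not the package's exact code:

    from collections import Counter

    def sequence_consensus(responses, min_votes=2):
        counts = Counter()
        by_key = {}
        for resp in responses:
            for seq in resp:
                raw = seq.get("aa_seq") or seq.get("dna_seq") or ""
                key = (seq.get("variant_id", ""), raw.replace(" ", "").replace("\n", "").upper())
                counts[key] += 1
                by_key[key] = seq
        # most_common() yields highest-consensus sequences first
        return [by_key[k] for k, c in counts.most_common() if c >= min_votes]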
@@ -2628,11 +2865,30 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
  if aa and len(aa) <= 50:
  log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
  aa = None
- if dna and len(dna) <= 150:
- log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
- dna = None
+
+ # Validate DNA sequences
+ if dna:
+ if len(dna) <= 150:
+ log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+ dna = None
+ # Check if DNA sequence length is divisible by 3
+ elif len(dna) % 3 != 0:
+ log.warning(f"Skipping DNA sequence for {vid}: length {len(dna)} not divisible by 3")
+ dna = None
+ else:
+ # Check for stop codons in the middle of the sequence
+ stop_codons = {'TAA', 'TAG', 'TGA'}
+ has_internal_stop = False
+ for i in range(0, len(dna) - 3, 3):
+ codon = dna[i:i+3]
+ if codon in stop_codons:
+ log.warning(f"Skipping DNA sequence for {vid}: internal stop codon {codon} at position {i}")
+ has_internal_stop = True
+ break
+ if has_internal_stop:
+ dna = None

- # Skip if both sequences are too short or missing
+ # Skip if both sequences are invalid or missing
  if not aa and not dna:
  continue

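Note: the new DNA sanity checks above, condensed into one predicate: reject short sequences, out-of-frame lengths, and internal stop codons (the range stops three bases short of the end, so a terminal stop codon is allowed). A sketch assuming an already-cleaned uppercase sequence:

    STOP_CODONS = {"TAA", "TAG", "TGA"}

    def dna_is_plausible(dna: str) -> bool:
        if len(dna) <= 150 or len(dna) % 3 != 0:
            return False
        # scan codon-by-codon, excluding the final (possibly terminal-stop) codon
        return all(dna[i:i + 3] not in STOP_CODONS for i in range(0, len(dna) - 3, 3))
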
@@ -2852,9 +3108,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
  focused_text = ""
  if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
  page_num = best_location['page']
- # Extract current page plus next 15 pages
+ # Extract current page plus next 5 pages (6 total) to prevent hallucination
  all_pages = []
- for i in range(16): # Current + next 15
+ for i in range(6): # Current + next 5 (6 pages total)
  if isinstance(page_num, str) and page_num.upper().startswith('S'):
  next_page = f"S{int(page_num[1:]) + i}"
  else:
@@ -2866,7 +3122,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
  break
  if all_pages:
  focused_text = "\n".join(all_pages)
- log.info("Extracted %d chars from pages %s through %d more pages",
+ log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
  len(focused_text), page_num, len(all_pages) - 1)

  # Fallback to text search if page extraction didn't work
@@ -2874,7 +3130,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
  log.info("Page extraction did not return text, falling back to text search")
  focused_text = _extract_text_at_locations(
  text, [best_location],
- context_chars=max(min_length, 30000),
+ context_chars=max(min_length, 50000),
  validate_sequences=True
  )

@@ -3152,6 +3408,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
  return {}


+ def _match_variant_ids_with_gemini(
+ lineage_variant_ids: List[str],
+ pdb_variant_ids: List[str],
+ model
+ ) -> Dict[str, str]:
+ """Use Gemini to match variant IDs that may have slight formatting differences.
+
+ Args:
+ lineage_variant_ids: List of variant IDs from the lineage
+ pdb_variant_ids: List of variant IDs from PDB matching
+ model: Gemini model for matching
+
+ Returns:
+ Dictionary mapping lineage_variant_id -> pdb_variant_id
+ """
+ if not lineage_variant_ids or not pdb_variant_ids or not model:
+ return {}
+
+ # If the lists are identical, return direct mapping
+ if set(lineage_variant_ids) == set(pdb_variant_ids):
+ return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+ # Use Gemini to match variant IDs that may have formatting differences
+ prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+ These represent the same enzyme variants but may be formatted differently.
+
+ Lineage variant IDs:
+ {json.dumps(lineage_variant_ids, indent=2)}
+
+ PDB variant IDs:
+ {json.dumps(pdb_variant_ids, indent=2)}
+
+ Match variants that represent the SAME enzyme variant, accounting for:
+ - Whitespace differences (extra spaces, tabs)
+ - Character encoding differences
+ - Minor formatting variations
+
+ Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+ Format: {{"lineage_id": "pdb_id", ...}}
+ Only include matches you are confident represent the same variant.
+ Return an empty object {{}} if no matches can be confidently made.
+ """
+
+ try:
+ response = model.generate_content(prompt)
+ text = _extract_text(response).strip()
+
+ # Parse JSON response
+ if text.startswith("```"):
+ text = text.split("```")[1].strip()
+ if text.startswith("json"):
+ text = text[4:].strip()
+
+ # Clean up the text
+ text = text.strip()
+ if not text or text == "{}":
+ return {}
+
+ matches = json.loads(text)
+ log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+ # Validate matches
+ valid_matches = {}
+ for lineage_id, pdb_id in matches.items():
+ if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+ valid_matches[lineage_id] = pdb_id
+ log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+ else:
+ log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+ return valid_matches
+
+ except Exception as e:
+ log.warning(f"Failed to match variant IDs with Gemini: {e}")
+ return {}
+
+
  def match_pdb_to_variants(
  pdb_sequences: Dict[str, str],
  variants: List[Variant],
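
Note: hypothetical usage of the new helper, reconciling lineage IDs with the IDs produced during PDB matching before chains are assigned (this mirrors the run_pipeline hunk further below; the example ID "ApePgb GLVRSQL" is taken from the prompt text quoted in the next hunk):

    gemini_mapping = _match_variant_ids_with_gemini(
        lineage_variant_ids=[v.variant_id for v in lineage],
        pdb_variant_ids=list(variant_to_chain.keys()),
        model=model,
    )
    # fall back to the Gemini-reconciled ID when a direct lookup misses
    chain_id = variant_to_chain.get(gemini_mapping.get("ApePgb GLVRSQL", ""))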
@@ -3235,24 +3568,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
  text = _extract_text(response).strip()

  # Parse JSON response (expecting a single string)
- if text.startswith("```"):
+ # Look for JSON code blocks first
+ if "```json" in text:
+ # Extract content between ```json and ```
+ import re
+ json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+ if json_match:
+ json_content = json_match.group(1).strip()
+ try:
+ # Parse as JSON and extract the string value
+ parsed = json.loads(json_content)
+ matched_variant = str(parsed).strip('"\'')
+ except:
+ # If JSON parsing fails, try to extract the quoted string
+ quoted_match = re.search(r'"([^"]+)"', json_content)
+ if quoted_match:
+ matched_variant = quoted_match.group(1)
+ else:
+ matched_variant = json_content.strip('"\'')
+ else:
+ matched_variant = text.strip('"\'')
+ elif text.startswith("```"):
+ # Handle other code blocks
  text = text.split("```")[1].strip()
  if text.startswith("json"):
  text = text[4:].strip()
+ matched_variant = text.strip('"\'')
+ else:
+ # Look for quoted strings in the response
+ import re
+ quoted_match = re.search(r'"([^"]+)"', text)
+ if quoted_match:
+ matched_variant = quoted_match.group(1)
+ else:
+ # Remove quotes if present
+ matched_variant = text.strip('"\'')

- # Remove quotes if present
- text = text.strip('"\'')
-
- matched_variant = text
+ log.info(f"Extracted variant name: '{matched_variant}' from response")
  log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")

  # Return mapping with all chains pointing to the same variant
  mapping = {}
- if matched_variant and any(v.variant_id == matched_variant for v in variants):
- for chain_id in pdb_sequences:
- mapping[matched_variant] = chain_id
- break # Only use the first chain
+ if matched_variant:
+ # Debug logging
+ variant_ids = [v.variant_id for v in variants]
+ log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+ # Check if the matched variant exists in the lineage
+ found_variant = any(v.variant_id == matched_variant for v in variants)
+ log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+ if found_variant:
+ for chain_id in pdb_sequences:
+ mapping[matched_variant] = chain_id
+ log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+ break # Only use the first chain
+ else:
+ log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+ # Try fuzzy matching
+ for variant in variants:
+ if variant.variant_id.strip() == matched_variant.strip():
+ log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+ for chain_id in pdb_sequences:
+ mapping[variant.variant_id] = chain_id
+ log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+ break
+ break
+ else:
+ log.warning("No matched variant extracted from response")

+ log.info(f"Final mapping result: {mapping}")
  return mapping

  except Exception as e:
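
Note: the fence-tolerant parsing above, distilled. It prefers a ```json block, then any fenced block, then the first quoted string; a sketch, not the verbatim package code:

    import json
    import re

    def extract_variant_id(text: str) -> str:
        m = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
        if m:
            block = m.group(1).strip()
            try:
                return str(json.loads(block)).strip('"\'')
            except json.JSONDecodeError:
                text = block
        elif text.startswith("```"):
            text = text.split("```")[1].strip().removeprefix("json").strip()
        quoted = re.search(r'"([^"]+)"', text)
        return quoted.group(1) if quoted else text.strip().strip('"\'')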
@@ -3634,14 +4019,28 @@ def run_pipeline(
  caption_text = limited_caption_concat(*pdf_paths)
  full_text = limited_concat(*pdf_paths)

+ # Also load separate texts for manuscript and SI
+ manuscript_text = limited_concat(manuscript) if manuscript else None
+ si_text = limited_concat(si_path) if si_path else None
+
  log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
  len(caption_text), len(full_text))
+ if manuscript_text:
+ log.info("Loaded %d chars from manuscript", len(manuscript_text))
+ if si_text:
+ log.info("Loaded %d chars from SI", len(si_text))

  # 2. Connect to Gemini -----------------------------------------------------
  model = get_model()

  # 3. Extract lineage (Section 6) ------------------------------------------
- lineage, campaigns = get_lineage(caption_text, full_text, model, pdf_paths=pdf_paths, debug_dir=debug_dir)
+ lineage, campaigns = get_lineage(
+ caption_text, full_text, model,
+ pdf_paths=pdf_paths,
+ debug_dir=debug_dir,
+ manuscript_text=manuscript_text,
+ si_text=si_text
+ )

  if not lineage:
  raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3721,12 +4120,40 @@ def run_pipeline(
  pdb_sequences, lineage, full_text, model, pdb_id
  )

+ log.info(f"PDB matching result: {variant_to_chain}")
+ log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+ log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
  # Convert to SequenceBlock objects
  pdb_seq_blocks = []
- for variant in lineage:
- if variant.variant_id in variant_to_chain:
- chain_id = variant_to_chain[variant.variant_id]
- if chain_id in pdb_sequences:
+
+ # Use Gemini-based matching for robust variant ID comparison
+ if variant_to_chain and model:
+ # Create a mapping using Gemini for robust string matching
+ gemini_mapping = _match_variant_ids_with_gemini(
+ lineage_variant_ids=[v.variant_id for v in lineage],
+ pdb_variant_ids=list(variant_to_chain.keys()),
+ model=model
+ )
+
+ for variant in lineage:
+ log.info(f"Processing variant: {variant.variant_id}")
+
+ # Try direct match first
+ chain_id = variant_to_chain.get(variant.variant_id)
+ log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+ # If no direct match, try Gemini-based matching
+ if not chain_id:
+ matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+ log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+ if matched_pdb_variant:
+ chain_id = variant_to_chain.get(matched_pdb_variant)
+ log.info(f"Chain ID from Gemini match: {chain_id}")
+
+ if chain_id and chain_id in pdb_sequences:
+ seq_length = len(pdb_sequences[chain_id])
+ log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
  seq_block = SequenceBlock(
  variant_id=variant.variant_id,
  aa_seq=pdb_sequences[chain_id],
@@ -3737,6 +4164,26 @@ def run_pipeline(
  )
  pdb_seq_blocks.append(seq_block)
  log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+ else:
+ log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+ else:
+ # Fallback to direct matching if no model or no matches
+ for variant in lineage:
+ if variant.variant_id in variant_to_chain:
+ chain_id = variant_to_chain[variant.variant_id]
+ if chain_id in pdb_sequences:
+ seq_block = SequenceBlock(
+ variant_id=variant.variant_id,
+ aa_seq=pdb_sequences[chain_id],
+ dna_seq=None,
+ confidence=1.0, # High confidence for PDB sequences
+ truncated=False,
+ metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+ )
+ pdb_seq_blocks.append(seq_block)
+ log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+ log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")

  if pdb_seq_blocks:
  # Update the dataframe with PDB sequences
@@ -3746,8 +4193,13 @@ def run_pipeline(
  df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
  df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
  df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+ log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+ else:
+ log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
  log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
  break
+ else:
+ log.warning(f"No PDB sequence blocks were created for {pdb_id}")
  else:
  log.warning(f"No sequences found in PDB {pdb_id}")
  else: