debase 0.6.2__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. {debase-0.6.2/src/debase.egg-info → debase-0.7.0}/PKG-INFO +1 -1
  2. {debase-0.6.2 → debase-0.7.0}/src/debase/_version.py +1 -1
  3. {debase-0.6.2 → debase-0.7.0}/src/debase/enzyme_lineage_extractor.py +278 -163
  4. {debase-0.6.2 → debase-0.7.0}/src/debase/lineage_format.py +11 -5
  5. {debase-0.6.2 → debase-0.7.0}/src/debase/reaction_info_extractor.py +209 -36
  6. {debase-0.6.2 → debase-0.7.0}/src/debase/substrate_scope_extractor.py +157 -56
  7. {debase-0.6.2 → debase-0.7.0/src/debase.egg-info}/PKG-INFO +1 -1
  8. {debase-0.6.2 → debase-0.7.0}/.gitignore +0 -0
  9. {debase-0.6.2 → debase-0.7.0}/LICENSE +0 -0
  10. {debase-0.6.2 → debase-0.7.0}/MANIFEST.in +0 -0
  11. {debase-0.6.2 → debase-0.7.0}/README.md +0 -0
  12. {debase-0.6.2 → debase-0.7.0}/environment.yml +0 -0
  13. {debase-0.6.2 → debase-0.7.0}/manuscript/DEBase_LLM_Validater.ipynb +0 -0
  14. {debase-0.6.2 → debase-0.7.0}/pyproject.toml +0 -0
  15. {debase-0.6.2 → debase-0.7.0}/setup.cfg +0 -0
  16. {debase-0.6.2 → debase-0.7.0}/setup.py +0 -0
  17. {debase-0.6.2 → debase-0.7.0}/src/__init__.py +0 -0
  18. {debase-0.6.2 → debase-0.7.0}/src/debase/__init__.py +0 -0
  19. {debase-0.6.2 → debase-0.7.0}/src/debase/__main__.py +0 -0
  20. {debase-0.6.2 → debase-0.7.0}/src/debase/build_db.py +0 -0
  21. {debase-0.6.2 → debase-0.7.0}/src/debase/campaign_utils.py +0 -0
  22. {debase-0.6.2 → debase-0.7.0}/src/debase/caption_pattern.py +0 -0
  23. {debase-0.6.2 → debase-0.7.0}/src/debase/cleanup_sequence.py +0 -0
  24. {debase-0.6.2 → debase-0.7.0}/src/debase/wrapper.py +0 -0
  25. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/SOURCES.txt +0 -0
  26. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/dependency_links.txt +0 -0
  27. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/entry_points.txt +0 -0
  28. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/requires.txt +0 -0
  29. {debase-0.6.2 → debase-0.7.0}/src/debase.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: debase
-Version: 0.6.2
+Version: 0.7.0
 Summary: Enzyme lineage analysis and sequence extraction package
 Home-page: https://github.com/YuemingLong/DEBase
 Author: DEBase Team
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.6.2"
+__version__ = "0.7.0"
@@ -336,7 +336,7 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
     return "\n".join(chunks)
 
 
-def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None) -> Optional[bytes]:
+def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None, caption_text: str = "") -> Optional[bytes]:
     """Extract a specific figure from a PDF by finding its caption.
 
     Returns the figure as PNG bytes if found, None otherwise.
@@ -345,64 +345,49 @@ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Option
     figure_bytes = None
 
     try:
-        # Search for the exact figure caption text
-        search_text = figure_id.strip()
+        # Use caption text if provided, otherwise use figure_id
+        if caption_text:
+            # Use first 50 chars of caption for searching (enough to be unique)
+            search_text = caption_text[:50].strip()
+            log.info(f"Searching for figure using caption: '{search_text}...'")
+        else:
+            search_text = figure_id.strip()
+            log.info(f"Searching for figure using ID: '{search_text}'")
 
         for page_num, page in enumerate(doc):
-            # Search for the caption text on this page
-            text_instances = page.search_for(search_text)
+            page_text = page.get_text()
 
-            if text_instances:
-                log.info(f"Found caption '{figure_id}' on page {page_num + 1}")
+            # Check if caption text appears on this page
+            if search_text in page_text:
+                log.info(f"Found caption on page {page_num + 1}")
 
-                # Get the position of the first instance
-                caption_rect = text_instances[0]
+                # Search for the exact text position
+                text_instances = page.search_for(search_text)
 
-                # Get all images on this page
-                image_list = page.get_images()
+                if text_instances:
+                    # Get the position of the caption
+                    caption_rect = text_instances[0]
 
-                if image_list:
-                    # Find the image closest to and above the caption
-                    best_img = None
-                    best_distance = float('inf')
-
-                    for img_index, img in enumerate(image_list):
-                        # Get image position
-                        xref = img[0]
-                        img_rects = page.get_image_rects(xref)
-
-                        if img_rects:
-                            img_rect = img_rects[0]
-
-                            # Check if image is above the caption and calculate distance
-                            if img_rect.y1 <= caption_rect.y0:  # Image bottom is above caption top
-                                distance = caption_rect.y0 - img_rect.y1
-                                if distance < best_distance and distance < 100:  # Within reasonable distance
-                                    best_distance = distance
-                                    best_img = xref
-
-                    if best_img is not None:
-                        # Extract the identified image
-                        pix = fitz.Pixmap(doc, best_img)
-
-                        if pix.n - pix.alpha < 4:  # GRAY or RGB
-                            figure_bytes = pix.tobytes("png")
-                        else:  # Convert CMYK to RGB
-                            pix2 = fitz.Pixmap(fitz.csRGB, pix)
-                            figure_bytes = pix2.tobytes("png")
-                            pix2 = None
-                        pix = None
-
-                        # Save to debug directory if provided
-                        if debug_dir and figure_bytes:
-                            debug_path = Path(debug_dir)
-                            debug_path.mkdir(parents=True, exist_ok=True)
-                            fig_file = debug_path / f"figure_{figure_id.replace(' ', '_').replace('.', '')}_{int(time.time())}.png"
-                            with open(fig_file, 'wb') as f:
-                                f.write(figure_bytes)
-                            log.info(f"Saved figure to: {fig_file}")
-
-                break
+                # Instead of trying to extract individual images,
+                # extract the ENTIRE PAGE as an image
+                # This ensures we get the complete figure with all panels
+                log.info(f"Extracting entire page {page_num + 1} containing figure {figure_id}")
+
+                # Use high resolution for clarity
+                mat = fitz.Matrix(3.0, 3.0)  # 3x zoom
+                pix = page.get_pixmap(matrix=mat)
+                figure_bytes = pix.tobytes("png")
+
+                # Save the extracted figure if debug is enabled
+                if debug_dir and figure_bytes:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    figure_file = debug_path / f"figure_{figure_id.replace(' ', '_')}_{int(time.time())}.png"
+                    with open(figure_file, 'wb') as f:
+                        f.write(figure_bytes)
+                    log.info(f"Saved figure to: {figure_file}")
+
+                break  # Found the figure, no need to continue
 
     finally:
         doc.close()
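In effect, the rewritten extract_figure no longer hunts for the embedded image object closest to the caption; it finds the page whose text contains the caption (preferring the full caption text over the bare figure ID) and rasterizes that entire page at high resolution. A minimal standalone sketch of the same idea, assuming PyMuPDF (fitz) is installed; the helper name and the 3x zoom factor are illustrative, not the package's API:

    from typing import Optional
    import fitz  # PyMuPDF

    def render_page_with_caption(pdf_path: str, caption_snippet: str) -> Optional[bytes]:
        """Rasterize the first page whose text contains caption_snippet; return PNG bytes."""
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                if caption_snippet in page.get_text():
                    # A whole-page render keeps multi-panel figures intact
                    pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))  # 3x zoom
                    return pix.tobytes("png")
        finally:
            doc.close()
        return None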
@@ -685,39 +670,39 @@ from typing import List, Dict, Any
 # ---- 6.0 Campaign identification prompts -----------------------------------
 
 _CAMPAIGN_IDENTIFICATION_PROMPT = """
-You are an expert reader of protein engineering manuscripts.
-Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.
-
-Each campaign represents a separate evolutionary lineage targeting different:
-- Model reactions (e.g., different chemical transformations)
-- Substrate scopes
-- Activities (e.g., different enzymatic reactions)
+Identify directed evolution LINEAGE campaigns in this manuscript.
 
+A campaign is a multi-round directed evolution effort that creates a FAMILY of variants through iterative cycles.
 Look for:
-1. Different model substrates/products mentioned (e.g., different substrate/product pairs)
-2. Distinct enzyme lineage names (e.g., different variant naming patterns)
-3. Separate evolution trees or lineage tables
-4. Different reaction schemes or transformations
+- Multiple rounds/generations of evolution (e.g., "8 rounds of evolution", "5 generations")
+- Lineage trees or variant families (e.g., "L1→L2→L3→L4", "WT→M1→M2→M3")
+- Progressive improvement through iterations
+- Parent-child relationships across multiple variants
+
+Do NOT include:
+- Single-point mutation studies or individual variant characterization
+- Simple site-saturation mutagenesis at one position
+
+IMPORTANT: Include previously evolved lineages IF they are the main focus of THIS paper (e.g., characterizing a previously evolved enzyme lineage with new substrates/conditions)
+
+Key phrases: "rounds of directed evolution", "iterative evolution", "evolutionary lineage", "variant lineage", "generations of evolution"
 
 Return a JSON array of campaigns:
 [
   {{
     "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
     "campaign_name": "descriptive name",
-    "description": "what this campaign evolved for",
+    "description": "what THIS STUDY evolved for",
     "model_substrate": "substrate name/id",
     "model_product": "product name/id",
     "substrate_id": "id from paper (e.g., 1a)",
     "product_id": "id from paper (e.g., 2a)",
     "data_locations": ["Table S1", "Figure 1"],
     "lineage_hint": "enzyme name pattern",
-    "notes": "additional context"
+    "notes": "evidence this was evolved in THIS study"
   }}
 ]
 
-IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
-Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
-
 TEXT:
 {text}
 """.strip()
@@ -757,10 +742,16 @@ lineage of enzyme variants (i.e. which variant came from which parent and what
 mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
 ensure the location you return are actually lineage location with variants and mutations.
 
+IMPORTANT SCORING CRITERIA:
+- Locations that explicitly mention "lineage" should be scored MUCH HIGHER (90-100)
+- Locations mentioning "evolutionary tree", "phylogenetic", "genealogy", or "ancestry" should also score high (85-95)
+- Locations that only mention "variants" without lineage context should score lower (60-80)
+- Generic tables of variants without parent-child relationships should score lowest (40-60)
+
 Respond with a JSON array of objects, each containing:
 - "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
 - "type": one of "table", "figure", "section"
-- "confidence": your confidence score (0-100) that this location contains lineage data
+- "confidence": your confidence score (0-100) that this location contains lineage data (PRIORITIZE "lineage" mentions!)
 - "reason": brief explanation of why this location likely contains lineage
 - "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
 - "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
@@ -777,17 +768,20 @@ CRITICAL INSTRUCTIONS:
 - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
 - If uncertain, use context clues from the text
 
-Order by confidence score (highest first). Tables showing complete variant lineages or
-mutation lists should be ranked higher than figures showing complete variant lineages.
-Sections are used when no suitable tables/figures exist.
+Order by confidence score (highest first), with special priority for:
+1. Tables/figures explicitly mentioning "lineage" or "evolutionary tree" (score 90-100)
+2. Tables showing complete parent-child relationships with mutations (score 80-95)
+3. Figures showing evolutionary/phylogenetic trees (score 75-90)
+4. Tables listing variants with parent information (score 70-85)
+5. Generic variant tables without clear lineage information (score 40-70)
 
 Don't include oligonucleotide results or result from only one round.
 
 Example output:
 [
-  {{"location": "Table S1.", "type": "table", "confidence": 95, "reason": "Variant lineage table", "source": "si", "caption": "Table S1. Summary of mutations introduced during directed evolution of PA-G8. The table shows all variants tested in each round of SSM with their corresponding mutations and activities..."{campaign_example}}},
-  {{"location": "Figure 2B", "type": "figure", "confidence": 70, "reason": "Phylogenetic tree diagram", "source": "manuscript", "caption": "Figure 2B Phylogenetic tree showing the evolutionary relationships between enzyme variants. Each node represents a variant with mutations indicated on branches..."{campaign_example}}},
-  {{"location": "Section 3.2", "type": "section", "confidence": 60, "reason": "Evolution description", "source": "manuscript", "caption": "Section 3.2 Directed Evolution Campaign. We performed eight rounds of site-saturation mutagenesis..."{campaign_example}}}
+  {{"location": "Table S1.", "type": "table", "confidence": 98, "reason": "Complete enzyme lineage table with parent-child relationships", "source": "si", "caption": "Table S1. Complete lineage of enzyme variants showing the evolutionary progression from wild-type through eight rounds of directed evolution. Each variant is listed with its parent and accumulated mutations..."{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 92, "reason": "Evolutionary tree explicitly showing lineage", "source": "manuscript", "caption": "Figure 2B Evolutionary lineage tree depicting the complete genealogy of engineered variants. Branches show parent-child relationships with mutations annotated..."{campaign_example}}},
+  {{"location": "Table 2", "type": "table", "confidence": 75, "reason": "Variant table with parent information", "source": "manuscript", "caption": "Table 2. Summary of enzyme variants generated in this study. Parent templates and mutations are indicated for each variant..."{campaign_example}}}
 ]
 """.strip()
 
@@ -919,6 +913,9 @@ def identify_evolution_locations(
     pdf_paths: Optional[List[Path]] = None,
 ) -> List[dict]:
     """Ask Gemini where in the paper the lineage is probably described."""
+    # Extract manuscript pages as images (in addition to text)
+    manuscript_images = []
+
     # Extract table of contents from PDFs if available
     toc_text = ""
     if pdf_paths:
@@ -949,6 +946,27 @@ def identify_evolution_locations(
 
         if toc_sections:
             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+        # Extract manuscript pages as images
+        if len(pdf_paths) >= 1:
+            manuscript_pdf = pdf_paths[0]
+            log.info(f"Extracting manuscript pages as images from: {manuscript_pdf.name}")
+
+            doc = _open_doc(manuscript_pdf)
+            try:
+                # Extract up to 10 pages as images
+                for page_num in range(min(10, len(doc))):
+                    page = doc[page_num]
+                    # Render page as image
+                    mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat)
+                    img_bytes = pix.tobytes("png")
+                    manuscript_images.append(img_bytes)
+                    log.debug(f"Extracted page {page_num + 1} as image ({len(img_bytes)} bytes)")
+            finally:
+                doc.close()
+
+            log.info(f"Extracted {len(manuscript_images)} manuscript pages as images")
 
     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
@@ -990,15 +1008,80 @@ def identify_evolution_locations(
         campaign_specific=campaign_specific,
         campaign_field=campaign_field,
         campaign_example=campaign_example
-    ) + "\n\nTEXT:\n" + combined_text
+    )
+
     locs: List[dict] = []
     try:
-        locs = generate_json_with_retry(
-            model,
-            prompt,
-            debug_dir=debug_dir,
-            tag="locate",
-        )
+        if manuscript_images:
+            # Use vision API with manuscript images and SI text
+            log.info("Using vision API with %d manuscript page images and SI text", len(manuscript_images))
+
+            # Convert images to PIL format for Gemini
+            import PIL.Image
+            import io
+
+            pil_images = []
+            for img_bytes in manuscript_images:
+                image = PIL.Image.open(io.BytesIO(img_bytes))
+                pil_images.append(image)
+
+            # Build multimodal prompt with caption text AND manuscript images
+            multimodal_prompt = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
+
+            # Add manuscript page images
+            multimodal_prompt.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
+            multimodal_prompt.extend(pil_images)
+
+            # Save debug info if requested
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+
+                # Save prompt
+                prompt_file = debug_path / f"locate_vision_prompt_{int(time.time())}.txt"
+                _dump(f"=== VISION PROMPT FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nManuscript pages: {len(manuscript_images)}\nText length: {len(combined_text)} chars\n{'='*80}\n\n{prompt}\n\nTEXT (Captions and sections):\n{combined_text[:2000]}...(truncated)\n\n[{len(manuscript_images)} manuscript page images]",
+                      prompt_file)
+
+                # Save manuscript page samples
+                for i, img_bytes in enumerate(manuscript_images[:3]):  # Save first 3 pages
+                    img_file = debug_path / f"locate_manuscript_page_{i+1}_{int(time.time())}.png"
+                    _dump(img_bytes, img_file)
+
+            # Generate content with vision
+            response = model.generate_content(multimodal_prompt)
+            raw = response.text
+
+            # Parse JSON from response
+            try:
+                # Save raw response if debug enabled
+                if debug_dir:
+                    response_file = Path(debug_dir) / f"locate_vision_response_{int(time.time())}.txt"
+                    _dump(f"=== VISION RESPONSE FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}", response_file)
+
+                # Try to parse JSON
+                try:
+                    locs = json.loads(raw)
+                except json.JSONDecodeError:
+                    # Try to extract JSON from response
+                    json_match = re.search(r'\[.*\]', raw, re.DOTALL)
+                    if json_match:
+                        locs = json.loads(json_match.group(0))
+                    else:
+                        log.warning("Could not parse JSON from vision response")
+                        locs = []
+            except Exception as e:
+                log.warning(f"Error parsing vision response: {e}")
+                locs = []
+
+        else:
+            # Fall back to text-only mode
+            prompt += "\n\nTEXT:\n" + combined_text
+            locs = generate_json_with_retry(
+                model,
+                prompt,
+                debug_dir=debug_dir,
+                tag="locate",
+            )
     except Exception as exc:  # pragma: no cover
         log.warning("identify_evolution_locations(): %s", exc)
 
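The vision path above reduces to handing generate_content a single list that mixes the locate prompt, the caption/SI text, and PIL images of the rendered manuscript pages, then parsing the JSON array in the reply. A minimal sketch, assuming google-generativeai and Pillow are installed and genai.configure(api_key=...) has already been called; the model name below is a placeholder, not necessarily the one the package configures:

    import io
    import json
    import google.generativeai as genai
    from PIL import Image

    def locate_with_vision(prompt: str, combined_text: str, page_pngs: list) -> list:
        """Send the locate prompt plus rendered manuscript pages to Gemini and parse the JSON it returns."""
        model = genai.GenerativeModel("gemini-1.5-flash")  # placeholder model name
        parts = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
        parts.extend(Image.open(io.BytesIO(png)) for png in page_pngs)  # images become extra prompt parts
        response = model.generate_content(parts)
        try:
            return json.loads(response.text)
        except json.JSONDecodeError:
            return []  # caller falls back to the text-only path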
@@ -1299,7 +1382,7 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
 
     return False
 
-def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 5000, validate_sequences: bool = False) -> str:
+def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 50000, validate_sequences: bool = False) -> str:
     """Extract text around identified locations."""
     if not locations:
         return text
@@ -1788,50 +1871,55 @@ def get_lineage(
         for loc in locations:
             log.info(f" - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
 
-        # Try to extract from the best location
+        # Sort locations by confidence and use the highest confidence one
+        locations_sorted = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+        log.info(f"Using highest confidence location: {locations_sorted[0]['location']} (confidence: {locations_sorted[0]['confidence']})")
+
+        # Use the highest confidence location as primary location
+        primary_location = locations_sorted[0]
+
+        # Extract location details
+        location_str = primary_location.get('location', '')
+        location_type = primary_location.get('type', '')
+        confidence = primary_location.get('confidence', 0)
+        caption_text = primary_location.get('caption', '')
+
+        # Initialize extracted variants list
         extracted_variants = []
-        for location in locations:
-            if extracted_variants:
-                break  # Already got variants
-
-            location_str = location.get('location', '')
-            location_type = location.get('type', '')
-            confidence = location.get('confidence', 0)
+
+        # Try figure extraction for high-confidence figures
+        if location_type == 'figure' and confidence >= 70 and pdf_paths:
+            log.info(f"Attempting to extract figure: {location_str}")
 
-            # Try figure extraction for high-confidence figures
-            if location_type == 'figure' and confidence >= 70 and pdf_paths:
-                log.info(f"Attempting to extract figure: {location_str}")
-
-                figure_bytes = None
-                for pdf_path in pdf_paths:
-                    figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
-                    if figure_bytes:
-                        log.info(f"Successfully extracted figure from {pdf_path.name}")
-                        break
-
+            figure_bytes = None
+            for pdf_path in pdf_paths:
+                figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text=caption_text)
                 if figure_bytes:
-                    # Save figure if debug enabled
-                    if debug_dir:
-                        debug_path = Path(debug_dir)
-                        debug_path.mkdir(parents=True, exist_ok=True)
-                        figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
-                        _dump(figure_bytes, figure_file)
-                        log.info(f"Saved figure to: {figure_file}")
-
-                    # Extract lineage from figure
-                    variants = extract_lineage_from_figure(
-                        figure_bytes, model,
-                        debug_dir=debug_dir,
-                        campaign_id=campaign.campaign_id,
-                        campaign_info=campaign
-                    )
-                    if variants:
-                        log.info(f"Extracted {len(variants)} variants from figure")
-                        extracted_variants = variants
-                        continue
+                    log.info(f"Successfully extracted figure from {pdf_path.name}")
+                    break
 
-            # Try table/text extraction
-            if location_type in ['table', 'text', 'section'] and not extracted_variants:
+            if figure_bytes:
+                # Save figure if debug enabled
+                if debug_dir:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                    _dump(figure_bytes, figure_file)
+                    log.info(f"Saved figure to: {figure_file}")
+
+                # Extract lineage from figure
+                variants = extract_lineage_from_figure(
+                    figure_bytes, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign
+                )
+                if variants:
+                    log.info(f"Extracted {len(variants)} variants from figure")
+                    extracted_variants = variants
+
+        # Try table/text extraction if no figure extraction or if not a figure
+        if not extracted_variants and location_type in ['table', 'text', 'section']:
             log.info(f"Attempting text extraction for {location_type}: {location_str}")
 
             # Determine which text to use based on source
@@ -2074,8 +2162,9 @@ def get_lineage(
 
         # Try to extract the figure from available PDFs
         figure_bytes = None
+        # Note: This fallback path doesn't have the caption text
         for pdf_path in pdf_paths:
-            figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+            figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text="")
             if figure_bytes:
                 log.info("Successfully extracted figure from %s", pdf_path.name)
                 break
@@ -2114,7 +2203,7 @@ def get_lineage(
         # Use text-based extraction (works for tables and text sections)
         # Extract from full text, not caption text - use only primary location
         # Use more context for tables since they often span multiple pages
-        context_size = 15000 if location_type == 'table' else 5000
+        context_size = 75000 if location_type == 'table' else 50000
         focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
         log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                  len(full_text), len(focused_text),
@@ -2377,7 +2466,7 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
             # Fallback to text search if page extraction didn't work
             if not sample_text:
                 sample_text = _extract_text_at_locations(
-                    text, [location], context_chars=2000, validate_sequences=False
+                    text, [location], context_chars=20000, validate_sequences=False
                 )
 
             samples.append({
@@ -2419,29 +2508,25 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
 
 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract ALL enzyme variant sequences from the text.
-
-Rules:
-1. Use EXACT variant IDs as they appear with each sequence
-2. Copy sequences EXACTLY - preserve all amino acids/nucleotides including repeats
-3. For each variant:
-   - If amino acid sequence exists: set aa_seq to the sequence, set dna_seq to null
-   - If ONLY DNA sequence exists: set dna_seq to the sequence, set aa_seq to null
-   - NEVER include both aa_seq and dna_seq for the same variant
-   - IMPORTANT: Always prefer amino acid sequences over DNA sequences when both are available
-4. Return ONLY minified JSON, no markdown or commentary
-
-CRITICAL SEQUENCE PRIORITY RULE:
-- If you find BOTH amino acid sequence AND DNA sequence for the same variant, ONLY return the amino acid sequence
-- Set dna_seq to null when aa_seq is available, even if DNA sequence is present in the text
-- Only return dna_seq when NO amino acid sequence exists for that variant
-
-CRITICAL ACCURACY REQUIREMENTS:
-- Extract ONLY sequences that are explicitly present in the provided text
-- DO NOT generate, infer, or hallucinate any sequences
-- Every character in the sequence must be directly copied from the text
-- If a sequence appears truncated or incomplete in the text, extract only what is shown
-- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
+Extract ALL enzyme variant sequences from the text. Copy sequences EXACTLY as they appear - character by character.
+
+KEY RULES:
+1. EXHAUSTIVE SEARCH: If a variant appears multiple times, check ALL occurrences and extract the LONGEST sequence
+2. MULTI-PAGE: Sequences span pages. Skip page numbers (66, 67, etc.) that interrupt sequences
+3. MERGE IF NEEDED: If sequence continues after page break, combine the parts
+4. NO MODIFICATIONS: Copy exactly - no edits or improvements
+
+IMPORTANT: The same variant may appear multiple times with different sequence lengths. Always use the longest one.
+
+SEQUENCE PRIORITY:
+- If BOTH amino acid AND DNA exist → use amino acid ONLY
+- For DNA: If mixed case, extract UPPERCASE only (lowercase=backbone)
+- Return minified JSON only
+
+ACCURACY:
+- Extract ONLY what's written
+- Never hallucinate
+- Check entire document - complete sequences often appear later
 
 Schema: {schema}
 
@@ -2535,9 +2620,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts = 3  # Reduced from 6 to 3 for performance
+    max_attempts = 5  # 5 attempts for better consensus
 
-    # Try 3 times with early match detection
+    # Try 5 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2652,28 +2737,39 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         if isinstance(resp, list):
             for seq in resp:
                 if isinstance(seq, dict) and "variant_id" in seq:
-                    # Create a key for this sequence (variant_id + cleaned aa_seq)
+                    # Create a key for this sequence (variant_id + cleaned sequence)
                     variant_id = seq.get("variant_id", "")
                     aa_seq = seq.get("aa_seq", "")
+                    dna_seq = seq.get("dna_seq", "")
+
+                    # Clean sequences for comparison
                     if aa_seq:
                         aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
-                        key = f"{variant_id}|{aa_seq}"
+                    if dna_seq:
+                        dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+
+                    # Use whichever sequence is present for the key
+                    seq_for_key = aa_seq if aa_seq else (dna_seq if dna_seq else "")
+                    key = f"{variant_id}|{seq_for_key}"
 
                     if key not in sequence_counts:
                         sequence_counts[key] = {"count": 0, "data": seq}
                     sequence_counts[key]["count"] += 1
 
-    # Build result with sequences that appear in at least 3 attempts
+    # Build result with sequences that appear in at least 2 attempts
+    # Sort by count (descending) to prioritize sequences with higher consensus
     result = []
-    for key, info in sequence_counts.items():
-        if info["count"] >= 3:  # Appears in at least 3/6 attempts
+    sorted_sequences = sorted(sequence_counts.items(), key=lambda x: x[1]["count"], reverse=True)
+
+    for key, info in sorted_sequences:
+        if info["count"] >= 2:  # Appears in at least 2/5 attempts
             seq_data = info["data"].copy()
             seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
             result.append(seq_data)
             log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
 
     if result:
-        log.info(f"Extracted {len(result)} sequences with at least 3/{max_attempts} consensus")
+        log.info(f"Extracted {len(result)} sequences with at least 2/{max_attempts} consensus")
         return result
 
     # If no sequences appear twice, return the most complete attempt
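The consensus change above is a plain majority-style vote across extraction attempts: key each result by variant ID plus cleaned sequence, count how many attempts produced that exact key, and keep anything seen at least twice, highest count first. A minimal sketch of that tally (a hypothetical helper, not the package's function):

    from collections import Counter

    def consensus_sequences(attempts, min_votes=2):
        """Keep sequence records that at least min_votes attempts agree on, highest count first."""
        counts = Counter()
        first_seen = {}
        for attempt in attempts:  # attempts: list of lists of sequence dicts
            for seq in attempt:
                cleaned = (seq.get("aa_seq") or seq.get("dna_seq") or "").replace(" ", "").upper()
                key = f"{seq.get('variant_id', '')}|{cleaned}"
                counts[key] += 1
                first_seen.setdefault(key, seq)
        return [first_seen[key] for key, n in counts.most_common() if n >= min_votes]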
@@ -2769,11 +2865,30 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         if aa and len(aa) <= 50:
             log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
             aa = None
-        if dna and len(dna) <= 150:
-            log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
-            dna = None
+
+        # Validate DNA sequences
+        if dna:
+            if len(dna) <= 150:
+                log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+                dna = None
+            # Check if DNA sequence length is divisible by 3
+            elif len(dna) % 3 != 0:
+                log.warning(f"Skipping DNA sequence for {vid}: length {len(dna)} not divisible by 3")
+                dna = None
+            else:
+                # Check for stop codons in the middle of the sequence
+                stop_codons = {'TAA', 'TAG', 'TGA'}
+                has_internal_stop = False
+                for i in range(0, len(dna) - 3, 3):
+                    codon = dna[i:i+3]
+                    if codon in stop_codons:
+                        log.warning(f"Skipping DNA sequence for {vid}: internal stop codon {codon} at position {i}")
+                        has_internal_stop = True
+                        break
+                if has_internal_stop:
+                    dna = None
 
-        # Skip if both sequences are too short or missing
+        # Skip if both sequences are invalid or missing
        if not aa and not dna:
            continue
 
@@ -3015,7 +3130,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
             log.info("Page extraction did not return text, falling back to text search")
             focused_text = _extract_text_at_locations(
                 text, [best_location],
-                context_chars=max(min_length, 30000),
+                context_chars=max(min_length, 50000),
                 validate_sequences=True
             )