debase 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +278 -163
- debase/lineage_format.py +11 -5
- debase/reaction_info_extractor.py +209 -36
- debase/substrate_scope_extractor.py +157 -56
- {debase-0.6.2.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
- debase-0.7.0.dist-info/RECORD +18 -0
- debase-0.6.2.dist-info/RECORD +0 -18
- {debase-0.6.2.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
- {debase-0.6.2.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
- {debase-0.6.2.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.2.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0
debase/_version.py
CHANGED
@@ -336,7 +336,7 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
|
|
336
336
|
return "\n".join(chunks)
|
337
337
|
|
338
338
|
|
339
|
-
def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None) -> Optional[bytes]:
|
339
|
+
def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None, caption_text: str = "") -> Optional[bytes]:
|
340
340
|
"""Extract a specific figure from a PDF by finding its caption.
|
341
341
|
|
342
342
|
Returns the figure as PNG bytes if found, None otherwise.
|
@@ -345,64 +345,49 @@ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Option
|
|
345
345
|
figure_bytes = None
|
346
346
|
|
347
347
|
try:
|
348
|
-
#
|
349
|
-
|
348
|
+
# Use caption text if provided, otherwise use figure_id
|
349
|
+
if caption_text:
|
350
|
+
# Use first 50 chars of caption for searching (enough to be unique)
|
351
|
+
search_text = caption_text[:50].strip()
|
352
|
+
log.info(f"Searching for figure using caption: '{search_text}...'")
|
353
|
+
else:
|
354
|
+
search_text = figure_id.strip()
|
355
|
+
log.info(f"Searching for figure using ID: '{search_text}'")
|
350
356
|
|
351
357
|
for page_num, page in enumerate(doc):
|
352
|
-
|
353
|
-
text_instances = page.search_for(search_text)
|
358
|
+
page_text = page.get_text()
|
354
359
|
|
355
|
-
if
|
356
|
-
|
360
|
+
# Check if caption text appears on this page
|
361
|
+
if search_text in page_text:
|
362
|
+
log.info(f"Found caption on page {page_num + 1}")
|
357
363
|
|
358
|
-
#
|
359
|
-
|
364
|
+
# Search for the exact text position
|
365
|
+
text_instances = page.search_for(search_text)
|
360
366
|
|
361
|
-
|
362
|
-
|
367
|
+
if text_instances:
|
368
|
+
# Get the position of the caption
|
369
|
+
caption_rect = text_instances[0]
|
363
370
|
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
if best_img is not None:
|
385
|
-
# Extract the identified image
|
386
|
-
pix = fitz.Pixmap(doc, best_img)
|
387
|
-
|
388
|
-
if pix.n - pix.alpha < 4: # GRAY or RGB
|
389
|
-
figure_bytes = pix.tobytes("png")
|
390
|
-
else: # Convert CMYK to RGB
|
391
|
-
pix2 = fitz.Pixmap(fitz.csRGB, pix)
|
392
|
-
figure_bytes = pix2.tobytes("png")
|
393
|
-
pix2 = None
|
394
|
-
pix = None
|
395
|
-
|
396
|
-
# Save to debug directory if provided
|
397
|
-
if debug_dir and figure_bytes:
|
398
|
-
debug_path = Path(debug_dir)
|
399
|
-
debug_path.mkdir(parents=True, exist_ok=True)
|
400
|
-
fig_file = debug_path / f"figure_{figure_id.replace(' ', '_').replace('.', '')}_{int(time.time())}.png"
|
401
|
-
with open(fig_file, 'wb') as f:
|
402
|
-
f.write(figure_bytes)
|
403
|
-
log.info(f"Saved figure to: {fig_file}")
|
404
|
-
|
405
|
-
break
|
371
|
+
# Instead of trying to extract individual images,
|
372
|
+
# extract the ENTIRE PAGE as an image
|
373
|
+
# This ensures we get the complete figure with all panels
|
374
|
+
log.info(f"Extracting entire page {page_num + 1} containing figure {figure_id}")
|
375
|
+
|
376
|
+
# Use high resolution for clarity
|
377
|
+
mat = fitz.Matrix(3.0, 3.0) # 3x zoom
|
378
|
+
pix = page.get_pixmap(matrix=mat)
|
379
|
+
figure_bytes = pix.tobytes("png")
|
380
|
+
|
381
|
+
# Save the extracted figure if debug is enabled
|
382
|
+
if debug_dir and figure_bytes:
|
383
|
+
debug_path = Path(debug_dir)
|
384
|
+
debug_path.mkdir(parents=True, exist_ok=True)
|
385
|
+
figure_file = debug_path / f"figure_{figure_id.replace(' ', '_')}_{int(time.time())}.png"
|
386
|
+
with open(figure_file, 'wb') as f:
|
387
|
+
f.write(figure_bytes)
|
388
|
+
log.info(f"Saved figure to: {figure_file}")
|
389
|
+
|
390
|
+
break # Found the figure, no need to continue
|
406
391
|
|
407
392
|
finally:
|
408
393
|
doc.close()
|
@@ -685,39 +670,39 @@ from typing import List, Dict, Any
|
|
685
670
|
# ---- 6.0 Campaign identification prompts -----------------------------------
|
686
671
|
|
687
672
|
_CAMPAIGN_IDENTIFICATION_PROMPT = """
|
688
|
-
|
689
|
-
Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.
|
690
|
-
|
691
|
-
Each campaign represents a separate evolutionary lineage targeting different:
|
692
|
-
- Model reactions (e.g., different chemical transformations)
|
693
|
-
- Substrate scopes
|
694
|
-
- Activities (e.g., different enzymatic reactions)
|
673
|
+
Identify directed evolution LINEAGE campaigns in this manuscript.
|
695
674
|
|
675
|
+
A campaign is a multi-round directed evolution effort that creates a FAMILY of variants through iterative cycles.
|
696
676
|
Look for:
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
677
|
+
- Multiple rounds/generations of evolution (e.g., "8 rounds of evolution", "5 generations")
|
678
|
+
- Lineage trees or variant families (e.g., "L1→L2→L3→L4", "WT→M1→M2→M3")
|
679
|
+
- Progressive improvement through iterations
|
680
|
+
- Parent-child relationships across multiple variants
|
681
|
+
|
682
|
+
Do NOT include:
|
683
|
+
- Single-point mutation studies or individual variant characterization
|
684
|
+
- Simple site-saturation mutagenesis at one position
|
685
|
+
|
686
|
+
IMPORTANT: Include previously evolved lineages IF they are the main focus of THIS paper (e.g., characterizing a previously evolved enzyme lineage with new substrates/conditions)
|
687
|
+
|
688
|
+
Key phrases: "rounds of directed evolution", "iterative evolution", "evolutionary lineage", "variant lineage", "generations of evolution"
|
701
689
|
|
702
690
|
Return a JSON array of campaigns:
|
703
691
|
[
|
704
692
|
{{
|
705
693
|
"campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
|
706
694
|
"campaign_name": "descriptive name",
|
707
|
-
"description": "what
|
695
|
+
"description": "what THIS STUDY evolved for",
|
708
696
|
"model_substrate": "substrate name/id",
|
709
697
|
"model_product": "product name/id",
|
710
698
|
"substrate_id": "id from paper (e.g., 1a)",
|
711
699
|
"product_id": "id from paper (e.g., 2a)",
|
712
700
|
"data_locations": ["Table S1", "Figure 1"],
|
713
701
|
"lineage_hint": "enzyme name pattern",
|
714
|
-
"notes": "
|
702
|
+
"notes": "evidence this was evolved in THIS study"
|
715
703
|
}}
|
716
704
|
]
|
717
705
|
|
718
|
-
IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
|
719
|
-
Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
|
720
|
-
|
721
706
|
TEXT:
|
722
707
|
{text}
|
723
708
|
""".strip()
|
@@ -757,10 +742,16 @@ lineage of enzyme variants (i.e. which variant came from which parent and what
|
|
757
742
|
mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
|
758
743
|
ensure the location you return are actually lineage location with variants and mutations.
|
759
744
|
|
745
|
+
IMPORTANT SCORING CRITERIA:
|
746
|
+
- Locations that explicitly mention "lineage" should be scored MUCH HIGHER (90-100)
|
747
|
+
- Locations mentioning "evolutionary tree", "phylogenetic", "genealogy", or "ancestry" should also score high (85-95)
|
748
|
+
- Locations that only mention "variants" without lineage context should score lower (60-80)
|
749
|
+
- Generic tables of variants without parent-child relationships should score lowest (40-60)
|
750
|
+
|
760
751
|
Respond with a JSON array of objects, each containing:
|
761
752
|
- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
|
762
753
|
- "type": one of "table", "figure", "section"
|
763
|
-
- "confidence": your confidence score (0-100) that this location contains lineage data
|
754
|
+
- "confidence": your confidence score (0-100) that this location contains lineage data (PRIORITIZE "lineage" mentions!)
|
764
755
|
- "reason": brief explanation of why this location likely contains lineage
|
765
756
|
- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
|
766
757
|
- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
|
@@ -777,17 +768,20 @@ CRITICAL INSTRUCTIONS:
|
|
777
768
|
- Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
|
778
769
|
- If uncertain, use context clues from the text
|
779
770
|
|
780
|
-
Order by confidence score (highest first)
|
781
|
-
|
782
|
-
|
771
|
+
Order by confidence score (highest first), with special priority for:
|
772
|
+
1. Tables/figures explicitly mentioning "lineage" or "evolutionary tree" (score 90-100)
|
773
|
+
2. Tables showing complete parent-child relationships with mutations (score 80-95)
|
774
|
+
3. Figures showing evolutionary/phylogenetic trees (score 75-90)
|
775
|
+
4. Tables listing variants with parent information (score 70-85)
|
776
|
+
5. Generic variant tables without clear lineage information (score 40-70)
|
783
777
|
|
784
778
|
Don't include oligonucleotide results or result from only one round.
|
785
779
|
|
786
780
|
Example output:
|
787
781
|
[
|
788
|
-
{{"location": "Table S1.", "type": "table", "confidence":
|
789
|
-
{{"location": "Figure 2B", "type": "figure", "confidence":
|
790
|
-
{{"location": "
|
782
|
+
{{"location": "Table S1.", "type": "table", "confidence": 98, "reason": "Complete enzyme lineage table with parent-child relationships", "source": "si", "caption": "Table S1. Complete lineage of enzyme variants showing the evolutionary progression from wild-type through eight rounds of directed evolution. Each variant is listed with its parent and accumulated mutations..."{campaign_example}}},
|
783
|
+
{{"location": "Figure 2B", "type": "figure", "confidence": 92, "reason": "Evolutionary tree explicitly showing lineage", "source": "manuscript", "caption": "Figure 2B Evolutionary lineage tree depicting the complete genealogy of engineered variants. Branches show parent-child relationships with mutations annotated..."{campaign_example}}},
|
784
|
+
{{"location": "Table 2", "type": "table", "confidence": 75, "reason": "Variant table with parent information", "source": "manuscript", "caption": "Table 2. Summary of enzyme variants generated in this study. Parent templates and mutations are indicated for each variant..."{campaign_example}}}
|
791
785
|
]
|
792
786
|
""".strip()
|
793
787
|
|
@@ -919,6 +913,9 @@ def identify_evolution_locations(
|
|
919
913
|
pdf_paths: Optional[List[Path]] = None,
|
920
914
|
) -> List[dict]:
|
921
915
|
"""Ask Gemini where in the paper the lineage is probably described."""
|
916
|
+
# Extract manuscript pages as images (in addition to text)
|
917
|
+
manuscript_images = []
|
918
|
+
|
922
919
|
# Extract table of contents from PDFs if available
|
923
920
|
toc_text = ""
|
924
921
|
if pdf_paths:
|
@@ -949,6 +946,27 @@ def identify_evolution_locations(
|
|
949
946
|
|
950
947
|
if toc_sections:
|
951
948
|
toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
|
949
|
+
|
950
|
+
# Extract manuscript pages as images
|
951
|
+
if len(pdf_paths) >= 1:
|
952
|
+
manuscript_pdf = pdf_paths[0]
|
953
|
+
log.info(f"Extracting manuscript pages as images from: {manuscript_pdf.name}")
|
954
|
+
|
955
|
+
doc = _open_doc(manuscript_pdf)
|
956
|
+
try:
|
957
|
+
# Extract up to 10 pages as images
|
958
|
+
for page_num in range(min(10, len(doc))):
|
959
|
+
page = doc[page_num]
|
960
|
+
# Render page as image
|
961
|
+
mat = fitz.Matrix(2, 2) # 2x zoom for better quality
|
962
|
+
pix = page.get_pixmap(matrix=mat)
|
963
|
+
img_bytes = pix.tobytes("png")
|
964
|
+
manuscript_images.append(img_bytes)
|
965
|
+
log.debug(f"Extracted page {page_num + 1} as image ({len(img_bytes)} bytes)")
|
966
|
+
finally:
|
967
|
+
doc.close()
|
968
|
+
|
969
|
+
log.info(f"Extracted {len(manuscript_images)} manuscript pages as images")
|
952
970
|
|
953
971
|
# Include TOC before the main text
|
954
972
|
combined_text = toc_text + text if toc_text else text
|
@@ -990,15 +1008,80 @@ def identify_evolution_locations(
|
|
990
1008
|
campaign_specific=campaign_specific,
|
991
1009
|
campaign_field=campaign_field,
|
992
1010
|
campaign_example=campaign_example
|
993
|
-
)
|
1011
|
+
)
|
1012
|
+
|
994
1013
|
locs: List[dict] = []
|
995
1014
|
try:
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1015
|
+
if manuscript_images:
|
1016
|
+
# Use vision API with manuscript images and SI text
|
1017
|
+
log.info("Using vision API with %d manuscript page images and SI text", len(manuscript_images))
|
1018
|
+
|
1019
|
+
# Convert images to PIL format for Gemini
|
1020
|
+
import PIL.Image
|
1021
|
+
import io
|
1022
|
+
|
1023
|
+
pil_images = []
|
1024
|
+
for img_bytes in manuscript_images:
|
1025
|
+
image = PIL.Image.open(io.BytesIO(img_bytes))
|
1026
|
+
pil_images.append(image)
|
1027
|
+
|
1028
|
+
# Build multimodal prompt with caption text AND manuscript images
|
1029
|
+
multimodal_prompt = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
|
1030
|
+
|
1031
|
+
# Add manuscript page images
|
1032
|
+
multimodal_prompt.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
|
1033
|
+
multimodal_prompt.extend(pil_images)
|
1034
|
+
|
1035
|
+
# Save debug info if requested
|
1036
|
+
if debug_dir:
|
1037
|
+
debug_path = Path(debug_dir)
|
1038
|
+
debug_path.mkdir(parents=True, exist_ok=True)
|
1039
|
+
|
1040
|
+
# Save prompt
|
1041
|
+
prompt_file = debug_path / f"locate_vision_prompt_{int(time.time())}.txt"
|
1042
|
+
_dump(f"=== VISION PROMPT FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nManuscript pages: {len(manuscript_images)}\nText length: {len(combined_text)} chars\n{'='*80}\n\n{prompt}\n\nTEXT (Captions and sections):\n{combined_text[:2000]}...(truncated)\n\n[{len(manuscript_images)} manuscript page images]",
|
1043
|
+
prompt_file)
|
1044
|
+
|
1045
|
+
# Save manuscript page samples
|
1046
|
+
for i, img_bytes in enumerate(manuscript_images[:3]): # Save first 3 pages
|
1047
|
+
img_file = debug_path / f"locate_manuscript_page_{i+1}_{int(time.time())}.png"
|
1048
|
+
_dump(img_bytes, img_file)
|
1049
|
+
|
1050
|
+
# Generate content with vision
|
1051
|
+
response = model.generate_content(multimodal_prompt)
|
1052
|
+
raw = response.text
|
1053
|
+
|
1054
|
+
# Parse JSON from response
|
1055
|
+
try:
|
1056
|
+
# Save raw response if debug enabled
|
1057
|
+
if debug_dir:
|
1058
|
+
response_file = Path(debug_dir) / f"locate_vision_response_{int(time.time())}.txt"
|
1059
|
+
_dump(f"=== VISION RESPONSE FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}", response_file)
|
1060
|
+
|
1061
|
+
# Try to parse JSON
|
1062
|
+
try:
|
1063
|
+
locs = json.loads(raw)
|
1064
|
+
except json.JSONDecodeError:
|
1065
|
+
# Try to extract JSON from response
|
1066
|
+
json_match = re.search(r'\[.*\]', raw, re.DOTALL)
|
1067
|
+
if json_match:
|
1068
|
+
locs = json.loads(json_match.group(0))
|
1069
|
+
else:
|
1070
|
+
log.warning("Could not parse JSON from vision response")
|
1071
|
+
locs = []
|
1072
|
+
except Exception as e:
|
1073
|
+
log.warning(f"Error parsing vision response: {e}")
|
1074
|
+
locs = []
|
1075
|
+
|
1076
|
+
else:
|
1077
|
+
# Fall back to text-only mode
|
1078
|
+
prompt += "\n\nTEXT:\n" + combined_text
|
1079
|
+
locs = generate_json_with_retry(
|
1080
|
+
model,
|
1081
|
+
prompt,
|
1082
|
+
debug_dir=debug_dir,
|
1083
|
+
tag="locate",
|
1084
|
+
)
|
1002
1085
|
except Exception as exc: # pragma: no cover
|
1003
1086
|
log.warning("identify_evolution_locations(): %s", exc)
|
1004
1087
|
|
@@ -1299,7 +1382,7 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:
|
|
1299
1382
|
|
1300
1383
|
return False
|
1301
1384
|
|
1302
|
-
def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int =
|
1385
|
+
def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 50000, validate_sequences: bool = False) -> str:
|
1303
1386
|
"""Extract text around identified locations."""
|
1304
1387
|
if not locations:
|
1305
1388
|
return text
|
@@ -1788,50 +1871,55 @@ def get_lineage(
|
|
1788
1871
|
for loc in locations:
|
1789
1872
|
log.info(f" - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")
|
1790
1873
|
|
1791
|
-
#
|
1874
|
+
# Sort locations by confidence and use the highest confidence one
|
1875
|
+
locations_sorted = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
|
1876
|
+
log.info(f"Using highest confidence location: {locations_sorted[0]['location']} (confidence: {locations_sorted[0]['confidence']})")
|
1877
|
+
|
1878
|
+
# Use the highest confidence location as primary location
|
1879
|
+
primary_location = locations_sorted[0]
|
1880
|
+
|
1881
|
+
# Extract location details
|
1882
|
+
location_str = primary_location.get('location', '')
|
1883
|
+
location_type = primary_location.get('type', '')
|
1884
|
+
confidence = primary_location.get('confidence', 0)
|
1885
|
+
caption_text = primary_location.get('caption', '')
|
1886
|
+
|
1887
|
+
# Initialize extracted variants list
|
1792
1888
|
extracted_variants = []
|
1793
|
-
|
1794
|
-
|
1795
|
-
|
1796
|
-
|
1797
|
-
location_str = location.get('location', '')
|
1798
|
-
location_type = location.get('type', '')
|
1799
|
-
confidence = location.get('confidence', 0)
|
1889
|
+
|
1890
|
+
# Try figure extraction for high-confidence figures
|
1891
|
+
if location_type == 'figure' and confidence >= 70 and pdf_paths:
|
1892
|
+
log.info(f"Attempting to extract figure: {location_str}")
|
1800
1893
|
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
1805
|
-
figure_bytes = None
|
1806
|
-
for pdf_path in pdf_paths:
|
1807
|
-
figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
|
1808
|
-
if figure_bytes:
|
1809
|
-
log.info(f"Successfully extracted figure from {pdf_path.name}")
|
1810
|
-
break
|
1811
|
-
|
1894
|
+
figure_bytes = None
|
1895
|
+
for pdf_path in pdf_paths:
|
1896
|
+
figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text=caption_text)
|
1812
1897
|
if figure_bytes:
|
1813
|
-
|
1814
|
-
|
1815
|
-
debug_path = Path(debug_dir)
|
1816
|
-
debug_path.mkdir(parents=True, exist_ok=True)
|
1817
|
-
figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
|
1818
|
-
_dump(figure_bytes, figure_file)
|
1819
|
-
log.info(f"Saved figure to: {figure_file}")
|
1820
|
-
|
1821
|
-
# Extract lineage from figure
|
1822
|
-
variants = extract_lineage_from_figure(
|
1823
|
-
figure_bytes, model,
|
1824
|
-
debug_dir=debug_dir,
|
1825
|
-
campaign_id=campaign.campaign_id,
|
1826
|
-
campaign_info=campaign
|
1827
|
-
)
|
1828
|
-
if variants:
|
1829
|
-
log.info(f"Extracted {len(variants)} variants from figure")
|
1830
|
-
extracted_variants = variants
|
1831
|
-
continue
|
1898
|
+
log.info(f"Successfully extracted figure from {pdf_path.name}")
|
1899
|
+
break
|
1832
1900
|
|
1833
|
-
|
1834
|
-
|
1901
|
+
if figure_bytes:
|
1902
|
+
# Save figure if debug enabled
|
1903
|
+
if debug_dir:
|
1904
|
+
debug_path = Path(debug_dir)
|
1905
|
+
debug_path.mkdir(parents=True, exist_ok=True)
|
1906
|
+
figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
|
1907
|
+
_dump(figure_bytes, figure_file)
|
1908
|
+
log.info(f"Saved figure to: {figure_file}")
|
1909
|
+
|
1910
|
+
# Extract lineage from figure
|
1911
|
+
variants = extract_lineage_from_figure(
|
1912
|
+
figure_bytes, model,
|
1913
|
+
debug_dir=debug_dir,
|
1914
|
+
campaign_id=campaign.campaign_id,
|
1915
|
+
campaign_info=campaign
|
1916
|
+
)
|
1917
|
+
if variants:
|
1918
|
+
log.info(f"Extracted {len(variants)} variants from figure")
|
1919
|
+
extracted_variants = variants
|
1920
|
+
|
1921
|
+
# Try table/text extraction if no figure extraction or if not a figure
|
1922
|
+
if not extracted_variants and location_type in ['table', 'text', 'section']:
|
1835
1923
|
log.info(f"Attempting text extraction for {location_type}: {location_str}")
|
1836
1924
|
|
1837
1925
|
# Determine which text to use based on source
|
@@ -2074,8 +2162,9 @@ def get_lineage(
|
|
2074
2162
|
|
2075
2163
|
# Try to extract the figure from available PDFs
|
2076
2164
|
figure_bytes = None
|
2165
|
+
# Note: This fallback path doesn't have the caption text
|
2077
2166
|
for pdf_path in pdf_paths:
|
2078
|
-
figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
|
2167
|
+
figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text="")
|
2079
2168
|
if figure_bytes:
|
2080
2169
|
log.info("Successfully extracted figure from %s", pdf_path.name)
|
2081
2170
|
break
|
@@ -2114,7 +2203,7 @@ def get_lineage(
|
|
2114
2203
|
# Use text-based extraction (works for tables and text sections)
|
2115
2204
|
# Extract from full text, not caption text - use only primary location
|
2116
2205
|
# Use more context for tables since they often span multiple pages
|
2117
|
-
context_size =
|
2206
|
+
context_size = 75000 if location_type == 'table' else 50000
|
2118
2207
|
focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
|
2119
2208
|
log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
|
2120
2209
|
len(full_text), len(focused_text),
|
@@ -2377,7 +2466,7 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
|
|
2377
2466
|
# Fallback to text search if page extraction didn't work
|
2378
2467
|
if not sample_text:
|
2379
2468
|
sample_text = _extract_text_at_locations(
|
2380
|
-
text, [location], context_chars=
|
2469
|
+
text, [location], context_chars=20000, validate_sequences=False
|
2381
2470
|
)
|
2382
2471
|
|
2383
2472
|
samples.append({
|
@@ -2419,29 +2508,25 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
|
|
2419
2508
|
|
2420
2509
|
# --- 7.3 Main extraction prompt ---------------------------------------------
|
2421
2510
|
_SEQ_EXTRACTION_PROMPT = """
|
2422
|
-
Extract ALL enzyme variant sequences from the text.
|
2423
|
-
|
2424
|
-
|
2425
|
-
1.
|
2426
|
-
2.
|
2427
|
-
3.
|
2428
|
-
|
2429
|
-
|
2430
|
-
|
2431
|
-
|
2432
|
-
|
2433
|
-
|
2434
|
-
|
2435
|
-
-
|
2436
|
-
|
2437
|
-
|
2438
|
-
|
2439
|
-
|
2440
|
-
-
|
2441
|
-
- DO NOT generate, infer, or hallucinate any sequences
|
2442
|
-
- Every character in the sequence must be directly copied from the text
|
2443
|
-
- If a sequence appears truncated or incomplete in the text, extract only what is shown
|
2444
|
-
- Be extremely careful and accurate - sequence accuracy is critical for scientific validity
|
2511
|
+
Extract ALL enzyme variant sequences from the text. Copy sequences EXACTLY as they appear - character by character.
|
2512
|
+
|
2513
|
+
KEY RULES:
|
2514
|
+
1. EXHAUSTIVE SEARCH: If a variant appears multiple times, check ALL occurrences and extract the LONGEST sequence
|
2515
|
+
2. MULTI-PAGE: Sequences span pages. Skip page numbers (66, 67, etc.) that interrupt sequences
|
2516
|
+
3. MERGE IF NEEDED: If sequence continues after page break, combine the parts
|
2517
|
+
4. NO MODIFICATIONS: Copy exactly - no edits or improvements
|
2518
|
+
|
2519
|
+
IMPORTANT: The same variant may appear multiple times with different sequence lengths. Always use the longest one.
|
2520
|
+
|
2521
|
+
SEQUENCE PRIORITY:
|
2522
|
+
- If BOTH amino acid AND DNA exist → use amino acid ONLY
|
2523
|
+
- For DNA: If mixed case, extract UPPERCASE only (lowercase=backbone)
|
2524
|
+
- Return minified JSON only
|
2525
|
+
|
2526
|
+
ACCURACY:
|
2527
|
+
- Extract ONLY what's written
|
2528
|
+
- Never hallucinate
|
2529
|
+
- Check entire document - complete sequences often appear later
|
2445
2530
|
|
2446
2531
|
Schema: {schema}
|
2447
2532
|
|
@@ -2535,9 +2620,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
|
|
2535
2620
|
The most common sequence JSON data or None if all attempts failed
|
2536
2621
|
"""
|
2537
2622
|
responses = []
|
2538
|
-
max_attempts =
|
2623
|
+
max_attempts = 5 # 5 attempts for better consensus
|
2539
2624
|
|
2540
|
-
# Try
|
2625
|
+
# Try 5 times with early match detection
|
2541
2626
|
for attempt in range(max_attempts):
|
2542
2627
|
try:
|
2543
2628
|
log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
|
@@ -2652,28 +2737,39 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
|
|
2652
2737
|
if isinstance(resp, list):
|
2653
2738
|
for seq in resp:
|
2654
2739
|
if isinstance(seq, dict) and "variant_id" in seq:
|
2655
|
-
# Create a key for this sequence (variant_id + cleaned
|
2740
|
+
# Create a key for this sequence (variant_id + cleaned sequence)
|
2656
2741
|
variant_id = seq.get("variant_id", "")
|
2657
2742
|
aa_seq = seq.get("aa_seq", "")
|
2743
|
+
dna_seq = seq.get("dna_seq", "")
|
2744
|
+
|
2745
|
+
# Clean sequences for comparison
|
2658
2746
|
if aa_seq:
|
2659
2747
|
aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
|
2660
|
-
|
2748
|
+
if dna_seq:
|
2749
|
+
dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
|
2750
|
+
|
2751
|
+
# Use whichever sequence is present for the key
|
2752
|
+
seq_for_key = aa_seq if aa_seq else (dna_seq if dna_seq else "")
|
2753
|
+
key = f"{variant_id}|{seq_for_key}"
|
2661
2754
|
|
2662
2755
|
if key not in sequence_counts:
|
2663
2756
|
sequence_counts[key] = {"count": 0, "data": seq}
|
2664
2757
|
sequence_counts[key]["count"] += 1
|
2665
2758
|
|
2666
|
-
# Build result with sequences that appear in at least
|
2759
|
+
# Build result with sequences that appear in at least 2 attempts
|
2760
|
+
# Sort by count (descending) to prioritize sequences with higher consensus
|
2667
2761
|
result = []
|
2668
|
-
|
2669
|
-
|
2762
|
+
sorted_sequences = sorted(sequence_counts.items(), key=lambda x: x[1]["count"], reverse=True)
|
2763
|
+
|
2764
|
+
for key, info in sorted_sequences:
|
2765
|
+
if info["count"] >= 2: # Appears in at least 2/5 attempts
|
2670
2766
|
seq_data = info["data"].copy()
|
2671
2767
|
seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
|
2672
2768
|
result.append(seq_data)
|
2673
2769
|
log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")
|
2674
2770
|
|
2675
2771
|
if result:
|
2676
|
-
log.info(f"Extracted {len(result)} sequences with at least
|
2772
|
+
log.info(f"Extracted {len(result)} sequences with at least 2/{max_attempts} consensus")
|
2677
2773
|
return result
|
2678
2774
|
|
2679
2775
|
# If no sequences appear twice, return the most complete attempt
|
@@ -2769,11 +2865,30 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
|
|
2769
2865
|
if aa and len(aa) <= 50:
|
2770
2866
|
log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
|
2771
2867
|
aa = None
|
2772
|
-
|
2773
|
-
|
2774
|
-
|
2868
|
+
|
2869
|
+
# Validate DNA sequences
|
2870
|
+
if dna:
|
2871
|
+
if len(dna) <= 150:
|
2872
|
+
log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
|
2873
|
+
dna = None
|
2874
|
+
# Check if DNA sequence length is divisible by 3
|
2875
|
+
elif len(dna) % 3 != 0:
|
2876
|
+
log.warning(f"Skipping DNA sequence for {vid}: length {len(dna)} not divisible by 3")
|
2877
|
+
dna = None
|
2878
|
+
else:
|
2879
|
+
# Check for stop codons in the middle of the sequence
|
2880
|
+
stop_codons = {'TAA', 'TAG', 'TGA'}
|
2881
|
+
has_internal_stop = False
|
2882
|
+
for i in range(0, len(dna) - 3, 3):
|
2883
|
+
codon = dna[i:i+3]
|
2884
|
+
if codon in stop_codons:
|
2885
|
+
log.warning(f"Skipping DNA sequence for {vid}: internal stop codon {codon} at position {i}")
|
2886
|
+
has_internal_stop = True
|
2887
|
+
break
|
2888
|
+
if has_internal_stop:
|
2889
|
+
dna = None
|
2775
2890
|
|
2776
|
-
# Skip if both sequences are
|
2891
|
+
# Skip if both sequences are invalid or missing
|
2777
2892
|
if not aa and not dna:
|
2778
2893
|
continue
|
2779
2894
|
|
@@ -3015,7 +3130,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
|
|
3015
3130
|
log.info("Page extraction did not return text, falling back to text search")
|
3016
3131
|
focused_text = _extract_text_at_locations(
|
3017
3132
|
text, [best_location],
|
3018
|
-
context_chars=max(min_length,
|
3133
|
+
context_chars=max(min_length, 50000),
|
3019
3134
|
validate_sequences=True
|
3020
3135
|
)
|
3021
3136
|
|