debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/caption_pattern.py +7 -2
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +673 -221
- debase/lineage_format.py +55 -6
- debase/reaction_info_extractor.py +282 -97
- debase/substrate_scope_extractor.py +218 -65
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
- debase-0.7.0.dist-info/RECORD +18 -0
- debase-0.6.1.dist-info/RECORD +0 -18
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0
debase/enzyme_lineage_extractor.py
@@ -336,7 +336,7 @@ def limited_caption_concat(*pdf_paths: str | Path, max_chars: int = MAX_CHARS) -
     return "\n".join(chunks)


-def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None) -> Optional[bytes]:
+def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Optional[Union[str, Path]] = None, caption_text: str = "") -> Optional[bytes]:
     """Extract a specific figure from a PDF by finding its caption.

     Returns the figure as PNG bytes if found, None otherwise.
@@ -345,64 +345,49 @@ def extract_figure(pdf_path: Union[str, Path], figure_id: str, debug_dir: Option
     figure_bytes = None

     try:
-        #
-
+        # Use caption text if provided, otherwise use figure_id
+        if caption_text:
+            # Use first 50 chars of caption for searching (enough to be unique)
+            search_text = caption_text[:50].strip()
+            log.info(f"Searching for figure using caption: '{search_text}...'")
+        else:
+            search_text = figure_id.strip()
+            log.info(f"Searching for figure using ID: '{search_text}'")

         for page_num, page in enumerate(doc):
-
-            text_instances = page.search_for(search_text)
+            page_text = page.get_text()

-            if
-
+            # Check if caption text appears on this page
+            if search_text in page_text:
+                log.info(f"Found caption on page {page_num + 1}")

-            #
-
+                # Search for the exact text position
+                text_instances = page.search_for(search_text)

-
-
+                if text_instances:
+                    # Get the position of the caption
+                    caption_rect = text_instances[0]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                if best_img is not None:
-                    # Extract the identified image
-                    pix = fitz.Pixmap(doc, best_img)
-
-                    if pix.n - pix.alpha < 4:  # GRAY or RGB
-                        figure_bytes = pix.tobytes("png")
-                    else:  # Convert CMYK to RGB
-                        pix2 = fitz.Pixmap(fitz.csRGB, pix)
-                        figure_bytes = pix2.tobytes("png")
-                        pix2 = None
-                    pix = None
-
-                    # Save to debug directory if provided
-                    if debug_dir and figure_bytes:
-                        debug_path = Path(debug_dir)
-                        debug_path.mkdir(parents=True, exist_ok=True)
-                        fig_file = debug_path / f"figure_{figure_id.replace(' ', '_').replace('.', '')}_{int(time.time())}.png"
-                        with open(fig_file, 'wb') as f:
-                            f.write(figure_bytes)
-                        log.info(f"Saved figure to: {fig_file}")
-
-                    break
+                    # Instead of trying to extract individual images,
+                    # extract the ENTIRE PAGE as an image
+                    # This ensures we get the complete figure with all panels
+                    log.info(f"Extracting entire page {page_num + 1} containing figure {figure_id}")
+
+                    # Use high resolution for clarity
+                    mat = fitz.Matrix(3.0, 3.0)  # 3x zoom
+                    pix = page.get_pixmap(matrix=mat)
+                    figure_bytes = pix.tobytes("png")
+
+                    # Save the extracted figure if debug is enabled
+                    if debug_dir and figure_bytes:
+                        debug_path = Path(debug_dir)
+                        debug_path.mkdir(parents=True, exist_ok=True)
+                        figure_file = debug_path / f"figure_{figure_id.replace(' ', '_')}_{int(time.time())}.png"
+                        with open(figure_file, 'wb') as f:
+                            f.write(figure_bytes)
+                        log.info(f"Saved figure to: {figure_file}")
+
+                    break  # Found the figure, no need to continue

     finally:
         doc.close()
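
The net effect of this hunk: instead of hunting for individual embedded images (the removed `best_img`/`Pixmap` path), the function now locates the caption text and rasterizes the whole page, so multi-panel figures stay intact. A minimal sketch of that strategy, assuming PyMuPDF (`pip install pymupdf`); the path and caption string are placeholders, not values from the package:

```python
# Sketch of the new whole-page strategy, assuming PyMuPDF is installed.
import fitz  # PyMuPDF

def render_page_with_caption(pdf_path: str, caption_snippet: str) -> bytes | None:
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            if caption_snippet in page.get_text():
                # 3x zoom matrix mirrors the diff's high-resolution rendering
                pix = page.get_pixmap(matrix=fitz.Matrix(3.0, 3.0))
                return pix.tobytes("png")
    finally:
        doc.close()
    return None

# png = render_page_with_caption("paper.pdf", "Figure 2. Directed evolution of")
```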
@@ -465,7 +450,7 @@ def get_model():
         "temperature": 0.0,  # Deterministic: always pick the most likely token
         "top_p": 1.0,  # Consider all tokens (but temperature=0 will pick the best)
         "top_k": 1,  # Only consider the single most likely token
-        "max_output_tokens":
+        "max_output_tokens": 65536,  # Increased to 2x for handling larger lineage tables and sequences
     }

     # For Gemini 2.5 Flash, disable thinking tokens to save costs
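
For readers unfamiliar with the dict being edited here: it is the generation-config shape the google-generativeai SDK accepts. A sketch of how it plugs in; the API key is a placeholder, the model name is assumed from the "Gemini 2.5 Flash" comment above, and whether a 65536-token output cap is accepted depends on the model:

```python
# Sketch only: placeholder key/model; the config dict mirrors the diff above.
import google.generativeai as genai

genai.configure(api_key="YOUR_API_KEY")  # placeholder
model = genai.GenerativeModel(
    "gemini-2.5-flash",  # assumed from the surrounding comment
    generation_config={
        "temperature": 0.0,  # deterministic decoding
        "top_p": 1.0,
        "top_k": 1,
        "max_output_tokens": 65536,
    },
)
```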
@@ -685,39 +670,39 @@ from typing import List, Dict, Any
 # ---- 6.0 Campaign identification prompts -----------------------------------

 _CAMPAIGN_IDENTIFICATION_PROMPT = """
-
-Analyze the following manuscript text to identify ALL distinct directed evolution campaigns.
-
-Each campaign represents a separate evolutionary lineage targeting different:
-- Model reactions (e.g., different chemical transformations)
-- Substrate scopes
-- Activities (e.g., different enzymatic reactions)
+Identify directed evolution LINEAGE campaigns in this manuscript.

+A campaign is a multi-round directed evolution effort that creates a FAMILY of variants through iterative cycles.
 Look for:
-
-
-
-
+- Multiple rounds/generations of evolution (e.g., "8 rounds of evolution", "5 generations")
+- Lineage trees or variant families (e.g., "L1→L2→L3→L4", "WT→M1→M2→M3")
+- Progressive improvement through iterations
+- Parent-child relationships across multiple variants
+
+Do NOT include:
+- Single-point mutation studies or individual variant characterization
+- Simple site-saturation mutagenesis at one position
+
+IMPORTANT: Include previously evolved lineages IF they are the main focus of THIS paper (e.g., characterizing a previously evolved enzyme lineage with new substrates/conditions)
+
+Key phrases: "rounds of directed evolution", "iterative evolution", "evolutionary lineage", "variant lineage", "generations of evolution"

 Return a JSON array of campaigns:
 [
   {{
     "campaign_id": "descriptive_unique_id_that_will_be_used_as_context",
     "campaign_name": "descriptive name",
-    "description": "what
+    "description": "what THIS STUDY evolved for",
     "model_substrate": "substrate name/id",
     "model_product": "product name/id",
     "substrate_id": "id from paper (e.g., 1a)",
     "product_id": "id from paper (e.g., 2a)",
     "data_locations": ["Table S1", "Figure 1"],
     "lineage_hint": "enzyme name pattern",
-    "notes": "
+    "notes": "evidence this was evolved in THIS study"
   }}
 ]

-IMPORTANT: The campaign_id should be descriptive and meaningful as it will be used later as contextual information.
-Use descriptive IDs like "lactamase_beta_hydrolysis_campaign" or "esterase_substrate_scope_optimization" rather than generic IDs like "campaign1" or "evolution1".
-
 TEXT:
 {text}
 """.strip()
@@ -757,26 +742,46 @@ lineage of enzyme variants (i.e. which variant came from which parent and what
 mutations were introduced){campaign_specific}. Pay attention to the provided context after the caption
 ensure the location you return are actually lineage location with variants and mutations.

+IMPORTANT SCORING CRITERIA:
+- Locations that explicitly mention "lineage" should be scored MUCH HIGHER (90-100)
+- Locations mentioning "evolutionary tree", "phylogenetic", "genealogy", or "ancestry" should also score high (85-95)
+- Locations that only mention "variants" without lineage context should score lower (60-80)
+- Generic tables of variants without parent-child relationships should score lowest (40-60)
+
 Respond with a JSON array of objects, each containing:
-- "location": the figure/table identifier (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
+- "location": the figure/table identifier EXACTLY as it appears in the caption (e.g. "Table S1", "Figure 2B", "Table 1", "Figure 3")
 - "type": one of "table", "figure", "section"
-- "confidence": your confidence score (0-100) that this location contains lineage data
+- "confidence": your confidence score (0-100) that this location contains lineage data (PRIORITIZE "lineage" mentions!)
 - "reason": brief explanation of why this location likely contains lineage
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL caption text (include at least the first 200-300 characters of the caption to enable fuzzy matching)
 {campaign_field}
-
-
-
-
-
-
+CRITICAL INSTRUCTIONS:
+1. Return "location" EXACTLY as the first reference identifier appears in the actual caption text
+   - Copy the exact characters including all punctuation (periods, colons, pipes, etc.) up to the first space after the identifier
+   - Do NOT modify, standardize, or interpret the location - return it verbatim from the document
+2. Include the FULL caption text in the "caption" field to enable fuzzy matching when extracting
+   - This should be the complete caption as it appears in the document
+   - Include at least 200-300 characters to ensure unique matching
+3. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Items like "Table S1", "Figure S2", etc. are typically in the SI
+   - Items like "Table 1", "Figure 2", etc. are typically in the main manuscript
+   - If uncertain, use context clues from the text
+
+Order by confidence score (highest first), with special priority for:
+1. Tables/figures explicitly mentioning "lineage" or "evolutionary tree" (score 90-100)
+2. Tables showing complete parent-child relationships with mutations (score 80-95)
+3. Figures showing evolutionary/phylogenetic trees (score 75-90)
+4. Tables listing variants with parent information (score 70-85)
+5. Generic variant tables without clear lineage information (score 40-70)

 Don't include oligonucleotide results or result from only one round.

 Example output:
 [
-  {{"location": "Table S1", "type": "table", "confidence":
-  {{"location": "Figure 2B", "type": "figure", "confidence":
-  {{"location": "
+  {{"location": "Table S1.", "type": "table", "confidence": 98, "reason": "Complete enzyme lineage table with parent-child relationships", "source": "si", "caption": "Table S1. Complete lineage of enzyme variants showing the evolutionary progression from wild-type through eight rounds of directed evolution. Each variant is listed with its parent and accumulated mutations..."{campaign_example}}},
+  {{"location": "Figure 2B", "type": "figure", "confidence": 92, "reason": "Evolutionary tree explicitly showing lineage", "source": "manuscript", "caption": "Figure 2B Evolutionary lineage tree depicting the complete genealogy of engineered variants. Branches show parent-child relationships with mutations annotated..."{campaign_example}}},
+  {{"location": "Table 2", "type": "table", "confidence": 75, "reason": "Variant table with parent information", "source": "manuscript", "caption": "Table 2. Summary of enzyme variants generated in this study. Parent templates and mutations are indicated for each variant..."{campaign_example}}}
 ]
 """.strip()

@@ -908,6 +913,9 @@ def identify_evolution_locations(
     pdf_paths: Optional[List[Path]] = None,
 ) -> List[dict]:
     """Ask Gemini where in the paper the lineage is probably described."""
+    # Extract manuscript pages as images (in addition to text)
+    manuscript_images = []
+
     # Extract table of contents from PDFs if available
     toc_text = ""
     if pdf_paths:
@@ -938,6 +946,27 @@ def identify_evolution_locations(

         if toc_sections:
             toc_text = "\n\nTABLE OF CONTENTS SECTIONS:" + ''.join(toc_sections) + "\n\n"
+
+        # Extract manuscript pages as images
+        if len(pdf_paths) >= 1:
+            manuscript_pdf = pdf_paths[0]
+            log.info(f"Extracting manuscript pages as images from: {manuscript_pdf.name}")
+
+            doc = _open_doc(manuscript_pdf)
+            try:
+                # Extract up to 10 pages as images
+                for page_num in range(min(10, len(doc))):
+                    page = doc[page_num]
+                    # Render page as image
+                    mat = fitz.Matrix(2, 2)  # 2x zoom for better quality
+                    pix = page.get_pixmap(matrix=mat)
+                    img_bytes = pix.tobytes("png")
+                    manuscript_images.append(img_bytes)
+                    log.debug(f"Extracted page {page_num + 1} as image ({len(img_bytes)} bytes)")
+            finally:
+                doc.close()
+
+            log.info(f"Extracted {len(manuscript_images)} manuscript pages as images")

     # Include TOC before the main text
     combined_text = toc_text + text if toc_text else text
@@ -979,15 +1008,80 @@ def identify_evolution_locations(
         campaign_specific=campaign_specific,
         campaign_field=campaign_field,
         campaign_example=campaign_example
-    )
+    )
+
     locs: List[dict] = []
     try:
-
-
-
-
-
-
+        if manuscript_images:
+            # Use vision API with manuscript images and SI text
+            log.info("Using vision API with %d manuscript page images and SI text", len(manuscript_images))
+
+            # Convert images to PIL format for Gemini
+            import PIL.Image
+            import io
+
+            pil_images = []
+            for img_bytes in manuscript_images:
+                image = PIL.Image.open(io.BytesIO(img_bytes))
+                pil_images.append(image)
+
+            # Build multimodal prompt with caption text AND manuscript images
+            multimodal_prompt = [prompt + "\n\nTEXT (Captions and sections):\n" + combined_text]
+
+            # Add manuscript page images
+            multimodal_prompt.append("\n\n=== MANUSCRIPT PAGES (as images for additional context) ===\n")
+            multimodal_prompt.extend(pil_images)
+
+            # Save debug info if requested
+            if debug_dir:
+                debug_path = Path(debug_dir)
+                debug_path.mkdir(parents=True, exist_ok=True)
+
+                # Save prompt
+                prompt_file = debug_path / f"locate_vision_prompt_{int(time.time())}.txt"
+                _dump(f"=== VISION PROMPT FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nManuscript pages: {len(manuscript_images)}\nText length: {len(combined_text)} chars\n{'='*80}\n\n{prompt}\n\nTEXT (Captions and sections):\n{combined_text[:2000]}...(truncated)\n\n[{len(manuscript_images)} manuscript page images]",
+                      prompt_file)
+
+                # Save manuscript page samples
+                for i, img_bytes in enumerate(manuscript_images[:3]):  # Save first 3 pages
+                    img_file = debug_path / f"locate_manuscript_page_{i+1}_{int(time.time())}.png"
+                    _dump(img_bytes, img_file)
+
+            # Generate content with vision
+            response = model.generate_content(multimodal_prompt)
+            raw = response.text
+
+            # Parse JSON from response
+            try:
+                # Save raw response if debug enabled
+                if debug_dir:
+                    response_file = Path(debug_dir) / f"locate_vision_response_{int(time.time())}.txt"
+                    _dump(f"=== VISION RESPONSE FOR LOCATE ===\nTimestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\nLength: {len(raw)} characters\n{'='*80}\n\n{raw}", response_file)
+
+                # Try to parse JSON
+                try:
+                    locs = json.loads(raw)
+                except json.JSONDecodeError:
+                    # Try to extract JSON from response
+                    json_match = re.search(r'\[.*\]', raw, re.DOTALL)
+                    if json_match:
+                        locs = json.loads(json_match.group(0))
+                    else:
+                        log.warning("Could not parse JSON from vision response")
+                        locs = []
+            except Exception as e:
+                log.warning(f"Error parsing vision response: {e}")
+                locs = []
+
+        else:
+            # Fall back to text-only mode
+            prompt += "\n\nTEXT:\n" + combined_text
+            locs = generate_json_with_retry(
+                model,
+                prompt,
+                debug_dir=debug_dir,
+                tag="locate",
+            )
     except Exception as exc:  # pragma: no cover
         log.warning("identify_evolution_locations(): %s", exc)

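
The vision branch above leans on the SDK convention that `generate_content` accepts a heterogeneous list of strings and PIL images. A reduced sketch of that pattern, assuming an already-configured `GenerativeModel` and Pillow; names here are illustrative, not the package's API:

```python
# Sketch of the mixed text + image prompt pattern, assuming google-generativeai
# and Pillow; 'model' stands in for an already-configured GenerativeModel.
import io
import PIL.Image

def locate_with_vision(model, prompt: str, page_pngs: list[bytes]) -> str:
    parts = [prompt]
    for png in page_pngs:
        parts.append(PIL.Image.open(io.BytesIO(png)))  # SDK accepts PIL images directly
    response = model.generate_content(parts)
    return response.text
```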
@@ -1288,7 +1382,7 @@ def _is_toc_entry(text: str, position: int, pattern: str) -> bool:

     return False

-def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int =
+def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], context_chars: int = 50000, validate_sequences: bool = False) -> str:
     """Extract text around identified locations."""
     if not locations:
         return text
@@ -1461,10 +1555,114 @@ def _extract_text_at_locations(text: str, locations: List[Union[str, dict]], con

 # ---- 6.4 Public API -------------------------------------------------------

-def _extract_location_text(full_text: str, location: str, location_type: str) -> Optional[str]:
-    """Extract text from a specific location (table, section, etc.) in the full text.
+def _extract_location_text(full_text: str, location: str, location_type: str, caption_hint: Optional[str] = None) -> Optional[str]:
+    """Extract text from a specific location (table, section, etc.) in the full text.
+
+    Args:
+        full_text: The full text to search in
+        location: The location identifier (e.g., "Table S1")
+        location_type: Type of location ("table", "figure", "section")
+        caption_hint: Optional full caption text for fuzzy matching
+    """
     import re

+    # If caption hint is provided, try fuzzy matching first
+    if caption_hint and len(caption_hint) > 20:
+        log.info(f"Using caption hint for fuzzy matching: {caption_hint[:100]}...")
+
+        # Normalize texts for better matching (similar to reaction_info_extractor)
+        def normalize_for_matching(text):
+            # Remove extra whitespace, normalize spaces around punctuation
+            text = ' '.join(text.split())
+            # Normalize different dash types
+            text = text.replace('–', '-').replace('—', '-')
+            return text
+
+        normalized_hint = normalize_for_matching(caption_hint[:150])  # Use first 150 chars
+        normalized_text = normalize_for_matching(full_text)
+
+        # Try to find ALL caption matches using character-based fuzzy matching
+        all_matches = []
+
+        # Slide through the text looking for all matches above threshold
+        hint_len = len(normalized_hint)
+        for i in range(len(normalized_text) - hint_len + 1):
+            snippet = normalized_text[i:i + hint_len]
+            # Simple character-based similarity
+            matches = sum(1 for a, b in zip(normalized_hint, snippet) if a == b)
+            score = matches / hint_len
+
+            if score > 0.7:  # 70% similarity threshold
+                all_matches.append({
+                    'norm_pos': i,
+                    'score': score
+                })
+
+        # If we found matches, extract from all of them
+        if all_matches:
+            log.info(f"Found {len(all_matches)} caption matches with fuzzy matching")
+
+            # Collect all occurrences from fuzzy matches
+            all_occurrences = []
+            seen_positions = set()
+
+            for match_info in all_matches:
+                # Get the matched text from normalized version
+                matched_normalized = normalized_text[match_info['norm_pos']:match_info['norm_pos'] + hint_len]
+
+                # Find where this appears in the original text
+                best_original_pos = -1
+
+                # Search in the original text for this specific match
+                for i in range(len(full_text) - len(caption_hint) + 1):
+                    if i in seen_positions:
+                        continue
+
+                    original_snippet = full_text[i:i + len(caption_hint)]
+                    # Normalize and compare
+                    normalized_snippet = normalize_for_matching(original_snippet)
+                    if normalized_snippet[:hint_len] == matched_normalized:
+                        # Found exact match after normalization
+                        best_original_pos = i
+                        seen_positions.add(i)
+                        break
+
+                if best_original_pos >= 0:
+                    # Extract generous context from this match position
+                    start = max(0, best_original_pos - 1000)
+                    end = min(len(full_text), best_original_pos + 10000)
+                    context = full_text[start:end]
+
+                    all_occurrences.append({
+                        'position': best_original_pos,
+                        'context': context,
+                        'score': match_info['score']
+                    })
+                    log.info(f"Fuzzy match at position {best_original_pos} with {match_info['score']*100:.1f}% similarity")
+
+            if all_occurrences:
+                # Sort by position to maintain document order
+                all_occurrences.sort(key=lambda x: x['position'])
+
+                # Combine all occurrences
+                combined_text = f"=== All occurrences of {location} (fuzzy matched) ===\n\n"
+
+                for i, occurrence in enumerate(all_occurrences, 1):
+                    combined_text += f"--- Occurrence {i} at position {occurrence['position']} (similarity: {occurrence['score']*100:.1f}%) ---\n"
+                    combined_text += occurrence['context']
+                    combined_text += "\n\n"
+
+                # Apply same limit as table extraction
+                if len(combined_text) > 150000:
+                    combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"
+
+                log.info(f"Extracted {len(combined_text)} chars using fuzzy caption matching from {len(all_occurrences)} locations")
+                return combined_text
+            else:
+                log.warning(f"Could not map any fuzzy matches back to original text")
+        else:
+            log.warning(f"No fuzzy matches found for caption above 70% threshold")
+
     if location_type == 'table':
         # Find ALL mentions of this table and combine them
         location_clean = location.strip()
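
The fuzzy matcher added above slides a fixed-width window over the normalized text and scores positional character overlap, which is O(len(text) × len(hint)). A compact restatement of the scoring core, with a hedged note that the standard library's `difflib.SequenceMatcher` is an alternative when insertions and deletions also need to be tolerated:

```python
# Compact form of the sliding character-overlap score used in the diff.
def fuzzy_positions(hint: str, text: str, threshold: float = 0.7) -> list[tuple[int, float]]:
    hits = []
    n = len(hint)
    for i in range(len(text) - n + 1):
        window = text[i:i + n]
        # Positional overlap: fraction of characters that match exactly in place
        score = sum(a == b for a, b in zip(hint, window)) / n
        if score > threshold:
            hits.append((i, score))
    return hits

# from difflib import SequenceMatcher
# SequenceMatcher(None, hint, window).ratio() would tolerate shifts/insertions,
# at higher per-comparison cost.
```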
@@ -1506,6 +1704,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->

     log.info(f"Found {len(all_occurrences)} occurrences of table '{location_clean}'")

+    # Sort occurrences by position to maintain document order
+    all_occurrences.sort(key=lambda x: x['position'])
+
     # Combine all occurrences into one text for Gemini to analyze
     combined_text = f"=== All occurrences of {location_clean} ===\n\n"

@@ -1515,8 +1716,9 @@ def _extract_location_text(full_text: str, location: str, location_type: str) ->
         combined_text += "\n\n"

     # Limit total length to avoid overwhelming the model
-
-
+    # Increased limit to ensure actual table content is included
+    if len(combined_text) > 150000:
+        combined_text = combined_text[:150000] + "\n\n[Truncated due to length...]"

     return combined_text

@@ -1600,6 +1802,8 @@ def get_lineage(
     *,
     pdf_paths: Optional[List[Path]] = None,
     debug_dir: str | Path | None = None,
+    manuscript_text: Optional[str] = None,
+    si_text: Optional[str] = None,
 ) -> Tuple[List[Variant], List[Campaign]]:
     """
     High-level wrapper used by the pipeline.
@@ -1667,54 +1871,72 @@ def get_lineage(
         for loc in locations:
             log.info(f"  - {loc['location']} ({loc['type']}, confidence: {loc['confidence']})")

-        #
+        # Sort locations by confidence and use the highest confidence one
+        locations_sorted = sorted(locations, key=lambda x: x.get('confidence', 0), reverse=True)
+        log.info(f"Using highest confidence location: {locations_sorted[0]['location']} (confidence: {locations_sorted[0]['confidence']})")
+
+        # Use the highest confidence location as primary location
+        primary_location = locations_sorted[0]
+
+        # Extract location details
+        location_str = primary_location.get('location', '')
+        location_type = primary_location.get('type', '')
+        confidence = primary_location.get('confidence', 0)
+        caption_text = primary_location.get('caption', '')
+
+        # Initialize extracted variants list
         extracted_variants = []
-
-
-
-
-            location_str = location.get('location', '')
-            location_type = location.get('type', '')
-            confidence = location.get('confidence', 0)
+
+        # Try figure extraction for high-confidence figures
+        if location_type == 'figure' and confidence >= 70 and pdf_paths:
+            log.info(f"Attempting to extract figure: {location_str}")

-
-
-
-
-            figure_bytes = None
-            for pdf_path in pdf_paths:
-                figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
-                if figure_bytes:
-                    log.info(f"Successfully extracted figure from {pdf_path.name}")
-                    break
-
+            figure_bytes = None
+            for pdf_path in pdf_paths:
+                figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text=caption_text)
                 if figure_bytes:
-
-
-                    debug_path = Path(debug_dir)
-                    debug_path.mkdir(parents=True, exist_ok=True)
-                    figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
-                    _dump(figure_bytes, figure_file)
-                    log.info(f"Saved figure to: {figure_file}")
-
-                    # Extract lineage from figure
-                    variants = extract_lineage_from_figure(
-                        figure_bytes, model,
-                        debug_dir=debug_dir,
-                        campaign_id=campaign.campaign_id,
-                        campaign_info=campaign
-                    )
-                    if variants:
-                        log.info(f"Extracted {len(variants)} variants from figure")
-                        extracted_variants = variants
-                    continue
+                    log.info(f"Successfully extracted figure from {pdf_path.name}")
+                    break

-
-
+            if figure_bytes:
+                # Save figure if debug enabled
+                if debug_dir:
+                    debug_path = Path(debug_dir)
+                    debug_path.mkdir(parents=True, exist_ok=True)
+                    figure_file = debug_path / f"lineage_figure_{campaign.campaign_id}_{location_str.replace(' ', '_')}_{int(time.time())}.png"
+                    _dump(figure_bytes, figure_file)
+                    log.info(f"Saved figure to: {figure_file}")
+
+                # Extract lineage from figure
+                variants = extract_lineage_from_figure(
+                    figure_bytes, model,
+                    debug_dir=debug_dir,
+                    campaign_id=campaign.campaign_id,
+                    campaign_info=campaign
+                )
+                if variants:
+                    log.info(f"Extracted {len(variants)} variants from figure")
+                    extracted_variants = variants
+
+        # Try table/text extraction if no figure extraction or if not a figure
+        if not extracted_variants and location_type in ['table', 'text', 'section']:
             log.info(f"Attempting text extraction for {location_type}: {location_str}")

-            #
-
+            # Determine which text to use based on source
+            location_source = location.get('source', 'manuscript')
+            if location_source == 'si' and si_text:
+                text_to_search = si_text
+                log.info(f"Using SI text for location {location_str}")
+            elif location_source == 'manuscript' and manuscript_text:
+                text_to_search = manuscript_text
+                log.info(f"Using manuscript text for location {location_str}")
+            else:
+                text_to_search = full_text
+                log.info(f"Using combined text for location {location_str} (fallback)")
+
+            # Extract the specific section/table from appropriate text
+            caption_hint = location.get('caption', '')
+            section_text = _extract_location_text(text_to_search, location_str, location_type, caption_hint)
             if section_text:
                 log.info(f"Extracted {len(section_text)} chars from {location_type}: {location_str}")
                 # Save extracted section if debug enabled
@@ -1940,8 +2162,9 @@ def get_lineage(

         # Try to extract the figure from available PDFs
         figure_bytes = None
+        # Note: This fallback path doesn't have the caption text
         for pdf_path in pdf_paths:
-            figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir)
+            figure_bytes = extract_figure(pdf_path, location_str, debug_dir=debug_dir, caption_text="")
             if figure_bytes:
                 log.info("Successfully extracted figure from %s", pdf_path.name)
                 break
@@ -1980,7 +2203,7 @@ def get_lineage(
         # Use text-based extraction (works for tables and text sections)
         # Extract from full text, not caption text - use only primary location
         # Use more context for tables since they often span multiple pages
-        context_size =
+        context_size = 75000 if location_type == 'table' else 50000
         focused_text = _extract_text_at_locations(full_text, [primary_location], context_chars=context_size)
         log.info("Reduced text from %d to %d chars using primary location %s for campaign %s",
                  len(full_text), len(focused_text),
@@ -2028,17 +2251,24 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.

 Look for table of contents entries or section listings that mention sequences.
 Return a JSON array where each element has:
-- "section": the section heading or description
+- "section": the section heading or description EXACTLY as it appears
 - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
+- "source": one of "manuscript" or "si" - indicate whether this is in the main manuscript or supplementary information
+- "caption": the FULL section heading or table of contents entry (at least 100-200 characters for fuzzy matching)

 Focus on:
 - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
 - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
 - Prioritize sections that mention "protein" or "amino acid" sequences

-CRITICAL:
-
--
+CRITICAL:
+1. Page numbers must be returned as plain numbers or S-prefixed numbers only:
+   - Correct: "53", "S12", "147"
+   - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
+2. For each location, specify whether it's in the main manuscript or supplementary information (SI):
+   - Pages with "S" prefix (e.g., "S53") are typically in the SI
+   - Regular page numbers (e.g., "53") are typically in the main manuscript
+   - Use context clues from the document structure

 Return [] if no sequence sections are found.
 Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
@@ -2236,7 +2466,7 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:
     # Fallback to text search if page extraction didn't work
     if not sample_text:
         sample_text = _extract_text_at_locations(
-            text, [location], context_chars=
+            text, [location], context_chars=20000, validate_sequences=False
         )

     samples.append({
@@ -2278,44 +2508,30 @@ def validate_sequence_locations(text: str, locations: list, model, *, pdf_paths:

 # --- 7.3 Main extraction prompt ---------------------------------------------
 _SEQ_EXTRACTION_PROMPT = """
-Extract
-
-IMPORTANT: Prioritize amino acid (protein) sequences over DNA sequences:
-- If an amino acid sequence exists for a variant, extract ONLY the aa_seq (set dna_seq to null)
-- Only extract dna_seq if NO amino acid sequence is available for that variant
-- This reduces redundancy since protein sequences are usually more relevant
-
-CRITICAL: Use the EXACT variant identifier as it appears with each sequence:
-- Papers often use different naming conventions in different sections
-- DO NOT normalize or simplify variant IDs
-- Extract the variant_id exactly as written where the sequence appears
-- Common patterns include numeric IDs, generation labels, full descriptive names, or combinations
-
-SEQUENCE EXTRACTION RULES:
-- Copy sequences EXACTLY as they appear in the text
-- Pay careful attention to repeated amino acids, and nucleotides (e.g., "AAA" should remain "AAA", not become "A")
-- Do NOT add, remove, or modify any amino acids, or nucleotides
-- Preserve the exact length and character sequence
-- If a sequence has line breaks or spacing, remove only formatting (spaces, newlines) but keep all amino acids
-- Double-check that consecutive identical amino acids or nucleotides are copied correctly
-
-For each variant return:
-  * variant_id - the EXACT label as it appears with the sequence (preserve all formatting)
-  * aa_seq - amino-acid sequence (uppercase), or null - COPY EXACTLY FROM TEXT
-  * dna_seq - DNA sequence (A/C/G/T), or null (ONLY if no aa_seq exists) - COPY EXACTLY FROM TEXT
-
-Respond ONLY with **minified JSON** that matches the schema below.
-NO markdown, no code fences, no commentary.
+Extract ALL enzyme variant sequences from the text. Copy sequences EXACTLY as they appear - character by character.

-
-
-
-
+KEY RULES:
+1. EXHAUSTIVE SEARCH: If a variant appears multiple times, check ALL occurrences and extract the LONGEST sequence
+2. MULTI-PAGE: Sequences span pages. Skip page numbers (66, 67, etc.) that interrupt sequences
+3. MERGE IF NEEDED: If sequence continues after page break, combine the parts
+4. NO MODIFICATIONS: Copy exactly - no edits or improvements

-
-
+IMPORTANT: The same variant may appear multiple times with different sequence lengths. Always use the longest one.
+
+SEQUENCE PRIORITY:
+- If BOTH amino acid AND DNA exist → use amino acid ONLY
+- For DNA: If mixed case, extract UPPERCASE only (lowercase=backbone)
+- Return minified JSON only
+
+ACCURACY:
+- Extract ONLY what's written
+- Never hallucinate
+- Check entire document - complete sequences often appear later
+
+Schema: {schema}
+
+TEXT:
 {text}
-```
 """.strip()

 def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list, dict]) -> bool:
@@ -2390,7 +2606,7 @@ def _check_sequence_responses_match(resp1: Union[list, dict], resp2: Union[list,


 def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: str, *, debug_dir: str | Path | None = None) -> Optional[Any]:
-    """Extract sequence JSON using Gemini with up to
+    """Extract sequence JSON using Gemini with up to 3 attempts, returning most common result.

     Can exit early after 2 attempts if the responses match exactly.

@@ -2404,9 +2620,9 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         The most common sequence JSON data or None if all attempts failed
     """
     responses = []
-    max_attempts =
+    max_attempts = 5  # 5 attempts for better consensus

-    # Try
+    # Try 5 times with early match detection
     for attempt in range(max_attempts):
         try:
             log.info(f"Sequence extraction attempt {attempt + 1}/{max_attempts}")
@@ -2432,8 +2648,13 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s

             # Try to parse as JSON
             try:
-
-
+                # First clean the response - remove any BOM or invisible characters
+                raw_clean = raw.strip()
+                if raw_clean.startswith('\ufeff'):  # Remove BOM if present
+                    raw_clean = raw_clean[1:]
+                parsed = json.loads(raw_clean)
+            except json.JSONDecodeError as e:
+                log.debug(f"Initial JSON parsing failed: {e}. Response starts with: {repr(raw[:100])}")
                 # Look for JSON array or object in the response
                 json_start = -1
                 json_end = -1
@@ -2482,17 +2703,22 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
             responses.append(parsed)
             log.info(f"Sequence extraction attempt {attempt + 1}: {len(parsed) if isinstance(parsed, list) else 'object'} sequences")

-            #
-            if
-
-            if
-
-            if
-
-
-
-
+            # If we got a good response with sequences, we can check for early termination
+            if isinstance(parsed, list) and len(parsed) > 0:
+                # Early match detection after 2 attempts
+                if attempt >= 1:  # After 2nd attempt (0-indexed)
+                    valid_responses_so_far = [r for r in responses if r is not None and isinstance(r, list) and len(r) > 0]
+                    if len(valid_responses_so_far) >= 2:
+                        # Check if the last two valid responses match
+                        if _check_sequence_responses_match(valid_responses_so_far[-2], valid_responses_so_far[-1]):
+                            log.info(f"Early match detected after {attempt + 1} attempts - sequences are consistent")
+                            # Add the matching response to fill remaining attempts
+                            for _ in range(max_attempts - attempt - 1):
+                                responses.append(valid_responses_so_far[-1])
+                            break
+                # If this is the first attempt and we got sequences, continue to validate with at least one more
+                elif attempt == 0 and len(parsed) > 5:  # Got substantial sequences on first try
+                    log.info("Got substantial sequences on first attempt, will validate with one more")

         except Exception as e:
             log.warning(f"Sequence extraction attempt {attempt + 1} failed: {e}")
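
The early-termination logic above generalizes to a simple pattern: keep sampling until two consecutive valid responses agree, then stop spending attempts. A sketch with stand-in callables (`generate` for the Gemini call, `matches` for `_check_sequence_responses_match`):

```python
# Generic shape of the early-exit sampling loop; 'generate' and 'matches'
# are placeholders for the model call and the response-comparison function.
from typing import Any, Callable, List

def sample_with_early_exit(generate: Callable[[], Any],
                           matches: Callable[[Any, Any], bool],
                           max_attempts: int = 5) -> List[Any]:
    responses: List[Any] = []
    for _ in range(max_attempts):
        resp = generate()
        if resp:
            responses.append(resp)
        if len(responses) >= 2 and matches(responses[-2], responses[-1]):
            break  # two consistent answers; no need for more attempts
    return responses
```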
@@ -2511,28 +2737,39 @@ def _extract_sequences_with_triple_validation(model, prompt: str, schema_hint: s
         if isinstance(resp, list):
             for seq in resp:
                 if isinstance(seq, dict) and "variant_id" in seq:
-                    # Create a key for this sequence (variant_id + cleaned
+                    # Create a key for this sequence (variant_id + cleaned sequence)
                     variant_id = seq.get("variant_id", "")
                     aa_seq = seq.get("aa_seq", "")
+                    dna_seq = seq.get("dna_seq", "")
+
+                    # Clean sequences for comparison
                     if aa_seq:
                         aa_seq = aa_seq.replace(" ", "").replace("\n", "").upper()
-
+                    if dna_seq:
+                        dna_seq = dna_seq.replace(" ", "").replace("\n", "").upper()
+
+                    # Use whichever sequence is present for the key
+                    seq_for_key = aa_seq if aa_seq else (dna_seq if dna_seq else "")
+                    key = f"{variant_id}|{seq_for_key}"

                     if key not in sequence_counts:
                         sequence_counts[key] = {"count": 0, "data": seq}
                     sequence_counts[key]["count"] += 1

-    # Build result with sequences that appear in at least
+    # Build result with sequences that appear in at least 2 attempts
+    # Sort by count (descending) to prioritize sequences with higher consensus
     result = []
-
-
+    sorted_sequences = sorted(sequence_counts.items(), key=lambda x: x[1]["count"], reverse=True)
+
+    for key, info in sorted_sequences:
+        if info["count"] >= 2:  # Appears in at least 2/5 attempts
             seq_data = info["data"].copy()
             seq_data["extraction_confidence"] = f"{info['count']}/{max_attempts}"
             result.append(seq_data)
             log.info(f"Sequence {seq_data.get('variant_id')} appeared in {info['count']}/{max_attempts} attempts")

     if result:
-        log.info(f"Extracted {len(result)} sequences with at least
+        log.info(f"Extracted {len(result)} sequences with at least 2/{max_attempts} consensus")
         return result

     # If no sequences appear twice, return the most complete attempt
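
The consensus rule can be summarized independently of the surrounding plumbing: key each extracted record by (variant_id, cleaned sequence) and keep keys seen in at least two attempts, highest counts first. A self-contained sketch of that tally (illustrative names, not the package's API):

```python
# Majority-vote tally over repeated extraction attempts.
from collections import Counter

def consensus_sequences(attempts: list[list[dict]], min_votes: int = 2) -> list[dict]:
    counts: Counter = Counter()
    data: dict = {}
    for attempt in attempts:
        for seq in attempt:
            # Whitespace-insensitive, case-insensitive sequence key
            cleaned = (seq.get("aa_seq") or seq.get("dna_seq") or "")
            cleaned = cleaned.replace(" ", "").replace("\n", "").upper()
            key = (seq.get("variant_id", ""), cleaned)
            counts[key] += 1
            data.setdefault(key, seq)
    # Highest-consensus entries first, dropping anything seen only once
    return [data[k] for k, c in counts.most_common() if c >= min_votes]
```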
@@ -2628,11 +2865,30 @@ def _parse_sequences(raw: list[dict]) -> list[SequenceBlock]:
         if aa and len(aa) <= 50:
             log.debug(f"Skipping short AA sequence for {vid}: {len(aa)} amino acids")
             aa = None
-
-
-
+
+        # Validate DNA sequences
+        if dna:
+            if len(dna) <= 150:
+                log.debug(f"Skipping short DNA sequence for {vid}: {len(dna)} nucleotides")
+                dna = None
+            # Check if DNA sequence length is divisible by 3
+            elif len(dna) % 3 != 0:
+                log.warning(f"Skipping DNA sequence for {vid}: length {len(dna)} not divisible by 3")
+                dna = None
+            else:
+                # Check for stop codons in the middle of the sequence
+                stop_codons = {'TAA', 'TAG', 'TGA'}
+                has_internal_stop = False
+                for i in range(0, len(dna) - 3, 3):
+                    codon = dna[i:i+3]
+                    if codon in stop_codons:
+                        log.warning(f"Skipping DNA sequence for {vid}: internal stop codon {codon} at position {i}")
+                        has_internal_stop = True
+                        break
+                if has_internal_stop:
+                    dna = None

-        # Skip if both sequences are
+        # Skip if both sequences are invalid or missing
         if not aa and not dna:
             continue

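
The new DNA checks amount to a three-part filter: minimum length, reading frame (length divisible by 3), and no internal stop codon. A standalone sketch of the same rules:

```python
# Standalone restatement of the DNA sanity checks added in this hunk.
STOP_CODONS = {"TAA", "TAG", "TGA"}

def is_plausible_cds(dna: str, min_len: int = 150) -> bool:
    dna = dna.replace(" ", "").replace("\n", "").upper()
    if len(dna) <= min_len or len(dna) % 3 != 0:
        return False
    # Scan codon by codon, leaving the final codon free to be a stop codon
    return all(dna[i:i + 3] not in STOP_CODONS for i in range(0, len(dna) - 3, 3))

# is_plausible_cds("ATG" + "GCT" * 60 + "TAA")  # -> True (terminal stop is fine)
```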
@@ -2852,9 +3108,9 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
     focused_text = ""
     if pdf_paths and isinstance(best_location, dict) and 'page' in best_location:
         page_num = best_location['page']
-        # Extract current page plus next
+        # Extract current page plus next 5 pages (6 total) to prevent hallucination
         all_pages = []
-        for i in range(
+        for i in range(6):  # Current + next 5 (6 pages total)
             if isinstance(page_num, str) and page_num.upper().startswith('S'):
                 next_page = f"S{int(page_num[1:]) + i}"
             else:
@@ -2866,7 +3122,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
                 break
         if all_pages:
             focused_text = "\n".join(all_pages)
-            log.info("Extracted %d chars from pages %s through %d more pages",
+            log.info("Extracted %d chars from pages %s through %d more pages (limited to 6 pages total)",
                      len(focused_text), page_num, len(all_pages) - 1)

     # Fallback to text search if page extraction didn't work
@@ -2874,7 +3130,7 @@ def get_sequences(text: str, model, *, pdf_paths: List[Path] = None, debug_dir:
         log.info("Page extraction did not return text, falling back to text search")
         focused_text = _extract_text_at_locations(
             text, [best_location],
-            context_chars=max(min_length,
+            context_chars=max(min_length, 50000),
             validate_sequences=True
         )

@@ -3152,6 +3408,83 @@ def fetch_sequence_by_name(enzyme_name: str) -> Dict[str, str]:
     return {}


+def _match_variant_ids_with_gemini(
+    lineage_variant_ids: List[str],
+    pdb_variant_ids: List[str],
+    model
+) -> Dict[str, str]:
+    """Use Gemini to match variant IDs that may have slight formatting differences.
+
+    Args:
+        lineage_variant_ids: List of variant IDs from the lineage
+        pdb_variant_ids: List of variant IDs from PDB matching
+        model: Gemini model for matching
+
+    Returns:
+        Dictionary mapping lineage_variant_id -> pdb_variant_id
+    """
+    if not lineage_variant_ids or not pdb_variant_ids or not model:
+        return {}
+
+    # If the lists are identical, return direct mapping
+    if set(lineage_variant_ids) == set(pdb_variant_ids):
+        return {vid: vid for vid in lineage_variant_ids if vid in pdb_variant_ids}
+
+    # Use Gemini to match variant IDs that may have formatting differences
+    prompt = f"""Match variant IDs between two lists that may have slight formatting differences (whitespace, encoding, etc.).
+These represent the same enzyme variants but may be formatted differently.
+
+Lineage variant IDs:
+{json.dumps(lineage_variant_ids, indent=2)}
+
+PDB variant IDs:
+{json.dumps(pdb_variant_ids, indent=2)}
+
+Match variants that represent the SAME enzyme variant, accounting for:
+- Whitespace differences (extra spaces, tabs)
+- Character encoding differences
+- Minor formatting variations
+
+Return ONLY a JSON object mapping lineage IDs to PDB IDs.
+Format: {{"lineage_id": "pdb_id", ...}}
+Only include matches you are confident represent the same variant.
+Return an empty object {{}} if no matches can be confidently made.
+"""
+
+    try:
+        response = model.generate_content(prompt)
+        text = _extract_text(response).strip()
+
+        # Parse JSON response
+        if text.startswith("```"):
+            text = text.split("```")[1].strip()
+            if text.startswith("json"):
+                text = text[4:].strip()
+
+        # Clean up the text
+        text = text.strip()
+        if not text or text == "{}":
+            return {}
+
+        matches = json.loads(text)
+        log.info(f"Gemini matched {len(matches)} variant IDs for PDB assignment")
+
+        # Validate matches
+        valid_matches = {}
+        for lineage_id, pdb_id in matches.items():
+            if lineage_id in lineage_variant_ids and pdb_id in pdb_variant_ids:
+                valid_matches[lineage_id] = pdb_id
+                log.info(f"Variant ID match: {lineage_id} -> {pdb_id}")
+            else:
+                log.warning(f"Invalid match ignored: {lineage_id} -> {pdb_id}")
+
+        return valid_matches
+
+    except Exception as e:
+        log.warning(f"Failed to match variant IDs with Gemini: {e}")
+        return {}
+
+
 def match_pdb_to_variants(
     pdb_sequences: Dict[str, str],
     variants: List[Variant],
@@ -3235,24 +3568,76 @@ Return ONLY the variant_id as a JSON string, e.g.: "ApePgb GLVRSQL"
         text = _extract_text(response).strip()

         # Parse JSON response (expecting a single string)
-
+        # Look for JSON code blocks first
+        if "```json" in text:
+            # Extract content between ```json and ```
+            import re
+            json_match = re.search(r'```json\s*\n?(.*?)\n?```', text, re.DOTALL)
+            if json_match:
+                json_content = json_match.group(1).strip()
+                try:
+                    # Parse as JSON and extract the string value
+                    parsed = json.loads(json_content)
+                    matched_variant = str(parsed).strip('"\'')
+                except:
+                    # If JSON parsing fails, try to extract the quoted string
+                    quoted_match = re.search(r'"([^"]+)"', json_content)
+                    if quoted_match:
+                        matched_variant = quoted_match.group(1)
+                    else:
+                        matched_variant = json_content.strip('"\'')
+            else:
+                matched_variant = text.strip('"\'')
+        elif text.startswith("```"):
+            # Handle other code blocks
             text = text.split("```")[1].strip()
             if text.startswith("json"):
                 text = text[4:].strip()
+            matched_variant = text.strip('"\'')
+        else:
+            # Look for quoted strings in the response
+            import re
+            quoted_match = re.search(r'"([^"]+)"', text)
+            if quoted_match:
+                matched_variant = quoted_match.group(1)
+            else:
+                # Remove quotes if present
+                matched_variant = text.strip('"\'')

-
-        text = text.strip('"\'')
-
-        matched_variant = text
+        log.info(f"Extracted variant name: '{matched_variant}' from response")
         log.info(f"PDB {pdb_id} matched to variant: {matched_variant}")

         # Return mapping with all chains pointing to the same variant
         mapping = {}
-        if matched_variant
-
-
-
+        if matched_variant:
+            # Debug logging
+            variant_ids = [v.variant_id for v in variants]
+            log.info(f"Looking for variant '{matched_variant}' in lineage variants: {variant_ids}")
+
+            # Check if the matched variant exists in the lineage
+            found_variant = any(v.variant_id == matched_variant for v in variants)
+            log.info(f"Variant '{matched_variant}' found in lineage: {found_variant}")
+
+            if found_variant:
+                for chain_id in pdb_sequences:
+                    mapping[matched_variant] = chain_id
+                    log.info(f"Created mapping: {matched_variant} -> {chain_id}")
+                    break  # Only use the first chain
+            else:
+                log.warning(f"Variant '{matched_variant}' not found in lineage variants")
+                # Try fuzzy matching
+                for variant in variants:
+                    if variant.variant_id.strip() == matched_variant.strip():
+                        log.info(f"Found fuzzy match: '{variant.variant_id}' == '{matched_variant}'")
+                        for chain_id in pdb_sequences:
+                            mapping[variant.variant_id] = chain_id
+                            log.info(f"Created fuzzy mapping: {variant.variant_id} -> {chain_id}")
+                            break
+                        break
+        else:
+            log.warning("No matched variant extracted from response")

+        log.info(f"Final mapping result: {mapping}")
         return mapping

     except Exception as e:
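
Several hunks in this release parse JSON out of fence-wrapped LLM responses with slightly different ad-hoc code. A generic helper capturing the shared logic (a sketch under that assumption, not the package's API): strip a BOM, unwrap any ```json fence, then fall back to extracting a quoted string.

```python
# Sketch of a shared fence-stripping JSON parser for LLM responses.
import json
import re

def parse_llm_json(raw: str):
    text = raw.strip().lstrip("\ufeff")  # drop BOM if present
    fenced = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        quoted = re.search(r'"([^"]+)"', text)
        return quoted.group(1) if quoted else text.strip("\"'")
```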
@@ -3634,14 +4019,28 @@ def run_pipeline(
     caption_text = limited_caption_concat(*pdf_paths)
     full_text = limited_concat(*pdf_paths)

+    # Also load separate texts for manuscript and SI
+    manuscript_text = limited_concat(manuscript) if manuscript else None
+    si_text = limited_concat(si_path) if si_path else None
+
     log.info("Loaded %d chars of captions for identification and %d chars of full text for extraction",
              len(caption_text), len(full_text))
+    if manuscript_text:
+        log.info("Loaded %d chars from manuscript", len(manuscript_text))
+    if si_text:
+        log.info("Loaded %d chars from SI", len(si_text))

     # 2. Connect to Gemini -----------------------------------------------------
     model = get_model()

     # 3. Extract lineage (Section 6) ------------------------------------------
-    lineage, campaigns = get_lineage(
+    lineage, campaigns = get_lineage(
+        caption_text, full_text, model,
+        pdf_paths=pdf_paths,
+        debug_dir=debug_dir,
+        manuscript_text=manuscript_text,
+        si_text=si_text
+    )

     if not lineage:
         raise RuntimeError("Pipeline aborted: failed to extract any lineage data")
@@ -3721,12 +4120,40 @@ def run_pipeline(
                     pdb_sequences, lineage, full_text, model, pdb_id
                 )

+                log.info(f"PDB matching result: {variant_to_chain}")
+                log.info(f"Available PDB sequences: {list(pdb_sequences.keys())}")
+                log.info(f"Lineage variants: {[v.variant_id for v in lineage]}")
+
                 # Convert to SequenceBlock objects
                 pdb_seq_blocks = []
-
-
-
-
+
+                # Use Gemini-based matching for robust variant ID comparison
+                if variant_to_chain and model:
+                    # Create a mapping using Gemini for robust string matching
+                    gemini_mapping = _match_variant_ids_with_gemini(
+                        lineage_variant_ids=[v.variant_id for v in lineage],
+                        pdb_variant_ids=list(variant_to_chain.keys()),
+                        model=model
+                    )
+
+                    for variant in lineage:
+                        log.info(f"Processing variant: {variant.variant_id}")
+
+                        # Try direct match first
+                        chain_id = variant_to_chain.get(variant.variant_id)
+                        log.info(f"Direct match for {variant.variant_id}: {chain_id}")
+
+                        # If no direct match, try Gemini-based matching
+                        if not chain_id:
+                            matched_pdb_variant = gemini_mapping.get(variant.variant_id)
+                            log.info(f"Gemini match for {variant.variant_id}: {matched_pdb_variant}")
+                            if matched_pdb_variant:
+                                chain_id = variant_to_chain.get(matched_pdb_variant)
+                                log.info(f"Chain ID from Gemini match: {chain_id}")
+
+                        if chain_id and chain_id in pdb_sequences:
+                            seq_length = len(pdb_sequences[chain_id])
+                            log.info(f"Creating sequence block for {variant.variant_id} with {seq_length} residues from chain {chain_id}")
                             seq_block = SequenceBlock(
                                 variant_id=variant.variant_id,
                                 aa_seq=pdb_sequences[chain_id],
@@ -3737,6 +4164,26 @@ def run_pipeline(
                             )
                             pdb_seq_blocks.append(seq_block)
                             log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+                        else:
+                            log.warning(f"No chain_id found for variant {variant.variant_id} or chain not in sequences")
+                else:
+                    # Fallback to direct matching if no model or no matches
+                    for variant in lineage:
+                        if variant.variant_id in variant_to_chain:
+                            chain_id = variant_to_chain[variant.variant_id]
+                            if chain_id in pdb_sequences:
+                                seq_block = SequenceBlock(
+                                    variant_id=variant.variant_id,
+                                    aa_seq=pdb_sequences[chain_id],
+                                    dna_seq=None,
+                                    confidence=1.0,  # High confidence for PDB sequences
+                                    truncated=False,
+                                    metadata={"source": "PDB", "pdb_id": pdb_id, "chain": chain_id}
+                                )
+                                pdb_seq_blocks.append(seq_block)
+                                log.info(f"Added PDB sequence for {variant.variant_id} from {pdb_id}:{chain_id}")
+
+                log.info(f"PDB sequence blocks created: {len(pdb_seq_blocks)}")

                 if pdb_seq_blocks:
                     # Update the dataframe with PDB sequences
@@ -3746,8 +4193,13 @@ def run_pipeline(
                             df_merged.loc[mask, 'aa_seq'] = seq_block.aa_seq
                             df_merged.loc[mask, 'seq_confidence'] = seq_block.confidence
                             df_merged.loc[mask, 'seq_source'] = seq_block.metadata.get('source', 'PDB')
+                            log.info(f"Updated dataframe with sequence for {seq_block.variant_id}")
+                        else:
+                            log.warning(f"No matching row in dataframe for variant {seq_block.variant_id}")
                     log.info(f"Successfully extracted {len(pdb_seq_blocks)} sequences from PDB {pdb_id}")
                     break
+                else:
+                    log.warning(f"No PDB sequence blocks were created for {pdb_id}")
             else:
                 log.warning(f"No sequences found in PDB {pdb_id}")
         else: