debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/caption_pattern.py +7 -2
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +673 -221
- debase/lineage_format.py +55 -6
- debase/reaction_info_extractor.py +282 -97
- debase/substrate_scope_extractor.py +218 -65
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
- debase-0.7.0.dist-info/RECORD +18 -0
- debase-0.6.1.dist-info/RECORD +0 -18
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0
@@ -29,6 +29,7 @@ import json
|
|
29
29
|
import logging
|
30
30
|
import os
|
31
31
|
import re
|
32
|
+
import subprocess
|
32
33
|
import sys
|
33
34
|
import time
|
34
35
|
from base64 import b64encode, b64decode
|
@@ -90,6 +91,40 @@ handler.setFormatter(logging.Formatter("%(levelname)s [%(name)s] %(message)s"))
|
|
90
91
|
LOGGER.addHandler(handler)
|
91
92
|
LOGGER.setLevel(logging.INFO)
|
92
93
|
|
94
|
+
# === OPSIN VALIDATION === -------------------------------------------------
|
95
|
+
|
96
|
+
def is_valid_iupac_name_with_opsin(name: str) -> bool:
    """Return True if the local OPSIN command converts *name* to SMILES.

    The name is piped to ``opsin -o smi`` on stdin. It is considered a valid
    IUPAC name when the command exits with status 0 and prints a single
    token made up only of characters that can occur in a SMILES string.

    Args:
        name: Candidate chemical name. Empty, non-string, very short
            (< 3 characters) and compound-ID-like values ("1a", "12", "S1")
            are rejected without invoking OPSIN.

    Returns:
        True when OPSIN produced a SMILES string for the name; False in every
        other case, including when OPSIN is missing, fails or times out.
        This function never raises for OPSIN/process failures.
    """
    if not name or not isinstance(name, str):
        return False

    # Collapse embedded line breaks / repeated whitespace (common in text
    # extracted from PDFs) so the command receives exactly one input line;
    # OPSIN's CLI is documented as reading one name per line.
    candidate = " ".join(name.split())
    if len(candidate) < 3:
        return False

    # Skip if it looks like a compound ID (e.g., "1a", "S1", etc.)
    if re.fullmatch(r'[0-9]+[a-z]?|S\d+', candidate):
        return False

    try:
        # Use local OPSIN command to check if name can be converted to SMILES
        process = subprocess.run(
            ['opsin', '-o', 'smi'],
            input=candidate,
            text=True,
            capture_output=True,
            timeout=30
        )
    except FileNotFoundError:
        # A missing executable makes *every* name look invalid; surface that
        # at WARNING level instead of hiding it in DEBUG output.
        LOGGER.warning(
            "OPSIN executable 'opsin' not found on PATH; cannot validate '%s'",
            name,
        )
        return False
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # Timeouts, spawn failures and output-decoding errors: best effort,
        # treat the name as not validated.
        LOGGER.debug("OPSIN check failed for '%s': %s", name, e)
        return False

    output = process.stdout.strip()
    if process.returncode != 0 or not output:
        return False

    # Accept any single token built from the SMILES alphabet. The previous
    # "contains one of CNOS()=[]#+-" test rejected valid results such as
    # "BrBr", "P" or "c1ccccc1" and accepted arbitrary text with a hyphen.
    return re.fullmatch(r'[A-Za-z0-9@+\-\[\]()\\/%=#$:.*]+', output) is not None
|
127
|
+
|
93
128
|
# --- Debug dump helper ----------------------------------------------------
|
94
129
|
def _dump(text: str | bytes, path: Path | str) -> None:
|
95
130
|
"""Write `text` / `bytes` to `path`, creating parent dirs as needed."""
|
@@ -442,7 +477,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
|
|
442
477
|
You are an expert reader of protein engineering manuscripts.
|
443
478
|
Given the following article captions and section titles, identify most promising locations
|
444
479
|
(tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
|
445
|
-
activity, etc.) for enzyme variants.
|
480
|
+
activity, etc.) for enzyme variants.
|
481
|
+
|
482
|
+
CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
|
483
|
+
- Look for locations showing data for ALL enzyme variants in the evolution lineage
|
484
|
+
- Prioritize sources that show the complete evolutionary progression (parent → child variants)
|
485
|
+
- Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
|
486
|
+
- Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
|
446
487
|
|
447
488
|
IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
|
448
489
|
performance data locations. Pay careful attention to:
|
@@ -450,8 +491,13 @@ performance data locations. Pay careful attention to:
|
|
450
491
|
- Enzyme name prefixes that indicate different campaigns
|
451
492
|
- Different substrate/product types mentioned in captions
|
452
493
|
|
494
|
+
IMPORTANT FIGURE REFERENCE RULES:
|
495
|
+
- For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
|
496
|
+
- The extraction system will handle retrieving the entire figure including all sub-panels
|
497
|
+
- For tables, return the complete reference as it appears
|
498
|
+
|
453
499
|
Respond with a JSON array where each element contains:
|
454
|
-
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
|
500
|
+
- "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
|
455
501
|
- "type": one of "table", "figure"
|
456
502
|
- "confidence": your confidence score (0-100)
|
457
503
|
- "caption": the exact caption text for this location
|
@@ -459,7 +505,12 @@ Respond with a JSON array where each element contains:
|
|
459
505
|
- "lineage_hint": any indication of which enzyme group this data is for (or null)
|
460
506
|
- "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
|
461
507
|
|
462
|
-
|
508
|
+
PRIORITIZATION RULES:
|
509
|
+
- HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
|
510
|
+
- MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
|
511
|
+
- LOWEST PRIORITY: Sources showing data for individual variants only
|
512
|
+
|
513
|
+
Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
|
463
514
|
|
464
515
|
IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
|
465
516
|
|
@@ -503,6 +554,13 @@ IMPORTANT:
|
|
503
554
|
- If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
|
504
555
|
- If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
|
505
556
|
|
557
|
+
CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
|
558
|
+
- Yield (%) measures how much product was formed (0-100%)
|
559
|
+
- Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
|
560
|
+
- TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
|
561
|
+
- These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
|
562
|
+
- Be extremely careful when extracting from tables/figures with multiple columns or data series
|
563
|
+
|
506
564
|
Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
|
507
565
|
""")
|
508
566
|
|
@@ -530,6 +588,17 @@ STEP 4: Extract values for each matched variant
|
|
530
588
|
- CRITICAL: Read actual scale values from the axis labels and tick marks
|
531
589
|
- Verify: taller bars should have higher values, higher dots should have higher values
|
532
590
|
|
591
|
+
CRITICAL DATA ACCURACY REQUIREMENTS:
|
592
|
+
- DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
|
593
|
+
- Yield is typically shown as percentage (0-100%)
|
594
|
+
- Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
|
595
|
+
- TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
|
596
|
+
- Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
|
597
|
+
- Carefully match each bar/dot to its corresponding enzyme label on the X-axis
|
598
|
+
- If looking at grouped bars, ensure you're reading the correct bar for each metric
|
599
|
+
- Double-check that variant A's yield is not confused with variant B's yield
|
600
|
+
- If values are unclear or ambiguous, return null rather than guessing
|
601
|
+
|
533
602
|
Target enzymes to find and extract:
|
534
603
|
{enzyme_names}
|
535
604
|
|
@@ -572,23 +641,29 @@ Given the following text sections, identify where the MODEL REACTION information
|
|
572
641
|
The model reaction is the STANDARD reaction used to evaluate all enzyme variants
|
573
642
|
(not the substrate scope). Look for:
|
574
643
|
|
575
|
-
-
|
576
|
-
-
|
577
|
-
-
|
578
|
-
-
|
644
|
+
- SPECIFIC compound numbers (e.g., "1a", "2a", "3a") used in the model reaction
|
645
|
+
- Reaction SCHEMES or FIGURES showing the model reaction with numbered compounds
|
646
|
+
- Tables showing reaction conditions with specific compound IDs
|
647
|
+
- Sections titled "Model Reaction", "Standard Reaction", "General Procedure" WITH compound numbers
|
648
|
+
|
649
|
+
CRITICAL REQUIREMENTS:
|
650
|
+
1. The location MUST reference SPECIFIC numbered compounds (not generic descriptions)
|
651
|
+
2. DO NOT use generic locations like "main text" or "introduction"
|
652
|
+
3. MUST be a Figure, Scheme, Table, or specific SI section
|
653
|
+
4. Look for actual compound IDs like "1a + 2a → 3a" or "substrate 1a"
|
579
654
|
|
580
655
|
Also identify where the IUPAC names for these specific compounds are listed.
|
581
656
|
|
582
657
|
Respond with a JSON object containing:
|
583
658
|
{
|
584
659
|
"model_reaction_location": {
|
585
|
-
"location": "
|
660
|
+
"location": "SPECIFIC Figure/Scheme/Table number (e.g., 'Figure 2a', 'Scheme 1', 'Table S1')",
|
586
661
|
"confidence": 0-100,
|
587
|
-
"reason": "why this contains the model reaction",
|
588
|
-
"compound_ids": ["list", "of", "compound", "IDs", "
|
662
|
+
"reason": "why this contains the model reaction WITH specific compound IDs",
|
663
|
+
"compound_ids": ["list", "of", "SPECIFIC", "compound", "IDs", "found", "e.g.", "1a", "2a", "3a"]
|
589
664
|
},
|
590
665
|
"conditions_location": {
|
591
|
-
"location": "where reaction conditions are described",
|
666
|
+
"location": "SPECIFIC location where reaction conditions are described",
|
592
667
|
"confidence": 0-100
|
593
668
|
},
|
594
669
|
"iupac_location": {
|
@@ -598,6 +673,11 @@ Respond with a JSON object containing:
|
|
598
673
|
}
|
599
674
|
}
|
600
675
|
|
676
|
+
IMPORTANT:
|
677
|
+
- If no SPECIFIC compound IDs are found, set compound_ids to []
|
678
|
+
- The model_reaction_location MUST be a Figure, Scheme, Table, or SI section, NOT "main text"
|
679
|
+
- Look for numbered compounds like "1a", "2a", not generic terms like "enol acetates"
|
680
|
+
|
601
681
|
Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
|
602
682
|
""")
|
603
683
|
|
@@ -608,11 +688,20 @@ This is the reaction used for directed evolution screening, NOT the substrate sc
|
|
608
688
|
Look for terms like "model reaction", "standard substrate", "benchmark reaction",
|
609
689
|
or the specific reaction mentioned in enzyme screening/evolution sections.
|
610
690
|
|
691
|
+
CRITICAL STEPS FOR COMPOUND IDENTIFICATION:
|
692
|
+
1. ALWAYS look for specific compound IDs/numbers in the model reaction (e.g., "1a", "2a", "3a", "6a", "7a")
|
693
|
+
2. If the text mentions generic terms like "enol acetates" or "silyl enol ethers", search for the SPECIFIC numbered compounds used
|
694
|
+
3. Look in reaction schemes, figures, and experimental sections for numbered compounds
|
695
|
+
4. Common patterns:
|
696
|
+
- "compound 1a" or "substrate 1a"
|
697
|
+
- Numbers in bold or italics (1a, 2a, etc.)
|
698
|
+
- References like "using 1a as substrate"
|
699
|
+
|
611
700
|
CRITICAL STEPS FOR IUPAC NAMES:
|
612
|
-
1.
|
613
|
-
2.
|
614
|
-
3.
|
615
|
-
4.
|
701
|
+
1. After finding compound IDs, search the context for these IDs to find their IUPAC names
|
702
|
+
2. Look for sections with "Compound 1a:", "Product 3a:", or similar patterns
|
703
|
+
3. The IUPAC names are usually given after the compound ID in parentheses or after a colon
|
704
|
+
4. If no IUPAC name is found for a compound ID, still include the ID in substrate_list/product_list
|
616
705
|
|
617
706
|
CRITICAL FOR SUBSTRATE CONCENTRATION:
|
618
707
|
- Look carefully in FIGURES and figure captions for substrate concentration information
|
@@ -623,10 +712,10 @@ CRITICAL FOR SUBSTRATE CONCENTRATION:
|
|
623
712
|
- The substrate is the molecule being chemically transformed by the enzyme
|
624
713
|
|
625
714
|
Return a JSON object with:
|
626
|
-
* "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["
|
627
|
-
* "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents
|
628
|
-
* "product_list" - Array of product identifiers as used in the paper (e.g., ["
|
629
|
-
* "product_iupac_list" - Array of IUPAC names for ALL products formed
|
715
|
+
* "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["1a", "2a"]) - NEVER generic descriptions
|
716
|
+
* "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents (null if not found)
|
717
|
+
* "product_list" - Array of product identifiers as used in the paper (e.g., ["3a"]) - NEVER generic descriptions
|
718
|
+
* "product_iupac_list" - Array of IUPAC names for ALL products formed (null if not found)
|
630
719
|
* "reaction_substrate_concentration" - Concentration of actual substrate(s) being transformed, NOT reducing agents like dithionite
|
631
720
|
* "cofactor" - Any cofactors used (e.g., "NADH", "NADPH", "FAD", "heme", etc.) or null if none
|
632
721
|
* "reaction_temperature" - reaction temperature (e.g., "25°C", "room temperature")
|
@@ -635,7 +724,8 @@ Return a JSON object with:
|
|
635
724
|
* "reaction_other_conditions" - other important conditions (enzyme loading, reducing agents like dithionite, time, anaerobic, etc.)
|
636
725
|
|
637
726
|
IMPORTANT:
|
638
|
-
-
|
727
|
+
- ALWAYS use specific compound IDs (like "1a", "2a") in substrate_list and product_list, NEVER generic descriptions
|
728
|
+
- If you can't find specific compound IDs, look harder in figures, schemes, and experimental sections
|
639
729
|
- Substrate concentration = concentration of chemicals being transformed, NOT reducing agents (dithionite, NADH, etc.)
|
640
730
|
- Maintain correspondence: substrate_list[i] should map to substrate_iupac_list[i], same for products
|
641
731
|
- If a compound ID has no IUPAC name found, still include it in the list with null in the IUPAC list
|
@@ -749,7 +839,7 @@ Return as JSON:
|
|
749
839
|
###############################################################################
|
750
840
|
|
751
841
|
class ReactionExtractor:
|
752
|
-
_FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)
|
842
|
+
_FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\.?\s+s?\d+[a-z]?", re.I)
|
753
843
|
_TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
|
754
844
|
|
755
845
|
def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
|
@@ -887,15 +977,22 @@ class ReactionExtractor:
|
|
887
977
|
campaign_context = f"""
|
888
978
|
IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
|
889
979
|
|
890
|
-
|
980
|
+
CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
|
981
|
+
- Campaign ID: {self.campaign_info.get('campaign_id', '')}
|
891
982
|
- Name: {self.campaign_info.get('campaign_name', '')}
|
892
983
|
- Description: {self.campaign_info.get('description', '')}
|
893
984
|
- Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
|
894
985
|
- Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
|
986
|
+
- Notes: {self.campaign_info.get('notes', '')}
|
895
987
|
|
896
988
|
KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
|
897
989
|
These locations are known to contain relevant data - prioritize them highly.
|
898
990
|
|
991
|
+
CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
|
992
|
+
- Look for data showing the entire evolutionary progression of enzyme variants
|
993
|
+
- Prioritize locations that show performance data for ALL variants in the lineage
|
994
|
+
- The campaign description and notes above provide context about the evolution strategy used
|
995
|
+
|
899
996
|
{f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
|
900
997
|
|
901
998
|
CRITICAL: Only return locations that contain data for this specific campaign.
|
@@ -1367,6 +1464,10 @@ class ReactionExtractor:
|
|
1367
1464
|
text = ' '.join(text.split())
|
1368
1465
|
# Normalize different dash types
|
1369
1466
|
text = text.replace('–', '-').replace('—', '-')
|
1467
|
+
# Normalize pipe character and other special chars
|
1468
|
+
text = text.replace('|', ' ').replace('│', ' ')
|
1469
|
+
# Remove multiple spaces
|
1470
|
+
text = ' '.join(text.split())
|
1370
1471
|
return text
|
1371
1472
|
|
1372
1473
|
normalized_hint = normalize_for_matching(caption_hint[:100]) # Use first 100 chars
|
@@ -1863,15 +1964,17 @@ class ReactionExtractor:
|
|
1863
1964
|
ref_lc = location_str.lower()
|
1864
1965
|
image_b64: Optional[str] = None
|
1865
1966
|
|
1866
|
-
#
|
1867
|
-
if
|
1967
|
+
# Skip validation entirely when we have a caption hint - trust the vision model
|
1968
|
+
if caption_hint:
|
1969
|
+
LOGGER.info("Skipping validation - using caption hint for %s", location_str)
|
1970
|
+
elif not self._validate_location_exists(location_str):
|
1868
1971
|
LOGGER.warning("Location %s not found in document - skipping", location_str)
|
1869
1972
|
return []
|
1870
1973
|
|
1871
1974
|
# Add campaign context if available
|
1872
1975
|
campaign_context = ""
|
1873
1976
|
if self.campaign_filter:
|
1874
|
-
campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\n"
|
1977
|
+
campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\nEXCLUDE reference variants from other publications - only include variants created/tested in THIS study.\n"
|
1875
1978
|
|
1876
1979
|
if self._TAB_RE.search(ref_lc):
|
1877
1980
|
# For tables, try to extract the page as an image first
|
@@ -1935,6 +2038,24 @@ class ReactionExtractor:
|
|
1935
2038
|
prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
|
1936
2039
|
LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
|
1937
2040
|
tag = f"extract_metrics_batch_vision"
|
2041
|
+
|
2042
|
+
# Save the figure image to debug directory
|
2043
|
+
if self.debug_dir and isinstance(ref, dict):
|
2044
|
+
location_str = ref.get('location', str(ref))
|
2045
|
+
else:
|
2046
|
+
location_str = str(ref)
|
2047
|
+
|
2048
|
+
if self.debug_dir:
|
2049
|
+
timestamp = int(time.time())
|
2050
|
+
img_file = self.debug_dir / f"metrics_extraction_{location_str.replace(' ', '_').replace('.', '')}_{timestamp}.png"
|
2051
|
+
try:
|
2052
|
+
import base64
|
2053
|
+
img_bytes = base64.b64decode(image_b64)
|
2054
|
+
with open(img_file, 'wb') as f:
|
2055
|
+
f.write(img_bytes)
|
2056
|
+
LOGGER.info("Saved metrics extraction figure to: %s", img_file)
|
2057
|
+
except Exception as e:
|
2058
|
+
LOGGER.warning("Failed to save metrics extraction figure: %s", e)
|
1938
2059
|
else:
|
1939
2060
|
# Add enzyme names to prompt for batch extraction with explicit format requirement
|
1940
2061
|
format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
|
@@ -2071,6 +2192,10 @@ These variants belong to campaign: {self.campaign_filter}
|
|
2071
2192
|
{campaigns_context}
|
2072
2193
|
Focus on finding the model reaction that was used to evaluate THESE specific variants.
|
2073
2194
|
Different campaigns may use different model reactions.
|
2195
|
+
|
2196
|
+
CRITICAL: These variants should be from THIS study only!
|
2197
|
+
- EXCLUDE any reference variants cited from other publications
|
2198
|
+
- Only include variants that were created/engineered in this manuscript
|
2074
2199
|
"""
|
2075
2200
|
|
2076
2201
|
prompt = enzyme_context + PROMPT_FIND_MODEL_REACTION_LOCATION + "\n\n=== CAPTIONS AND SECTIONS ===\n" + all_text + "\n\n=== MANUSCRIPT TEXT PREVIEW ===\n" + ms_preview + "\n\n=== SI TEXT PREVIEW ===\n" + si_preview
|
@@ -2558,41 +2683,17 @@ Do NOT include compound information from other campaigns.
|
|
2558
2683
|
if not mapping or not mapping.iupac_name:
|
2559
2684
|
missing_compounds.append(cid)
|
2560
2685
|
|
2561
|
-
# Tier 2 (skip directly to full search): Full manuscript + SI search
|
2686
|
+
# Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
|
2562
2687
|
if missing_compounds:
|
2563
|
-
LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
|
2688
|
+
LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
|
2564
2689
|
len(missing_compounds), sorted(missing_compounds))
|
2565
2690
|
|
2566
|
-
# Get all available figures for compound structure analysis
|
2567
|
-
figure_images = {}
|
2568
|
-
|
2569
|
-
# Extract main manuscript figures
|
2570
|
-
figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
|
2571
|
-
for ref in figure_refs:
|
2572
|
-
img_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
2573
|
-
if img_b64:
|
2574
|
-
figure_images[ref] = img_b64
|
2575
|
-
LOGGER.info("Retrieved %s for compound mapping", ref)
|
2576
|
-
|
2577
|
-
# Get SI figures
|
2578
|
-
si_figure_refs = []
|
2579
|
-
for page in self.si_pages[:10]: # Check first 10 SI pages
|
2580
|
-
matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
|
2581
|
-
si_figure_refs.extend(matches[:10]) # Limit to 10 figures
|
2582
|
-
|
2583
|
-
# Extract SI figures
|
2584
|
-
for ref in set(si_figure_refs):
|
2585
|
-
if ref not in figure_images:
|
2586
|
-
img_b64 = self._extract_page_png(ref, extract_figure_only=True)
|
2587
|
-
if img_b64:
|
2588
|
-
figure_images[ref] = img_b64
|
2589
|
-
LOGGER.info("Extracted %s for compound mapping", ref)
|
2590
|
-
|
2591
2691
|
# Full text search including ALL pages (manuscript + SI)
|
2592
2692
|
full_text = "\n\n".join(self.all_pages) # Send everything
|
2593
2693
|
|
2594
|
-
|
2595
|
-
|
2694
|
+
# Use text-only extraction for Tier 2 (no images)
|
2695
|
+
final_mappings = self._extract_compound_mappings_from_text(
|
2696
|
+
full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
|
2596
2697
|
)
|
2597
2698
|
|
2598
2699
|
# Merge final mappings with better compound ID matching
|
@@ -2826,6 +2927,12 @@ These variants belong to campaign: {self.campaign_filter}
|
|
2826
2927
|
Focus on extracting the model reaction that was used to evaluate THESE specific variants.
|
2827
2928
|
Different campaigns may use different model reactions and substrates.
|
2828
2929
|
|
2930
|
+
CRITICAL: EXCLUDE reference variants from other publications!
|
2931
|
+
- Only extract data for variants that were actually tested/created in THIS study
|
2932
|
+
- Do NOT include data for reference enzymes cited from other papers
|
2933
|
+
- Look for phrases like "from reference", "previously reported", "from [Author] et al." to identify reference variants
|
2934
|
+
- Focus ONLY on the variants that were engineered/tested in this manuscript
|
2935
|
+
|
2829
2936
|
"""
|
2830
2937
|
|
2831
2938
|
# Include both manuscript and SI text for better coverage
|
@@ -2933,34 +3040,6 @@ Different campaigns may use different model reactions and substrates.
|
|
2933
3040
|
LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
|
2934
3041
|
list(compound_mappings.keys()))
|
2935
3042
|
|
2936
|
-
# First, populate IUPAC lists directly from compound mappings based on compound_type
|
2937
|
-
substrate_iupacs_from_mappings = []
|
2938
|
-
product_iupacs_from_mappings = []
|
2939
|
-
|
2940
|
-
for mapping in compound_mappings.values():
|
2941
|
-
if mapping.iupac_name and mapping.compound_type:
|
2942
|
-
if mapping.compound_type.lower() == "substrate":
|
2943
|
-
substrate_iupacs_from_mappings.append(mapping.iupac_name)
|
2944
|
-
LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
|
2945
|
-
elif mapping.compound_type.lower() == "product":
|
2946
|
-
product_iupacs_from_mappings.append(mapping.iupac_name)
|
2947
|
-
LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
|
2948
|
-
|
2949
|
-
# Initialize or update the IUPAC lists with mapped compounds
|
2950
|
-
if substrate_iupacs_from_mappings:
|
2951
|
-
existing_substrates = data.get("substrate_iupac_list", []) or []
|
2952
|
-
if isinstance(existing_substrates, list):
|
2953
|
-
data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
|
2954
|
-
else:
|
2955
|
-
data["substrate_iupac_list"] = substrate_iupacs_from_mappings
|
2956
|
-
|
2957
|
-
if product_iupacs_from_mappings:
|
2958
|
-
existing_products = data.get("product_iupac_list", []) or []
|
2959
|
-
if isinstance(existing_products, list):
|
2960
|
-
data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
|
2961
|
-
else:
|
2962
|
-
data["product_iupac_list"] = product_iupacs_from_mappings
|
2963
|
-
|
2964
3043
|
# Try to map substrate/product lists through compound IDs
|
2965
3044
|
substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
|
2966
3045
|
if isinstance(substrate_list, list):
|
@@ -3053,6 +3132,100 @@ Different campaigns may use different model reactions and substrates.
|
|
3053
3132
|
]
|
3054
3133
|
for key in expected_keys:
|
3055
3134
|
data.setdefault(key, None)
|
3135
|
+
|
3136
|
+
# === OPSIN VALIDATION AND COMPOUND MAPPING FALLBACK ===
|
3137
|
+
# Check if the IUPAC names are actually valid using OPSIN
|
3138
|
+
needs_compound_mapping = False
|
3139
|
+
|
3140
|
+
# Check substrate IUPAC names
|
3141
|
+
substrate_has_invalid = False
|
3142
|
+
if data.get("substrate_list") and isinstance(data["substrate_list"], list):
|
3143
|
+
# Check if we have substrate IDs but missing or invalid IUPAC names
|
3144
|
+
if not data.get("substrate_iupac_list"):
|
3145
|
+
LOGGER.warning("Substrate list exists but no IUPAC names provided")
|
3146
|
+
substrate_has_invalid = True
|
3147
|
+
else:
|
3148
|
+
substrate_names = data["substrate_iupac_list"].split("; ") if isinstance(data["substrate_iupac_list"], str) else []
|
3149
|
+
# Check each substrate ID has a valid IUPAC name
|
3150
|
+
for i, substrate_id in enumerate(data["substrate_list"]):
|
3151
|
+
if i >= len(substrate_names) or not substrate_names[i]:
|
3152
|
+
LOGGER.warning(f"No IUPAC name for substrate '{substrate_id}'")
|
3153
|
+
substrate_has_invalid = True
|
3154
|
+
elif not is_valid_iupac_name_with_opsin(substrate_names[i]):
|
3155
|
+
LOGGER.warning(f"Invalid IUPAC name detected for substrate '{substrate_id}': '{substrate_names[i]}'")
|
3156
|
+
substrate_has_invalid = True
|
3157
|
+
|
3158
|
+
if substrate_has_invalid:
|
3159
|
+
needs_compound_mapping = True
|
3160
|
+
LOGGER.info("Found missing or invalid substrate IUPAC names, will attempt compound mapping")
|
3161
|
+
|
3162
|
+
# Check product IUPAC names
|
3163
|
+
product_has_invalid = False
|
3164
|
+
if data.get("product_list") and isinstance(data["product_list"], list):
|
3165
|
+
# Check if we have product IDs but missing or invalid IUPAC names
|
3166
|
+
if not data.get("product_iupac_list"):
|
3167
|
+
LOGGER.warning("Product list exists but no IUPAC names provided")
|
3168
|
+
product_has_invalid = True
|
3169
|
+
else:
|
3170
|
+
product_names = data["product_iupac_list"].split("; ") if isinstance(data["product_iupac_list"], str) else []
|
3171
|
+
# Check each product ID has a valid IUPAC name
|
3172
|
+
for i, product_id in enumerate(data["product_list"]):
|
3173
|
+
if i >= len(product_names) or not product_names[i]:
|
3174
|
+
LOGGER.warning(f"No IUPAC name for product '{product_id}'")
|
3175
|
+
product_has_invalid = True
|
3176
|
+
elif not is_valid_iupac_name_with_opsin(product_names[i]):
|
3177
|
+
LOGGER.warning(f"Invalid IUPAC name detected for product '{product_id}': '{product_names[i]}'")
|
3178
|
+
product_has_invalid = True
|
3179
|
+
|
3180
|
+
if product_has_invalid:
|
3181
|
+
needs_compound_mapping = True
|
3182
|
+
LOGGER.info("Found missing or invalid product IUPAC names, will attempt compound mapping")
|
3183
|
+
|
3184
|
+
# If we need compound mapping and have substrate/product lists, attempt it
|
3185
|
+
if needs_compound_mapping and (data.get("substrate_list") or data.get("product_list")):
|
3186
|
+
LOGGER.info("Attempting compound mapping due to invalid IUPAC names")
|
3187
|
+
|
3188
|
+
# Collect all compound IDs that need mapping
|
3189
|
+
compound_ids_to_map = []
|
3190
|
+
if data.get("substrate_list") and isinstance(data["substrate_list"], list):
|
3191
|
+
compound_ids_to_map.extend(data["substrate_list"])
|
3192
|
+
if data.get("product_list") and isinstance(data["product_list"], list):
|
3193
|
+
compound_ids_to_map.extend(data["product_list"])
|
3194
|
+
|
3195
|
+
if compound_ids_to_map:
|
3196
|
+
LOGGER.info(f"Attempting to map compound IDs: {compound_ids_to_map}")
|
3197
|
+
|
3198
|
+
# Use the adaptive compound mapping
|
3199
|
+
compound_mappings = self._extract_compound_mappings_adaptive(
|
3200
|
+
compound_ids_to_map,
|
3201
|
+
campaign_filter=self.campaign_filter
|
3202
|
+
)
|
3203
|
+
|
3204
|
+
# Re-map substrate IUPAC names
|
3205
|
+
if data.get("substrate_list") and isinstance(data["substrate_list"], list):
|
3206
|
+
mapped_substrates = []
|
3207
|
+
for substrate_id in data["substrate_list"]:
|
3208
|
+
mapping = compound_mappings.get(substrate_id.lower().strip())
|
3209
|
+
if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
|
3210
|
+
mapped_substrates.append(mapping.iupac_name)
|
3211
|
+
LOGGER.info(f"Successfully mapped substrate '{substrate_id}' to IUPAC: {mapping.iupac_name}")
|
3212
|
+
|
3213
|
+
if mapped_substrates:
|
3214
|
+
data["substrate_iupac_list"] = "; ".join(mapped_substrates)
|
3215
|
+
LOGGER.info(f"Updated substrate IUPAC list with {len(mapped_substrates)} valid names")
|
3216
|
+
|
3217
|
+
# Re-map product IUPAC names
|
3218
|
+
if data.get("product_list") and isinstance(data["product_list"], list):
|
3219
|
+
mapped_products = []
|
3220
|
+
for product_id in data["product_list"]:
|
3221
|
+
mapping = compound_mappings.get(product_id.lower().strip())
|
3222
|
+
if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
|
3223
|
+
mapped_products.append(mapping.iupac_name)
|
3224
|
+
LOGGER.info(f"Successfully mapped product '{product_id}' to IUPAC: {mapping.iupac_name}")
|
3225
|
+
|
3226
|
+
if mapped_products:
|
3227
|
+
data["product_iupac_list"] = "; ".join(mapped_products)
|
3228
|
+
LOGGER.info(f"Updated product IUPAC list with {len(mapped_products)} valid names")
|
3056
3229
|
|
3057
3230
|
return data
|
3058
3231
|
|
@@ -3131,21 +3304,10 @@ Different campaigns may use different model reactions and substrates.
|
|
3131
3304
|
# Extract model reaction for this location - use unified approach
|
3132
3305
|
LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
|
3133
3306
|
|
3134
|
-
#
|
3135
|
-
|
3136
|
-
|
3137
|
-
|
3138
|
-
model_reaction_locations
|
3139
|
-
)
|
3140
|
-
|
3141
|
-
# Check if lineage extraction was successful
|
3142
|
-
if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
|
3143
|
-
LOGGER.info("Using lineage-specific model reaction data")
|
3144
|
-
model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
|
3145
|
-
else:
|
3146
|
-
LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
|
3147
|
-
# Use the comprehensive multimodal approach as fallback
|
3148
|
-
model_info = self.gather_model_reaction_info(location_enzymes)
|
3307
|
+
# Skip lineage-specific extraction and use comprehensive multimodal extraction directly
|
3308
|
+
# The lineage-specific extraction often returns generic substrate classes instead of specific compounds
|
3309
|
+
LOGGER.info("Using comprehensive multimodal extraction for model reaction")
|
3310
|
+
model_info = self.gather_model_reaction_info(location_enzymes)
|
3149
3311
|
|
3150
3312
|
LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
|
3151
3313
|
|
@@ -3571,6 +3733,11 @@ def main() -> None:
|
|
3571
3733
|
LOGGER.info("Loading enzyme data from CSV…")
|
3572
3734
|
enzyme_df = pd.read_csv(args.lineage_csv)
|
3573
3735
|
|
3736
|
+
# Rename enzyme_id to enzyme if needed
|
3737
|
+
if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
|
3738
|
+
enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
|
3739
|
+
LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
|
3740
|
+
|
3574
3741
|
# Detect campaign information from the enzyme CSV
|
3575
3742
|
if 'campaign_id' in enzyme_df.columns:
|
3576
3743
|
all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
|
@@ -3601,6 +3768,11 @@ def main() -> None:
|
|
3601
3768
|
campaign_info=campaign_info)
|
3602
3769
|
df_metrics = extractor.run(enzyme_df)
|
3603
3770
|
|
3771
|
+
# For single campaign, also merge with lineage data
|
3772
|
+
if not df_metrics.empty:
|
3773
|
+
df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
|
3774
|
+
LOGGER.info("Merged metrics with lineage data for single campaign")
|
3775
|
+
|
3604
3776
|
elif len(all_campaigns) > 1:
|
3605
3777
|
LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
|
3606
3778
|
all_results = []
|
@@ -3651,6 +3823,10 @@ def main() -> None:
|
|
3651
3823
|
# Merge campaign metrics with lineage data
|
3652
3824
|
campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
|
3653
3825
|
|
3826
|
+
# Rename aa_seq to protein_sequence for consistency
|
3827
|
+
if 'aa_seq' in campaign_final.columns:
|
3828
|
+
campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
|
3829
|
+
|
3654
3830
|
# Save campaign-specific file immediately
|
3655
3831
|
output_dir = args.output.parent
|
3656
3832
|
base_name = args.output.stem
|
@@ -3667,6 +3843,10 @@ def main() -> None:
|
|
3667
3843
|
# Still save an empty campaign file with lineage data
|
3668
3844
|
campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
|
3669
3845
|
if not campaign_lineage.empty:
|
3846
|
+
# Rename aa_seq to protein_sequence for consistency
|
3847
|
+
if 'aa_seq' in campaign_lineage.columns:
|
3848
|
+
campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
|
3849
|
+
|
3670
3850
|
output_dir = args.output.parent
|
3671
3851
|
base_name = args.output.stem
|
3672
3852
|
campaign_file = output_dir / f"{base_name}_{campaign}.csv"
|
@@ -3697,6 +3877,11 @@ def main() -> None:
|
|
3697
3877
|
df_final = df_metrics
|
3698
3878
|
LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
|
3699
3879
|
|
3880
|
+
# Rename aa_seq to protein_sequence for consistency
|
3881
|
+
if df_final is not None and 'aa_seq' in df_final.columns:
|
3882
|
+
df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
|
3883
|
+
LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
|
3884
|
+
|
3700
3885
|
df_final.to_csv(args.output, index=False)
|
3701
3886
|
LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
|
3702
3887
|
|