debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,7 @@ import json
29
29
  import logging
30
30
  import os
31
31
  import re
32
+ import subprocess
32
33
  import sys
33
34
  import time
34
35
  from base64 import b64encode, b64decode
@@ -90,6 +91,40 @@ handler.setFormatter(logging.Formatter("%(levelname)s [%(name)s] %(message)s"))
90
91
  LOGGER.addHandler(handler)
91
92
  LOGGER.setLevel(logging.INFO)
92
93
 
94
+ # === OPSIN VALIDATION === -------------------------------------------------
95
+
96
def is_valid_iupac_name_with_opsin(name: str, *, timeout: float = 30) -> bool:
    """Return True if the local ``opsin`` command converts *name* to SMILES.

    The stripped name is written to the stdin of ``opsin -o smi``.  The name
    counts as valid only when the process exits with status 0 and prints
    output that contains at least one SMILES atom or bond character.

    Args:
        name: Candidate IUPAC name.  Non-string values, names shorter than
            three characters after stripping, and bare compound labels such
            as "12a" or "S1" are rejected without invoking OPSIN.
        timeout: Seconds to wait for the OPSIN process before giving up.

    Returns:
        True when OPSIN produced SMILES-like output; False otherwise,
        including when OPSIN is missing, times out, or cannot be run.
    """
    # Guard against None / non-string values before calling str methods.
    if not isinstance(name, str):
        return False

    candidate = name.strip()
    if len(candidate) < 3:
        return False

    # Skip if it looks like a compound ID (e.g., "1a", "S1", etc.)
    if re.match(r'^[0-9]+[a-z]?$|^S\d+$', candidate):
        return False

    try:
        # Use local OPSIN command to check if name can be converted to SMILES
        process = subprocess.run(
            ['opsin', '-o', 'smi'],
            input=candidate,
            text=True,
            capture_output=True,
            timeout=timeout,
        )
    except FileNotFoundError:
        # An absent binary is an environment problem, not a bad name; report
        # it at warning level so it is not filtered out with debug messages.
        LOGGER.warning(
            "OPSIN executable not found; cannot validate IUPAC name '%s'", name
        )
        return False
    except subprocess.TimeoutExpired:
        LOGGER.warning(
            "OPSIN timed out after %s s while checking '%s'", timeout, name
        )
        return False
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # Best effort: any other launch or decoding failure means "not validated".
        LOGGER.debug("OPSIN check failed for '%s': %s", name, e)
        return False

    output = process.stdout.strip()
    if process.returncode != 0 or not output:
        return False

    # Sanity-check that the output resembles SMILES.  The letters cover the
    # SMILES organic subset (B, C, N, O, P, S, F, Cl, Br, I), so results such
    # as "BrBr" or "P" are not rejected for lacking a C/N/O/S atom.
    return any(char in output for char in 'BCNOPSFI()=[]#+-')
127
+
93
128
  # --- Debug dump helper ----------------------------------------------------
94
129
  def _dump(text: str | bytes, path: Path | str) -> None:
95
130
  """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
@@ -442,7 +477,13 @@ PROMPT_FIND_LOCATIONS = dedent("""
442
477
  You are an expert reader of protein engineering manuscripts.
443
478
  Given the following article captions and section titles, identify most promising locations
444
479
  (tables or figures) that contain reaction performance data (yield, TON, TTN, ee,
445
- activity, etc.) for enzyme variants. Use your best judgement to include location showing full evolution lineage data.
480
+ activity, etc.) for enzyme variants.
481
+
482
+ CRITICAL PRIORITY: FULL EVOLUTION LINEAGE DATA IS REQUIRED
483
+ - Look for locations showing data for ALL enzyme variants in the evolution lineage
484
+ - Prioritize sources that show the complete evolutionary progression (parent → child variants)
485
+ - Look for captions mentioning "sequentially evolved", "evolution lineage", "rounds of evolution", "directed evolution progression"
486
+ - Sources showing data for individual variants only (e.g., just the final variant) are LESS VALUABLE than complete lineage data
446
487
 
447
488
  IMPORTANT: Some papers have multiple enzyme lineages/campaigns with different
448
489
  performance data locations. Pay careful attention to:
@@ -450,8 +491,13 @@ performance data locations. Pay careful attention to:
450
491
  - Enzyme name prefixes that indicate different campaigns
451
492
  - Different substrate/product types mentioned in captions
452
493
 
494
+ IMPORTANT FIGURE REFERENCE RULES:
495
+ - For figures, ALWAYS return the main figure number only (e.g., "Figure 2", NOT "Figure 2a" or "Figure 2(a)")
496
+ - The extraction system will handle retrieving the entire figure including all sub-panels
497
+ - For tables, return the complete reference as it appears
498
+
453
499
  Respond with a JSON array where each element contains:
454
- - "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2")
500
+ - "location": the identifier (e.g. "Table S1", "Figure 3", "Table 2", NOT "Figure 3a")
455
501
  - "type": one of "table", "figure"
456
502
  - "confidence": your confidence score (0-100)
457
503
  - "caption": the exact caption text for this location
@@ -459,7 +505,12 @@ Respond with a JSON array where each element contains:
459
505
  - "lineage_hint": any indication of which enzyme group this data is for (or null)
460
506
  - "campaign_clues": specific text in the caption that indicates the campaign (enzyme names, substrate types, etc.)
461
507
 
462
- Tables are generally preferred over figures unless you are convinced that only the figure you find have complete lineage reaction matrix information. Some table don't have performance data, check provided context of the specific table.
508
+ PRIORITIZATION RULES:
509
+ - HIGHEST PRIORITY: Sources showing COMPLETE evolution lineage data (all variants in progression)
510
+ - MEDIUM PRIORITY: Sources showing data for multiple variants (but not complete lineage)
511
+ - LOWEST PRIORITY: Sources showing data for individual variants only
512
+
513
+ Tables are generally preferred over figures unless you are convinced that only the figure contains complete lineage reaction matrix information. Some tables don't have performance data, check provided context of the specific table.
463
514
 
464
515
  IMPORTANT FOR TABLES: When evaluating a table, check if the context below the table shows performance values (TTN, yield, ee, etc.). If the table caption mentions enzymes but the table only shows mutations/sequences, look for performance data in the text immediately following the table. If context below the table shows numerical values, use the table location as it likely contains the referenced data.
465
516
 
@@ -503,6 +554,13 @@ IMPORTANT:
503
554
  - If the table shows different reactions (e.g., pyrrolidine vs indoline), note this in "notes"
504
555
  - If you find conflicting values between bar graphs and text, or multiple sources for the same enzyme, ONLY use the most complete and reliable source (typically the primary figure/table being analyzed)
505
556
 
557
+ CRITICAL: DO NOT CONFUSE DIFFERENT METRICS:
558
+ - Yield (%) measures how much product was formed (0-100%)
559
+ - Selectivity/ee (%) measures enantiomeric excess - the stereoselectivity of the reaction
560
+ - TTN (number) measures total turnovers - how many substrate molecules each enzyme converts
561
+ - These are COMPLETELY DIFFERENT values - a reaction might have 95% yield but 85% ee and 1000 TTN
562
+ - Be extremely careful when extracting from tables/figures with multiple columns or data series
563
+
506
564
  Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
507
565
  """)
508
566
 
@@ -530,6 +588,17 @@ STEP 4: Extract values for each matched variant
530
588
  - CRITICAL: Read actual scale values from the axis labels and tick marks
531
589
  - Verify: taller bars should have higher values, higher dots should have higher values
532
590
 
591
+ CRITICAL DATA ACCURACY REQUIREMENTS:
592
+ - DO NOT CONFUSE yield with selectivity (ee) with TTN values - these are completely different metrics
593
+ - Yield is typically shown as percentage (0-100%)
594
+ - Selectivity/ee is enantiomeric excess, also shown as percentage but measures stereoselectivity
595
+ - TTN (Total Turnover Number) is the number of substrate molecules converted per enzyme molecule
596
+ - Each enzyme variant should have its OWN yield, ee, and TTN values - do not mix values between variants
597
+ - Carefully match each bar/dot to its corresponding enzyme label on the X-axis
598
+ - If looking at grouped bars, ensure you're reading the correct bar for each metric
599
+ - Double-check that variant A's yield is not confused with variant B's yield
600
+ - If values are unclear or ambiguous, return null rather than guessing
601
+
533
602
  Target enzymes to find and extract:
534
603
  {enzyme_names}
535
604
 
@@ -572,23 +641,29 @@ Given the following text sections, identify where the MODEL REACTION information
572
641
  The model reaction is the STANDARD reaction used to evaluate all enzyme variants
573
642
  (not the substrate scope). Look for:
574
643
 
575
- - Sections titled "Model Reaction", "Standard Reaction", "General Procedure"
576
- - Text describing the reaction conditions used for enzyme evolution/screening
577
- - Sections describing which substrates were used as the benchmark
578
- - Compound numbers (e.g., "6a", "7a") used in the model reaction
644
+ - SPECIFIC compound numbers (e.g., "1a", "2a", "3a") used in the model reaction
645
+ - Reaction SCHEMES or FIGURES showing the model reaction with numbered compounds
646
+ - Tables showing reaction conditions with specific compound IDs
647
+ - Sections titled "Model Reaction", "Standard Reaction", "General Procedure" WITH compound numbers
648
+
649
+ CRITICAL REQUIREMENTS:
650
+ 1. The location MUST reference SPECIFIC numbered compounds (not generic descriptions)
651
+ 2. DO NOT use generic locations like "main text" or "introduction"
652
+ 3. MUST be a Figure, Scheme, Table, or specific SI section
653
+ 4. Look for actual compound IDs like "1a + 2a → 3a" or "substrate 1a"
579
654
 
580
655
  Also identify where the IUPAC names for these specific compounds are listed.
581
656
 
582
657
  Respond with a JSON object containing:
583
658
  {
584
659
  "model_reaction_location": {
585
- "location": "section name or description",
660
+ "location": "SPECIFIC Figure/Scheme/Table number (e.g., 'Figure 2a', 'Scheme 1', 'Table S1')",
586
661
  "confidence": 0-100,
587
- "reason": "why this contains the model reaction",
588
- "compound_ids": ["list", "of", "compound", "IDs", "if", "found"]
662
+ "reason": "why this contains the model reaction WITH specific compound IDs",
663
+ "compound_ids": ["list", "of", "SPECIFIC", "compound", "IDs", "found", "e.g.", "1a", "2a", "3a"]
589
664
  },
590
665
  "conditions_location": {
591
- "location": "where reaction conditions are described",
666
+ "location": "SPECIFIC location where reaction conditions are described",
592
667
  "confidence": 0-100
593
668
  },
594
669
  "iupac_location": {
@@ -598,6 +673,11 @@ Respond with a JSON object containing:
598
673
  }
599
674
  }
600
675
 
676
+ IMPORTANT:
677
+ - If no SPECIFIC compound IDs are found, set compound_ids to []
678
+ - The model_reaction_location MUST be a Figure, Scheme, Table, or SI section, NOT "main text"
679
+ - Look for numbered compounds like "1a", "2a", not generic terms like "enol acetates"
680
+
601
681
  Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
602
682
  """)
603
683
 
@@ -608,11 +688,20 @@ This is the reaction used for directed evolution screening, NOT the substrate sc
608
688
  Look for terms like "model reaction", "standard substrate", "benchmark reaction",
609
689
  or the specific reaction mentioned in enzyme screening/evolution sections.
610
690
 
691
+ CRITICAL STEPS FOR COMPOUND IDENTIFICATION:
692
+ 1. ALWAYS look for specific compound IDs/numbers in the model reaction (e.g., "1a", "2a", "3a", "6a", "7a")
693
+ 2. If the text mentions generic terms like "enol acetates" or "silyl enol ethers", search for the SPECIFIC numbered compounds used
694
+ 3. Look in reaction schemes, figures, and experimental sections for numbered compounds
695
+ 4. Common patterns:
696
+ - "compound 1a" or "substrate 1a"
697
+ - Numbers in bold or italics (1a, 2a, etc.)
698
+ - References like "using 1a as substrate"
699
+
611
700
  CRITICAL STEPS FOR IUPAC NAMES:
612
- 1. First identify the compound IDs used in the model reaction (e.g., "6a", "7a")
613
- 2. Then search the provided context for these compound IDs to find their IUPAC names
614
- 3. Look for sections with "Compound 6a", "Product 7a", or similar patterns
615
- 4. The IUPAC names are usually given after the compound ID in parentheses or after a colon
701
+ 1. After finding compound IDs, search the context for these IDs to find their IUPAC names
702
+ 2. Look for sections with "Compound 1a:", "Product 3a:", or similar patterns
703
+ 3. The IUPAC names are usually given after the compound ID in parentheses or after a colon
704
+ 4. If no IUPAC name is found for a compound ID, still include the ID in substrate_list/product_list
616
705
 
617
706
  CRITICAL FOR SUBSTRATE CONCENTRATION:
618
707
  - Look carefully in FIGURES and figure captions for substrate concentration information
@@ -623,10 +712,10 @@ CRITICAL FOR SUBSTRATE CONCENTRATION:
623
712
  - The substrate is the molecule being chemically transformed by the enzyme
624
713
 
625
714
  Return a JSON object with:
626
- * "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["5", "6a"])
627
- * "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents
628
- * "product_list" - Array of product identifiers as used in the paper (e.g., ["7a"])
629
- * "product_iupac_list" - Array of IUPAC names for ALL products formed
715
+ * "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["1a", "2a"]) - NEVER generic descriptions
716
+ * "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents (null if not found)
717
+ * "product_list" - Array of product identifiers as used in the paper (e.g., ["3a"]) - NEVER generic descriptions
718
+ * "product_iupac_list" - Array of IUPAC names for ALL products formed (null if not found)
630
719
  * "reaction_substrate_concentration" - Concentration of actual substrate(s) being transformed, NOT reducing agents like dithionite
631
720
  * "cofactor" - Any cofactors used (e.g., "NADH", "NADPH", "FAD", "heme", etc.) or null if none
632
721
  * "reaction_temperature" - reaction temperature (e.g., "25°C", "room temperature")
@@ -635,7 +724,8 @@ Return a JSON object with:
635
724
  * "reaction_other_conditions" - other important conditions (enzyme loading, reducing agents like dithionite, time, anaerobic, etc.)
636
725
 
637
726
  IMPORTANT:
638
- - Extract the reaction used for ENZYME EVOLUTION/SCREENING (not substrate scope)
727
+ - ALWAYS use specific compound IDs (like "1a", "2a") in substrate_list and product_list, NEVER generic descriptions
728
+ - If you can't find specific compound IDs, look harder in figures, schemes, and experimental sections
639
729
  - Substrate concentration = concentration of chemicals being transformed, NOT reducing agents (dithionite, NADH, etc.)
640
730
  - Maintain correspondence: substrate_list[i] should map to substrate_iupac_list[i], same for products
641
731
  - If a compound ID has no IUPAC name found, still include it in the list with null in the IUPAC list
@@ -749,7 +839,7 @@ Return as JSON:
749
839
  ###############################################################################
750
840
 
751
841
  class ReactionExtractor:
752
- _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
842
+ _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\.?\s+s?\d+[a-z]?", re.I)
753
843
  _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
754
844
 
755
845
  def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
@@ -887,15 +977,22 @@ class ReactionExtractor:
887
977
  campaign_context = f"""
888
978
  IMPORTANT: You are looking for performance data specifically for the {self.campaign_filter} campaign.
889
979
 
890
- Campaign Details:
980
+ CAMPAIGN DETAILS FROM CAMPAIGNS.JSON:
981
+ - Campaign ID: {self.campaign_info.get('campaign_id', '')}
891
982
  - Name: {self.campaign_info.get('campaign_name', '')}
892
983
  - Description: {self.campaign_info.get('description', '')}
893
984
  - Model Substrate: {self.campaign_info.get('model_substrate', '')} (ID: {self.campaign_info.get('substrate_id', '')})
894
985
  - Model Product: {self.campaign_info.get('model_product', '')} (ID: {self.campaign_info.get('product_id', '')})
986
+ - Notes: {self.campaign_info.get('notes', '')}
895
987
 
896
988
  KNOWN DATA LOCATIONS FOR THIS CAMPAIGN: {', '.join(location_hints)}
897
989
  These locations are known to contain relevant data - prioritize them highly.
898
990
 
991
+ CRITICAL REQUIREMENT: For this campaign, you must find locations that contain COMPLETE EVOLUTION LINEAGE DATA.
992
+ - Look for data showing the entire evolutionary progression of enzyme variants
993
+ - Prioritize locations that show performance data for ALL variants in the lineage
994
+ - The campaign description and notes above provide context about the evolution strategy used
995
+
899
996
  {f"ALL CAMPAIGNS IN THIS PAPER: {chr(10).join([f'- {c}' for c in self.all_campaigns])}" if self.all_campaigns else ""}
900
997
 
901
998
  CRITICAL: Only return locations that contain data for this specific campaign.
@@ -1367,6 +1464,10 @@ class ReactionExtractor:
1367
1464
  text = ' '.join(text.split())
1368
1465
  # Normalize different dash types
1369
1466
  text = text.replace('–', '-').replace('—', '-')
1467
+ # Normalize pipe character and other special chars
1468
+ text = text.replace('|', ' ').replace('│', ' ')
1469
+ # Remove multiple spaces
1470
+ text = ' '.join(text.split())
1370
1471
  return text
1371
1472
 
1372
1473
  normalized_hint = normalize_for_matching(caption_hint[:100]) # Use first 100 chars
@@ -1863,15 +1964,17 @@ class ReactionExtractor:
1863
1964
  ref_lc = location_str.lower()
1864
1965
  image_b64: Optional[str] = None
1865
1966
 
1866
- # First, validate that the location actually exists in the document
1867
- if not self._validate_location_exists(location_str):
1967
+ # Skip validation entirely when we have a caption hint - trust the vision model
1968
+ if caption_hint:
1969
+ LOGGER.info("Skipping validation - using caption hint for %s", location_str)
1970
+ elif not self._validate_location_exists(location_str):
1868
1971
  LOGGER.warning("Location %s not found in document - skipping", location_str)
1869
1972
  return []
1870
1973
 
1871
1974
  # Add campaign context if available
1872
1975
  campaign_context = ""
1873
1976
  if self.campaign_filter:
1874
- campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\n"
1977
+ campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\nEXCLUDE reference variants from other publications - only include variants created/tested in THIS study.\n"
1875
1978
 
1876
1979
  if self._TAB_RE.search(ref_lc):
1877
1980
  # For tables, try to extract the page as an image first
@@ -1935,6 +2038,24 @@ class ReactionExtractor:
1935
2038
  prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
1936
2039
  LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
1937
2040
  tag = f"extract_metrics_batch_vision"
2041
+
2042
+ # Save the figure image to debug directory
2043
+ if self.debug_dir and isinstance(ref, dict):
2044
+ location_str = ref.get('location', str(ref))
2045
+ else:
2046
+ location_str = str(ref)
2047
+
2048
+ if self.debug_dir:
2049
+ timestamp = int(time.time())
2050
+ img_file = self.debug_dir / f"metrics_extraction_{location_str.replace(' ', '_').replace('.', '')}_{timestamp}.png"
2051
+ try:
2052
+ import base64
2053
+ img_bytes = base64.b64decode(image_b64)
2054
+ with open(img_file, 'wb') as f:
2055
+ f.write(img_bytes)
2056
+ LOGGER.info("Saved metrics extraction figure to: %s", img_file)
2057
+ except Exception as e:
2058
+ LOGGER.warning("Failed to save metrics extraction figure: %s", e)
1938
2059
  else:
1939
2060
  # Add enzyme names to prompt for batch extraction with explicit format requirement
1940
2061
  format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
@@ -2071,6 +2192,10 @@ These variants belong to campaign: {self.campaign_filter}
2071
2192
  {campaigns_context}
2072
2193
  Focus on finding the model reaction that was used to evaluate THESE specific variants.
2073
2194
  Different campaigns may use different model reactions.
2195
+
2196
+ CRITICAL: These variants should be from THIS study only!
2197
+ - EXCLUDE any reference variants cited from other publications
2198
+ - Only include variants that were created/engineered in this manuscript
2074
2199
  """
2075
2200
 
2076
2201
  prompt = enzyme_context + PROMPT_FIND_MODEL_REACTION_LOCATION + "\n\n=== CAPTIONS AND SECTIONS ===\n" + all_text + "\n\n=== MANUSCRIPT TEXT PREVIEW ===\n" + ms_preview + "\n\n=== SI TEXT PREVIEW ===\n" + si_preview
@@ -2558,41 +2683,17 @@ Do NOT include compound information from other campaigns.
2558
2683
  if not mapping or not mapping.iupac_name:
2559
2684
  missing_compounds.append(cid)
2560
2685
 
2561
- # Tier 2 (skip directly to full search): Full manuscript + SI search with all available figures
2686
+ # Tier 2 (skip directly to full search): Full manuscript + SI search WITHOUT figures
2562
2687
  if missing_compounds:
2563
- LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full search: %s",
2688
+ LOGGER.info("Tier 2: %d compounds still missing IUPAC names, going directly to full text search: %s",
2564
2689
  len(missing_compounds), sorted(missing_compounds))
2565
2690
 
2566
- # Get all available figures for compound structure analysis
2567
- figure_images = {}
2568
-
2569
- # Extract main manuscript figures
2570
- figure_refs = ["Figure 1", "Figure 2", "Figure 3", "Figure 4", "Scheme 1", "Scheme 2", "Scheme 3"]
2571
- for ref in figure_refs:
2572
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2573
- if img_b64:
2574
- figure_images[ref] = img_b64
2575
- LOGGER.info("Retrieved %s for compound mapping", ref)
2576
-
2577
- # Get SI figures
2578
- si_figure_refs = []
2579
- for page in self.si_pages[:10]: # Check first 10 SI pages
2580
- matches = re.findall(r"Figure S\d+|Scheme S\d+", page)
2581
- si_figure_refs.extend(matches[:10]) # Limit to 10 figures
2582
-
2583
- # Extract SI figures
2584
- for ref in set(si_figure_refs):
2585
- if ref not in figure_images:
2586
- img_b64 = self._extract_page_png(ref, extract_figure_only=True)
2587
- if img_b64:
2588
- figure_images[ref] = img_b64
2589
- LOGGER.info("Extracted %s for compound mapping", ref)
2590
-
2591
2691
  # Full text search including ALL pages (manuscript + SI)
2592
2692
  full_text = "\n\n".join(self.all_pages) # Send everything
2593
2693
 
2594
- final_mappings = self._extract_compound_mappings_with_figures(
2595
- full_text, missing_compounds, figure_images, tag_suffix="tier2", campaign_filter=campaign_filter
2694
+ # Use text-only extraction for Tier 2 (no images)
2695
+ final_mappings = self._extract_compound_mappings_from_text(
2696
+ full_text[:100000], missing_compounds, tag_suffix="tier2", campaign_filter=campaign_filter
2596
2697
  )
2597
2698
 
2598
2699
  # Merge final mappings with better compound ID matching
@@ -2826,6 +2927,12 @@ These variants belong to campaign: {self.campaign_filter}
2826
2927
  Focus on extracting the model reaction that was used to evaluate THESE specific variants.
2827
2928
  Different campaigns may use different model reactions and substrates.
2828
2929
 
2930
+ CRITICAL: EXCLUDE reference variants from other publications!
2931
+ - Only extract data for variants that were actually tested/created in THIS study
2932
+ - Do NOT include data for reference enzymes cited from other papers
2933
+ - Look for phrases like "from reference", "previously reported", "from [Author] et al." to identify reference variants
2934
+ - Focus ONLY on the variants that were engineered/tested in this manuscript
2935
+
2829
2936
  """
2830
2937
 
2831
2938
  # Include both manuscript and SI text for better coverage
@@ -2933,34 +3040,6 @@ Different campaigns may use different model reactions and substrates.
2933
3040
  LOGGER.info("Enhancing IUPAC names using compound mappings. Available mappings: %s",
2934
3041
  list(compound_mappings.keys()))
2935
3042
 
2936
- # First, populate IUPAC lists directly from compound mappings based on compound_type
2937
- substrate_iupacs_from_mappings = []
2938
- product_iupacs_from_mappings = []
2939
-
2940
- for mapping in compound_mappings.values():
2941
- if mapping.iupac_name and mapping.compound_type:
2942
- if mapping.compound_type.lower() == "substrate":
2943
- substrate_iupacs_from_mappings.append(mapping.iupac_name)
2944
- LOGGER.info("Added substrate IUPAC from mapping: '%s'", mapping.iupac_name)
2945
- elif mapping.compound_type.lower() == "product":
2946
- product_iupacs_from_mappings.append(mapping.iupac_name)
2947
- LOGGER.info("Added product IUPAC from mapping: '%s'", mapping.iupac_name)
2948
-
2949
- # Initialize or update the IUPAC lists with mapped compounds
2950
- if substrate_iupacs_from_mappings:
2951
- existing_substrates = data.get("substrate_iupac_list", []) or []
2952
- if isinstance(existing_substrates, list):
2953
- data["substrate_iupac_list"] = existing_substrates + substrate_iupacs_from_mappings
2954
- else:
2955
- data["substrate_iupac_list"] = substrate_iupacs_from_mappings
2956
-
2957
- if product_iupacs_from_mappings:
2958
- existing_products = data.get("product_iupac_list", []) or []
2959
- if isinstance(existing_products, list):
2960
- data["product_iupac_list"] = existing_products + product_iupacs_from_mappings
2961
- else:
2962
- data["product_iupac_list"] = product_iupacs_from_mappings
2963
-
2964
3043
  # Try to map substrate/product lists through compound IDs
2965
3044
  substrate_list = data.get("substrate_iupac_list", []) or data.get("substrate_list", [])
2966
3045
  if isinstance(substrate_list, list):
@@ -3053,6 +3132,100 @@ Different campaigns may use different model reactions and substrates.
3053
3132
  ]
3054
3133
  for key in expected_keys:
3055
3134
  data.setdefault(key, None)
3135
+
3136
+ # === OPSIN VALIDATION AND COMPOUND MAPPING FALLBACK ===
3137
+ # Check if the IUPAC names are actually valid using OPSIN
3138
+ needs_compound_mapping = False
3139
+
3140
+ # Check substrate IUPAC names
3141
+ substrate_has_invalid = False
3142
+ if data.get("substrate_list") and isinstance(data["substrate_list"], list):
3143
+ # Check if we have substrate IDs but missing or invalid IUPAC names
3144
+ if not data.get("substrate_iupac_list"):
3145
+ LOGGER.warning("Substrate list exists but no IUPAC names provided")
3146
+ substrate_has_invalid = True
3147
+ else:
3148
+ substrate_names = data["substrate_iupac_list"].split("; ") if isinstance(data["substrate_iupac_list"], str) else []
3149
+ # Check each substrate ID has a valid IUPAC name
3150
+ for i, substrate_id in enumerate(data["substrate_list"]):
3151
+ if i >= len(substrate_names) or not substrate_names[i]:
3152
+ LOGGER.warning(f"No IUPAC name for substrate '{substrate_id}'")
3153
+ substrate_has_invalid = True
3154
+ elif not is_valid_iupac_name_with_opsin(substrate_names[i]):
3155
+ LOGGER.warning(f"Invalid IUPAC name detected for substrate '{substrate_id}': '{substrate_names[i]}'")
3156
+ substrate_has_invalid = True
3157
+
3158
+ if substrate_has_invalid:
3159
+ needs_compound_mapping = True
3160
+ LOGGER.info("Found missing or invalid substrate IUPAC names, will attempt compound mapping")
3161
+
3162
+ # Check product IUPAC names
3163
+ product_has_invalid = False
3164
+ if data.get("product_list") and isinstance(data["product_list"], list):
3165
+ # Check if we have product IDs but missing or invalid IUPAC names
3166
+ if not data.get("product_iupac_list"):
3167
+ LOGGER.warning("Product list exists but no IUPAC names provided")
3168
+ product_has_invalid = True
3169
+ else:
3170
+ product_names = data["product_iupac_list"].split("; ") if isinstance(data["product_iupac_list"], str) else []
3171
+ # Check each product ID has a valid IUPAC name
3172
+ for i, product_id in enumerate(data["product_list"]):
3173
+ if i >= len(product_names) or not product_names[i]:
3174
+ LOGGER.warning(f"No IUPAC name for product '{product_id}'")
3175
+ product_has_invalid = True
3176
+ elif not is_valid_iupac_name_with_opsin(product_names[i]):
3177
+ LOGGER.warning(f"Invalid IUPAC name detected for product '{product_id}': '{product_names[i]}'")
3178
+ product_has_invalid = True
3179
+
3180
+ if product_has_invalid:
3181
+ needs_compound_mapping = True
3182
+ LOGGER.info("Found missing or invalid product IUPAC names, will attempt compound mapping")
3183
+
3184
+ # If we need compound mapping and have substrate/product lists, attempt it
3185
+ if needs_compound_mapping and (data.get("substrate_list") or data.get("product_list")):
3186
+ LOGGER.info("Attempting compound mapping due to invalid IUPAC names")
3187
+
3188
+ # Collect all compound IDs that need mapping
3189
+ compound_ids_to_map = []
3190
+ if data.get("substrate_list") and isinstance(data["substrate_list"], list):
3191
+ compound_ids_to_map.extend(data["substrate_list"])
3192
+ if data.get("product_list") and isinstance(data["product_list"], list):
3193
+ compound_ids_to_map.extend(data["product_list"])
3194
+
3195
+ if compound_ids_to_map:
3196
+ LOGGER.info(f"Attempting to map compound IDs: {compound_ids_to_map}")
3197
+
3198
+ # Use the adaptive compound mapping
3199
+ compound_mappings = self._extract_compound_mappings_adaptive(
3200
+ compound_ids_to_map,
3201
+ campaign_filter=self.campaign_filter
3202
+ )
3203
+
3204
+ # Re-map substrate IUPAC names
3205
+ if data.get("substrate_list") and isinstance(data["substrate_list"], list):
3206
+ mapped_substrates = []
3207
+ for substrate_id in data["substrate_list"]:
3208
+ mapping = compound_mappings.get(substrate_id.lower().strip())
3209
+ if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
3210
+ mapped_substrates.append(mapping.iupac_name)
3211
+ LOGGER.info(f"Successfully mapped substrate '{substrate_id}' to IUPAC: {mapping.iupac_name}")
3212
+
3213
+ if mapped_substrates:
3214
+ data["substrate_iupac_list"] = "; ".join(mapped_substrates)
3215
+ LOGGER.info(f"Updated substrate IUPAC list with {len(mapped_substrates)} valid names")
3216
+
3217
+ # Re-map product IUPAC names
3218
+ if data.get("product_list") and isinstance(data["product_list"], list):
3219
+ mapped_products = []
3220
+ for product_id in data["product_list"]:
3221
+ mapping = compound_mappings.get(product_id.lower().strip())
3222
+ if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
3223
+ mapped_products.append(mapping.iupac_name)
3224
+ LOGGER.info(f"Successfully mapped product '{product_id}' to IUPAC: {mapping.iupac_name}")
3225
+
3226
+ if mapped_products:
3227
+ data["product_iupac_list"] = "; ".join(mapped_products)
3228
+ LOGGER.info(f"Updated product IUPAC list with {len(mapped_products)} valid names")
3056
3229
 
3057
3230
  return data
3058
3231
 
@@ -3131,21 +3304,10 @@ Different campaigns may use different model reactions and substrates.
3131
3304
  # Extract model reaction for this location - use unified approach
3132
3305
  LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
3133
3306
 
3134
- # Try lineage-specific extraction first
3135
- location_model_reaction = self.find_lineage_model_reaction(
3136
- best_location['location'],
3137
- location_context,
3138
- model_reaction_locations
3139
- )
3140
-
3141
- # Check if lineage extraction was successful
3142
- if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
3143
- LOGGER.info("Using lineage-specific model reaction data")
3144
- model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
3145
- else:
3146
- LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
3147
- # Use the comprehensive multimodal approach as fallback
3148
- model_info = self.gather_model_reaction_info(location_enzymes)
3307
+ # Skip lineage-specific extraction and use comprehensive multimodal extraction directly
3308
+ # The lineage-specific extraction often returns generic substrate classes instead of specific compounds
3309
+ LOGGER.info("Using comprehensive multimodal extraction for model reaction")
3310
+ model_info = self.gather_model_reaction_info(location_enzymes)
3149
3311
 
3150
3312
  LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
3151
3313
 
@@ -3571,6 +3733,11 @@ def main() -> None:
3571
3733
  LOGGER.info("Loading enzyme data from CSV…")
3572
3734
  enzyme_df = pd.read_csv(args.lineage_csv)
3573
3735
 
3736
+ # Rename enzyme_id to enzyme if needed
3737
+ if "enzyme_id" in enzyme_df.columns and "enzyme" not in enzyme_df.columns:
3738
+ enzyme_df = enzyme_df.rename(columns={"enzyme_id": "enzyme"})
3739
+ LOGGER.info("Renamed 'enzyme_id' column to 'enzyme' in lineage data")
3740
+
3574
3741
  # Detect campaign information from the enzyme CSV
3575
3742
  if 'campaign_id' in enzyme_df.columns:
3576
3743
  all_campaigns = enzyme_df['campaign_id'].dropna().unique().tolist()
@@ -3601,6 +3768,11 @@ def main() -> None:
3601
3768
  campaign_info=campaign_info)
3602
3769
  df_metrics = extractor.run(enzyme_df)
3603
3770
 
3771
+ # For single campaign, also merge with lineage data
3772
+ if not df_metrics.empty:
3773
+ df_metrics = df_metrics.merge(enzyme_df, on='enzyme', how='left', suffixes=('', '_lineage'))
3774
+ LOGGER.info("Merged metrics with lineage data for single campaign")
3775
+
3604
3776
  elif len(all_campaigns) > 1:
3605
3777
  LOGGER.info("Detected multiple campaigns: %s", all_campaigns)
3606
3778
  all_results = []
@@ -3651,6 +3823,10 @@ def main() -> None:
3651
3823
  # Merge campaign metrics with lineage data
3652
3824
  campaign_final = campaign_metrics.merge(campaign_lineage, on='enzyme', how='left', suffixes=('', '_lineage'))
3653
3825
 
3826
+ # Rename aa_seq to protein_sequence for consistency
3827
+ if 'aa_seq' in campaign_final.columns:
3828
+ campaign_final = campaign_final.rename(columns={'aa_seq': 'protein_sequence'})
3829
+
3654
3830
  # Save campaign-specific file immediately
3655
3831
  output_dir = args.output.parent
3656
3832
  base_name = args.output.stem
@@ -3667,6 +3843,10 @@ def main() -> None:
3667
3843
  # Still save an empty campaign file with lineage data
3668
3844
  campaign_lineage = enzyme_df[enzyme_df['campaign_id'] == campaign].copy()
3669
3845
  if not campaign_lineage.empty:
3846
+ # Rename aa_seq to protein_sequence for consistency
3847
+ if 'aa_seq' in campaign_lineage.columns:
3848
+ campaign_lineage = campaign_lineage.rename(columns={'aa_seq': 'protein_sequence'})
3849
+
3670
3850
  output_dir = args.output.parent
3671
3851
  base_name = args.output.stem
3672
3852
  campaign_file = output_dir / f"{base_name}_{campaign}.csv"
@@ -3697,6 +3877,11 @@ def main() -> None:
3697
3877
  df_final = df_metrics
3698
3878
  LOGGER.info("Using pre-merged campaign data - final dataset has %d rows", len(df_final) if df_final is not None else 0)
3699
3879
 
3880
+ # Rename aa_seq to protein_sequence for consistency
3881
+ if df_final is not None and 'aa_seq' in df_final.columns:
3882
+ df_final = df_final.rename(columns={'aa_seq': 'protein_sequence'})
3883
+ LOGGER.info("Renamed 'aa_seq' column to 'protein_sequence' for consistency")
3884
+
3700
3885
  df_final.to_csv(args.output, index=False)
3701
3886
  LOGGER.info("Saved %d rows -> %s", len(df_final), args.output)
3702
3887