debase 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -213,8 +213,8 @@ class VariantRecord:
213
213
  return result
214
214
 
215
215
 
216
- def ttn_or_yield(self) -> Optional[float]:
217
- for col in ("ttn", "yield"):
216
+ def get_fitness_value(self) -> Optional[float]:
217
+ for col in ("ttn", "tof", "yield"):
218
218
  val = self.row.get(col)
219
219
  if val is not None and pd.notna(val):
220
220
  try:
@@ -1272,9 +1272,15 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
1272
1272
 
1273
1273
  # Fitness type -------------------------------------------------------
1274
1274
  fitness_type = ""
1275
- if rec.ttn_or_yield() is not None:
1275
+ if rec.get_fitness_value() is not None:
1276
1276
  ttn_val = row.get("ttn")
1277
- fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
1277
+ tof_val = row.get("tof")
1278
+ if ttn_val is not None and pd.notna(ttn_val):
1279
+ fitness_type = "ttn"
1280
+ elif tof_val is not None and pd.notna(tof_val):
1281
+ fitness_type = "tof"
1282
+ else:
1283
+ fitness_type = "yield"
1278
1284
 
1279
1285
  # Additional info -----------------------------------------------------
1280
1286
  extra: Dict[str, str] = {
@@ -1295,7 +1301,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
1295
1301
  amino_acid_substitutions=aa_muts,
1296
1302
  nt_sequence=rec.nt_seq,
1297
1303
  aa_sequence=rec.aa_seq,
1298
- fitness_value=rec.ttn_or_yield(),
1304
+ fitness_value=rec.get_fitness_value(),
1299
1305
  fitness_type=fitness_type,
1300
1306
  cofactor=cofactor,
1301
1307
  reaction_condition=reaction_condition,
@@ -29,6 +29,7 @@ import json
29
29
  import logging
30
30
  import os
31
31
  import re
32
+ import subprocess
32
33
  import sys
33
34
  import time
34
35
  from base64 import b64encode, b64decode
@@ -90,6 +91,40 @@ handler.setFormatter(logging.Formatter("%(levelname)s [%(name)s] %(message)s"))
90
91
  LOGGER.addHandler(handler)
91
92
  LOGGER.setLevel(logging.INFO)
92
93
 
94
+ # === OPSIN VALIDATION === -------------------------------------------------
95
+
96
def is_valid_iupac_name_with_opsin(name: str) -> bool:
    """Check whether *name* is an IUPAC name the local OPSIN command can parse.

    The name is piped on stdin to ``opsin -o smi``; it is treated as valid
    when the command exits with status 0 and prints something that looks
    like a SMILES string.

    Args:
        name: Candidate chemical name. Surrounding whitespace is ignored.

    Returns:
        True if OPSIN converted the name to SMILES-like output. False for
        non-string input, names shorter than 3 characters after stripping,
        names that look like compound IDs (e.g. "12a", "S12"), a failed or
        empty conversion, or any error while running the command (missing
        binary, timeout, ...).
    """
    # A truthy non-string (e.g. a float or int) would otherwise raise
    # AttributeError on .strip(); such a value can never be a valid name.
    if not isinstance(name, str):
        return False

    candidate = name.strip()
    if len(candidate) < 3:
        return False

    # Skip if it looks like a compound ID (e.g., "12a", "S12", etc.)
    if re.match(r'^[0-9]+[a-z]?$|^S\d+$', candidate):
        return False

    try:
        # Use local OPSIN command to check if name can be converted to SMILES
        process = subprocess.run(
            ['opsin', '-o', 'smi'],
            input=candidate,
            text=True,
            capture_output=True,
            timeout=30,
        )
    except Exception as e:
        # Best-effort check: any failure to run OPSIN counts as "not valid".
        LOGGER.debug("OPSIN check failed for '%s': %s", name, e)
        return False

    output = process.stdout.strip()
    if process.returncode != 0 or not output:
        return False

    # Check if output looks like a valid SMILES (contains common SMILES characters).
    # NOTE(review): outputs made only of other symbols (e.g. "F", "P") contain
    # none of these characters and are therefore reported as invalid.
    return any(char in output for char in 'CNOS()=[]#+-')
127
+
93
128
  # --- Debug dump helper ----------------------------------------------------
94
129
  def _dump(text: str | bytes, path: Path | str) -> None:
95
130
  """Write `text` / `bytes` to `path`, creating parent dirs as needed."""
@@ -606,23 +641,29 @@ Given the following text sections, identify where the MODEL REACTION information
606
641
  The model reaction is the STANDARD reaction used to evaluate all enzyme variants
607
642
  (not the substrate scope). Look for:
608
643
 
609
- - Sections titled "Model Reaction", "Standard Reaction", "General Procedure"
610
- - Text describing the reaction conditions used for enzyme evolution/screening
611
- - Sections describing which substrates were used as the benchmark
612
- - Compound numbers (e.g., "6a", "7a") used in the model reaction
644
+ - SPECIFIC compound numbers (e.g., "1a", "2a", "3a") used in the model reaction
645
+ - Reaction SCHEMES or FIGURES showing the model reaction with numbered compounds
646
+ - Tables showing reaction conditions with specific compound IDs
647
+ - Sections titled "Model Reaction", "Standard Reaction", "General Procedure" WITH compound numbers
648
+
649
+ CRITICAL REQUIREMENTS:
650
+ 1. The location MUST reference SPECIFIC numbered compounds (not generic descriptions)
651
+ 2. DO NOT use generic locations like "main text" or "introduction"
652
+ 3. MUST be a Figure, Scheme, Table, or specific SI section
653
+ 4. Look for actual compound IDs like "1a + 2a → 3a" or "substrate 1a"
613
654
 
614
655
  Also identify where the IUPAC names for these specific compounds are listed.
615
656
 
616
657
  Respond with a JSON object containing:
617
658
  {
618
659
  "model_reaction_location": {
619
- "location": "section name or description",
660
+ "location": "SPECIFIC Figure/Scheme/Table number (e.g., 'Figure 2a', 'Scheme 1', 'Table S1')",
620
661
  "confidence": 0-100,
621
- "reason": "why this contains the model reaction",
622
- "compound_ids": ["list", "of", "compound", "IDs", "if", "found"]
662
+ "reason": "why this contains the model reaction WITH specific compound IDs",
663
+ "compound_ids": ["list", "of", "SPECIFIC", "compound", "IDs", "found", "e.g.", "1a", "2a", "3a"]
623
664
  },
624
665
  "conditions_location": {
625
- "location": "where reaction conditions are described",
666
+ "location": "SPECIFIC location where reaction conditions are described",
626
667
  "confidence": 0-100
627
668
  },
628
669
  "iupac_location": {
@@ -632,6 +673,11 @@ Respond with a JSON object containing:
632
673
  }
633
674
  }
634
675
 
676
+ IMPORTANT:
677
+ - If no SPECIFIC compound IDs are found, set compound_ids to []
678
+ - The model_reaction_location MUST be a Figure, Scheme, Table, or SI section, NOT "main text"
679
+ - Look for numbered compounds like "1a", "2a", not generic terms like "enol acetates"
680
+
635
681
  Respond ONLY with **minified JSON**. NO markdown fences, no commentary.
636
682
  """)
637
683
 
@@ -642,11 +688,20 @@ This is the reaction used for directed evolution screening, NOT the substrate sc
642
688
  Look for terms like "model reaction", "standard substrate", "benchmark reaction",
643
689
  or the specific reaction mentioned in enzyme screening/evolution sections.
644
690
 
691
+ CRITICAL STEPS FOR COMPOUND IDENTIFICATION:
692
+ 1. ALWAYS look for specific compound IDs/numbers in the model reaction (e.g., "1a", "2a", "3a", "6a", "7a")
693
+ 2. If the text mentions generic terms like "enol acetates" or "silyl enol ethers", search for the SPECIFIC numbered compounds used
694
+ 3. Look in reaction schemes, figures, and experimental sections for numbered compounds
695
+ 4. Common patterns:
696
+ - "compound 1a" or "substrate 1a"
697
+ - Numbers in bold or italics (1a, 2a, etc.)
698
+ - References like "using 1a as substrate"
699
+
645
700
  CRITICAL STEPS FOR IUPAC NAMES:
646
- 1. First identify the compound IDs used in the model reaction (e.g., "6a", "7a")
647
- 2. Then search the provided context for these compound IDs to find their IUPAC names
648
- 3. Look for sections with "Compound 6a", "Product 7a", or similar patterns
649
- 4. The IUPAC names are usually given after the compound ID in parentheses or after a colon
701
+ 1. After finding compound IDs, search the context for these IDs to find their IUPAC names
702
+ 2. Look for sections with "Compound 1a:", "Product 3a:", or similar patterns
703
+ 3. The IUPAC names are usually given after the compound ID in parentheses or after a colon
704
+ 4. If no IUPAC name is found for a compound ID, still include the ID in substrate_list/product_list
650
705
 
651
706
  CRITICAL FOR SUBSTRATE CONCENTRATION:
652
707
  - Look carefully in FIGURES and figure captions for substrate concentration information
@@ -657,10 +712,10 @@ CRITICAL FOR SUBSTRATE CONCENTRATION:
657
712
  - The substrate is the molecule being chemically transformed by the enzyme
658
713
 
659
714
  Return a JSON object with:
660
- * "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["5", "6a"])
661
- * "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents
662
- * "product_list" - Array of product identifiers as used in the paper (e.g., ["7a"])
663
- * "product_iupac_list" - Array of IUPAC names for ALL products formed
715
+ * "substrate_list" - Array of substrate identifiers as used in the paper (e.g., ["1a", "2a"]) - NEVER generic descriptions
716
+ * "substrate_iupac_list" - Array of IUPAC names for ALL substrates/reagents (null if not found)
717
+ * "product_list" - Array of product identifiers as used in the paper (e.g., ["3a"]) - NEVER generic descriptions
718
+ * "product_iupac_list" - Array of IUPAC names for ALL products formed (null if not found)
664
719
  * "reaction_substrate_concentration" - Concentration of actual substrate(s) being transformed, NOT reducing agents like dithionite
665
720
  * "cofactor" - Any cofactors used (e.g., "NADH", "NADPH", "FAD", "heme", etc.) or null if none
666
721
  * "reaction_temperature" - reaction temperature (e.g., "25°C", "room temperature")
@@ -669,7 +724,8 @@ Return a JSON object with:
669
724
  * "reaction_other_conditions" - other important conditions (enzyme loading, reducing agents like dithionite, time, anaerobic, etc.)
670
725
 
671
726
  IMPORTANT:
672
- - Extract the reaction used for ENZYME EVOLUTION/SCREENING (not substrate scope)
727
+ - ALWAYS use specific compound IDs (like "1a", "2a") in substrate_list and product_list, NEVER generic descriptions
728
+ - If you can't find specific compound IDs, look harder in figures, schemes, and experimental sections
673
729
  - Substrate concentration = concentration of chemicals being transformed, NOT reducing agents (dithionite, NADH, etc.)
674
730
  - Maintain correspondence: substrate_list[i] should map to substrate_iupac_list[i], same for products
675
731
  - If a compound ID has no IUPAC name found, still include it in the list with null in the IUPAC list
@@ -783,7 +839,7 @@ Return as JSON:
783
839
  ###############################################################################
784
840
 
785
841
  class ReactionExtractor:
786
- _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\s+s?\d+[a-z]?", re.I)
842
+ _FIG_RE = re.compile(r"(?:supplementary\s+)?fig(?:ure)?\.?\s+s?\d+[a-z]?", re.I)
787
843
  _TAB_RE = re.compile(r"(?:supplementary\s+)?tab(?:le)?\s+s?\d+[a-z]?", re.I)
788
844
 
789
845
  def __init__(self, manuscript: Path, si: Optional[Path], cfg: Config, debug_dir: Optional[Path] = None,
@@ -1408,6 +1464,10 @@ class ReactionExtractor:
1408
1464
  text = ' '.join(text.split())
1409
1465
  # Normalize different dash types
1410
1466
  text = text.replace('–', '-').replace('—', '-')
1467
+ # Normalize pipe character and other special chars
1468
+ text = text.replace('|', ' ').replace('│', ' ')
1469
+ # Remove multiple spaces
1470
+ text = ' '.join(text.split())
1411
1471
  return text
1412
1472
 
1413
1473
  normalized_hint = normalize_for_matching(caption_hint[:100]) # Use first 100 chars
@@ -1904,15 +1964,17 @@ class ReactionExtractor:
1904
1964
  ref_lc = location_str.lower()
1905
1965
  image_b64: Optional[str] = None
1906
1966
 
1907
- # First, validate that the location actually exists in the document
1908
- if not self._validate_location_exists(location_str):
1967
+ # Skip validation entirely when we have a caption hint - trust the vision model
1968
+ if caption_hint:
1969
+ LOGGER.info("Skipping validation - using caption hint for %s", location_str)
1970
+ elif not self._validate_location_exists(location_str):
1909
1971
  LOGGER.warning("Location %s not found in document - skipping", location_str)
1910
1972
  return []
1911
1973
 
1912
1974
  # Add campaign context if available
1913
1975
  campaign_context = ""
1914
1976
  if self.campaign_filter:
1915
- campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\n"
1977
+ campaign_context = f"\n\nIMPORTANT: You are extracting data for the {self.campaign_filter} campaign.\nOnly extract data that is relevant to this specific campaign.\nEXCLUDE reference variants from other publications - only include variants created/tested in THIS study.\n"
1916
1978
 
1917
1979
  if self._TAB_RE.search(ref_lc):
1918
1980
  # For tables, try to extract the page as an image first
@@ -1976,6 +2038,24 @@ class ReactionExtractor:
1976
2038
  prompt = campaign_context + location_context + PROMPT_EXTRACT_FIGURE_METRICS_BATCH.format(enzyme_names=enzyme_names)
1977
2039
  LOGGER.info("Gemini Vision: extracting metrics for %d enzymes from %s…", len(enzyme_list), ref)
1978
2040
  tag = f"extract_metrics_batch_vision"
2041
+
2042
+ # Save the figure image to debug directory
2043
+ if self.debug_dir and isinstance(ref, dict):
2044
+ location_str = ref.get('location', str(ref))
2045
+ else:
2046
+ location_str = str(ref)
2047
+
2048
+ if self.debug_dir:
2049
+ timestamp = int(time.time())
2050
+ img_file = self.debug_dir / f"metrics_extraction_{location_str.replace(' ', '_').replace('.', '')}_{timestamp}.png"
2051
+ try:
2052
+ import base64
2053
+ img_bytes = base64.b64decode(image_b64)
2054
+ with open(img_file, 'wb') as f:
2055
+ f.write(img_bytes)
2056
+ LOGGER.info("Saved metrics extraction figure to: %s", img_file)
2057
+ except Exception as e:
2058
+ LOGGER.warning("Failed to save metrics extraction figure: %s", e)
1979
2059
  else:
1980
2060
  # Add enzyme names to prompt for batch extraction with explicit format requirement
1981
2061
  format_example = '{"enzyme1": {"yield": "99.0%", "ttn": null, ...}, "enzyme2": {"yield": "85.0%", ...}}'
@@ -2112,6 +2192,10 @@ These variants belong to campaign: {self.campaign_filter}
2112
2192
  {campaigns_context}
2113
2193
  Focus on finding the model reaction that was used to evaluate THESE specific variants.
2114
2194
  Different campaigns may use different model reactions.
2195
+
2196
+ CRITICAL: These variants should be from THIS study only!
2197
+ - EXCLUDE any reference variants cited from other publications
2198
+ - Only include variants that were created/engineered in this manuscript
2115
2199
  """
2116
2200
 
2117
2201
  prompt = enzyme_context + PROMPT_FIND_MODEL_REACTION_LOCATION + "\n\n=== CAPTIONS AND SECTIONS ===\n" + all_text + "\n\n=== MANUSCRIPT TEXT PREVIEW ===\n" + ms_preview + "\n\n=== SI TEXT PREVIEW ===\n" + si_preview
@@ -2843,6 +2927,12 @@ These variants belong to campaign: {self.campaign_filter}
2843
2927
  Focus on extracting the model reaction that was used to evaluate THESE specific variants.
2844
2928
  Different campaigns may use different model reactions and substrates.
2845
2929
 
2930
+ CRITICAL: EXCLUDE reference variants from other publications!
2931
+ - Only extract data for variants that were actually tested/created in THIS study
2932
+ - Do NOT include data for reference enzymes cited from other papers
2933
+ - Look for phrases like "from reference", "previously reported", "from [Author] et al." to identify reference variants
2934
+ - Focus ONLY on the variants that were engineered/tested in this manuscript
2935
+
2846
2936
  """
2847
2937
 
2848
2938
  # Include both manuscript and SI text for better coverage
@@ -3042,6 +3132,100 @@ Different campaigns may use different model reactions and substrates.
3042
3132
  ]
3043
3133
  for key in expected_keys:
3044
3134
  data.setdefault(key, None)
3135
+
3136
+ # === OPSIN VALIDATION AND COMPOUND MAPPING FALLBACK ===
3137
+ # Check if the IUPAC names are actually valid using OPSIN
3138
+ needs_compound_mapping = False
3139
+
3140
+ # Check substrate IUPAC names
3141
+ substrate_has_invalid = False
3142
+ if data.get("substrate_list") and isinstance(data["substrate_list"], list):
3143
+ # Check if we have substrate IDs but missing or invalid IUPAC names
3144
+ if not data.get("substrate_iupac_list"):
3145
+ LOGGER.warning("Substrate list exists but no IUPAC names provided")
3146
+ substrate_has_invalid = True
3147
+ else:
3148
+ substrate_names = data["substrate_iupac_list"].split("; ") if isinstance(data["substrate_iupac_list"], str) else []
3149
+ # Check each substrate ID has a valid IUPAC name
3150
+ for i, substrate_id in enumerate(data["substrate_list"]):
3151
+ if i >= len(substrate_names) or not substrate_names[i]:
3152
+ LOGGER.warning(f"No IUPAC name for substrate '{substrate_id}'")
3153
+ substrate_has_invalid = True
3154
+ elif not is_valid_iupac_name_with_opsin(substrate_names[i]):
3155
+ LOGGER.warning(f"Invalid IUPAC name detected for substrate '{substrate_id}': '{substrate_names[i]}'")
3156
+ substrate_has_invalid = True
3157
+
3158
+ if substrate_has_invalid:
3159
+ needs_compound_mapping = True
3160
+ LOGGER.info("Found missing or invalid substrate IUPAC names, will attempt compound mapping")
3161
+
3162
+ # Check product IUPAC names
3163
+ product_has_invalid = False
3164
+ if data.get("product_list") and isinstance(data["product_list"], list):
3165
+ # Check if we have product IDs but missing or invalid IUPAC names
3166
+ if not data.get("product_iupac_list"):
3167
+ LOGGER.warning("Product list exists but no IUPAC names provided")
3168
+ product_has_invalid = True
3169
+ else:
3170
+ product_names = data["product_iupac_list"].split("; ") if isinstance(data["product_iupac_list"], str) else []
3171
+ # Check each product ID has a valid IUPAC name
3172
+ for i, product_id in enumerate(data["product_list"]):
3173
+ if i >= len(product_names) or not product_names[i]:
3174
+ LOGGER.warning(f"No IUPAC name for product '{product_id}'")
3175
+ product_has_invalid = True
3176
+ elif not is_valid_iupac_name_with_opsin(product_names[i]):
3177
+ LOGGER.warning(f"Invalid IUPAC name detected for product '{product_id}': '{product_names[i]}'")
3178
+ product_has_invalid = True
3179
+
3180
+ if product_has_invalid:
3181
+ needs_compound_mapping = True
3182
+ LOGGER.info("Found missing or invalid product IUPAC names, will attempt compound mapping")
3183
+
3184
+ # If we need compound mapping and have substrate/product lists, attempt it
3185
+ if needs_compound_mapping and (data.get("substrate_list") or data.get("product_list")):
3186
+ LOGGER.info("Attempting compound mapping due to invalid IUPAC names")
3187
+
3188
+ # Collect all compound IDs that need mapping
3189
+ compound_ids_to_map = []
3190
+ if data.get("substrate_list") and isinstance(data["substrate_list"], list):
3191
+ compound_ids_to_map.extend(data["substrate_list"])
3192
+ if data.get("product_list") and isinstance(data["product_list"], list):
3193
+ compound_ids_to_map.extend(data["product_list"])
3194
+
3195
+ if compound_ids_to_map:
3196
+ LOGGER.info(f"Attempting to map compound IDs: {compound_ids_to_map}")
3197
+
3198
+ # Use the adaptive compound mapping
3199
+ compound_mappings = self._extract_compound_mappings_adaptive(
3200
+ compound_ids_to_map,
3201
+ campaign_filter=self.campaign_filter
3202
+ )
3203
+
3204
+ # Re-map substrate IUPAC names
3205
+ if data.get("substrate_list") and isinstance(data["substrate_list"], list):
3206
+ mapped_substrates = []
3207
+ for substrate_id in data["substrate_list"]:
3208
+ mapping = compound_mappings.get(substrate_id.lower().strip())
3209
+ if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
3210
+ mapped_substrates.append(mapping.iupac_name)
3211
+ LOGGER.info(f"Successfully mapped substrate '{substrate_id}' to IUPAC: {mapping.iupac_name}")
3212
+
3213
+ if mapped_substrates:
3214
+ data["substrate_iupac_list"] = "; ".join(mapped_substrates)
3215
+ LOGGER.info(f"Updated substrate IUPAC list with {len(mapped_substrates)} valid names")
3216
+
3217
+ # Re-map product IUPAC names
3218
+ if data.get("product_list") and isinstance(data["product_list"], list):
3219
+ mapped_products = []
3220
+ for product_id in data["product_list"]:
3221
+ mapping = compound_mappings.get(product_id.lower().strip())
3222
+ if mapping and mapping.iupac_name and is_valid_iupac_name_with_opsin(mapping.iupac_name):
3223
+ mapped_products.append(mapping.iupac_name)
3224
+ LOGGER.info(f"Successfully mapped product '{product_id}' to IUPAC: {mapping.iupac_name}")
3225
+
3226
+ if mapped_products:
3227
+ data["product_iupac_list"] = "; ".join(mapped_products)
3228
+ LOGGER.info(f"Updated product IUPAC list with {len(mapped_products)} valid names")
3045
3229
 
3046
3230
  return data
3047
3231
 
@@ -3120,21 +3304,10 @@ Different campaigns may use different model reactions and substrates.
3120
3304
  # Extract model reaction for this location - use unified approach
3121
3305
  LOGGER.info("Extracting model reaction for location: %s", best_location['location'])
3122
3306
 
3123
- # Try lineage-specific extraction first
3124
- location_model_reaction = self.find_lineage_model_reaction(
3125
- best_location['location'],
3126
- location_context,
3127
- model_reaction_locations
3128
- )
3129
-
3130
- # Check if lineage extraction was successful
3131
- if location_model_reaction.get('substrate_ids') or location_model_reaction.get('product_ids'):
3132
- LOGGER.info("Using lineage-specific model reaction data")
3133
- model_info = self._extract_lineage_model_info(location_model_reaction, location_enzymes)
3134
- else:
3135
- LOGGER.info("Lineage extraction failed, using comprehensive multimodal extraction")
3136
- # Use the comprehensive multimodal approach as fallback
3137
- model_info = self.gather_model_reaction_info(location_enzymes)
3307
+ # Skip lineage-specific extraction and use comprehensive multimodal extraction directly
3308
+ # The lineage-specific extraction often returns generic substrate classes instead of specific compounds
3309
+ LOGGER.info("Using comprehensive multimodal extraction for model reaction")
3310
+ model_info = self.gather_model_reaction_info(location_enzymes)
3138
3311
 
3139
3312
  LOGGER.info("Model reaction extraction complete for location: %s", best_location['location'])
3140
3313