debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -30,6 +30,7 @@ from __future__ import annotations
30
30
 
31
31
  import argparse
32
32
  import csv
33
+ import difflib
33
34
  import json
34
35
  import logging
35
36
  import os
@@ -212,8 +213,8 @@ class VariantRecord:
212
213
  return result
213
214
 
214
215
 
215
- def ttn_or_yield(self) -> Optional[float]:
216
- for col in ("ttn", "yield"):
216
+ def get_fitness_value(self) -> Optional[float]:
217
+ for col in ("ttn", "tof", "yield"):
217
218
  val = self.row.get(col)
218
219
  if val is not None and pd.notna(val):
219
220
  try:
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
726
727
  gemini_matched_count += 1
727
728
  log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
728
729
  else:
729
- log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
730
+ # Try fuzzy matching when exact match fails
731
+ best_match = None
732
+ best_score = 0
733
+
734
+ # Try all possible keys in seq_lookup
735
+ for key in seq_lookup.keys():
736
+ if campaign_id in key: # Only consider keys from same campaign
737
+ # Extract enzyme_id part from composite key
738
+ try:
739
+ _, key_enzyme_id = key.split('_', 1)
740
+ except ValueError:
741
+ continue
742
+
743
+ # Calculate similarity score
744
+ score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
745
+
746
+ # Always track the highest score
747
+ if score > best_score:
748
+ best_score = score
749
+ best_match = key
750
+
751
+ # Accept the best match only if its similarity score exceeds 0.5; the score is logged either way
752
+ if best_match and best_score > 0.5: # Lower threshold but log the score
753
+ idx = entry["idx"]
754
+ df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
755
+ df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
756
+ if seq_lookup[best_match]["nt_sequence"]:
757
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
758
+ df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
759
+
760
+ # Also copy generation and parent_enzyme_id
761
+ df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
762
+ df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
763
+
764
+ # Store the match for later mutation copying
765
+ _, matched_enzyme = best_match.split('_', 1)
766
+ df.at[idx, "_matched_enzyme_id"] = matched_enzyme
767
+ df.at[idx, "_matched_campaign_id"] = campaign_id
768
+
769
+ gemini_matched_count += 1
770
+ log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
771
+ else:
772
+ log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
730
773
 
731
774
  except Exception as e:
732
775
  log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
@@ -1229,9 +1272,15 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
1229
1272
 
1230
1273
  # Fitness type -------------------------------------------------------
1231
1274
  fitness_type = ""
1232
- if rec.ttn_or_yield() is not None:
1275
+ if rec.get_fitness_value() is not None:
1233
1276
  ttn_val = row.get("ttn")
1234
- fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
1277
+ tof_val = row.get("tof")
1278
+ if ttn_val is not None and pd.notna(ttn_val):
1279
+ fitness_type = "ttn"
1280
+ elif tof_val is not None and pd.notna(tof_val):
1281
+ fitness_type = "tof"
1282
+ else:
1283
+ fitness_type = "yield"
1235
1284
 
1236
1285
  # Additional info -----------------------------------------------------
1237
1286
  extra: Dict[str, str] = {
@@ -1252,7 +1301,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
1252
1301
  amino_acid_substitutions=aa_muts,
1253
1302
  nt_sequence=rec.nt_seq,
1254
1303
  aa_sequence=rec.aa_seq,
1255
- fitness_value=rec.ttn_or_yield(),
1304
+ fitness_value=rec.get_fitness_value(),
1256
1305
  fitness_type=fitness_type,
1257
1306
  cofactor=cofactor,
1258
1307
  reaction_condition=reaction_condition,