debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -30,6 +30,7 @@ from __future__ import annotations
30
30
 
31
31
  import argparse
32
32
  import csv
33
+ import difflib
33
34
  import json
34
35
  import logging
35
36
  import os
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
726
727
  gemini_matched_count += 1
727
728
  log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
728
729
  else:
729
- log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
730
+ # Try fuzzy matching when exact match fails
731
+ best_match = None
732
+ best_score = 0
733
+
734
+ # Try all possible keys in seq_lookup
735
+ for key in seq_lookup.keys():
736
+ if campaign_id in key: # Only consider keys from same campaign
737
+ # Extract enzyme_id part from composite key
738
+ try:
739
+ _, key_enzyme_id = key.split('_', 1)
740
+ except ValueError:
741
+ continue
742
+
743
+ # Calculate similarity score
744
+ score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
745
+
746
+ # Always track the highest score
747
+ if score > best_score:
748
+ best_score = score
749
+ best_match = key
750
+
751
+ # Accept the best candidate only when its similarity exceeds 0.5; the score is logged whether or not it is accepted
752
+ if best_match and best_score > 0.5: # Lower threshold but log the score
753
+ idx = entry["idx"]
754
+ df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
755
+ df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
756
+ if seq_lookup[best_match]["nt_sequence"]:
757
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
758
+ df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
759
+
760
+ # Also copy generation and parent_enzyme_id
761
+ df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
762
+ df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
763
+
764
+ # Store the match for later mutation copying
765
+ _, matched_enzyme = best_match.split('_', 1)
766
+ df.at[idx, "_matched_enzyme_id"] = matched_enzyme
767
+ df.at[idx, "_matched_campaign_id"] = campaign_id
768
+
769
+ gemini_matched_count += 1
770
+ log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
771
+ else:
772
+ log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
730
773
 
731
774
  except Exception as e:
732
775
  log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")