PyPI - debase - Versions diffs - 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

debase/_version.py +1 -1
debase/caption_pattern.py +7 -2
debase/cleanup_sequence.py +34 -6
debase/enzyme_lineage_extractor.py +673 -221
debase/lineage_format.py +55 -6
debase/reaction_info_extractor.py +282 -97
debase/substrate_scope_extractor.py +218 -65
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
debase-0.7.0.dist-info/RECORD +18 -0
debase-0.6.1.dist-info/RECORD +0 -18
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
{debase-0.6.1.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0

debase/lineage_format.py CHANGED Viewed

@@ -30,6 +30,7 @@ from __future__ import annotations
 import argparse
 import csv
+import difflib
 import json
 import logging
 import os
@@ -212,8 +213,8 @@ class VariantRecord:
         return result
-    def ttn_or_yield(self) -> Optional[float]:
-        for col in ("ttn", "yield"):
+    def get_fitness_value(self) -> Optional[float]:
+        for col in ("ttn", "tof", "yield"):
             val = self.row.get(col)
             if val is not None and pd.notna(val):
                 try:
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
                             gemini_matched_count += 1
                             log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
                         else:
-                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+                            # Try fuzzy matching when exact match fails
+                            best_match = None
+                            best_score = 0
+                            # Try all possible keys in seq_lookup
+                            for key in seq_lookup.keys():
+                                if campaign_id in key:  # Only consider keys from same campaign
+                                    # Extract enzyme_id part from composite key
+                                    try:
+                                        _, key_enzyme_id = key.split('_', 1)
+                                    except ValueError:
+                                        continue
+                                    # Calculate similarity score
+                                    score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
+                                    # Always track the highest score
+                                    if score > best_score:
+                                        best_score = score
+                                        best_match = key
+                            # Use the best match regardless of threshold (let user see the score)
+                            if best_match and best_score > 0.5:  # Lower threshold but log the score
+                                idx = entry["idx"]
+                                df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
+                                df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
+                                if seq_lookup[best_match]["nt_sequence"]:
+                                    df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
+                                    df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
+                                # Also copy generation and parent_enzyme_id
+                                df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
+                                df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
+                                # Store the match for later mutation copying
+                                _, matched_enzyme = best_match.split('_', 1)
+                                df.at[idx, "_matched_enzyme_id"] = matched_enzyme
+                                df.at[idx, "_matched_campaign_id"] = campaign_id
+                                gemini_matched_count += 1
+                                log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
+                            else:
+                                log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
             except Exception as e:
                 log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
@@ -1229,9 +1272,15 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         # Fitness type -------------------------------------------------------
         fitness_type = ""
-        if rec.ttn_or_yield() is not None:
+        if rec.get_fitness_value() is not None:
             ttn_val = row.get("ttn")
-            fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
+            tof_val = row.get("tof")
+            if ttn_val is not None and pd.notna(ttn_val):
+                fitness_type = "ttn"
+            elif tof_val is not None and pd.notna(tof_val):
+                fitness_type = "tof"
+            else:
+                fitness_type = "yield"
         # Additional info -----------------------------------------------------
         extra: Dict[str, str] = {
@@ -1252,7 +1301,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
             amino_acid_substitutions=aa_muts,
             nt_sequence=rec.nt_seq,
             aa_sequence=rec.aa_seq,
-            fitness_value=rec.ttn_or_yield(),
+            fitness_value=rec.get_fitness_value(),
             fitness_type=fitness_type,
             cofactor=cofactor,
             reaction_condition=reaction_condition,

debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl