debase 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/caption_pattern.py +7 -2
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +673 -221
- debase/lineage_format.py +55 -6
- debase/reaction_info_extractor.py +282 -97
- debase/substrate_scope_extractor.py +218 -65
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/METADATA +1 -1
- debase-0.7.0.dist-info/RECORD +18 -0
- debase-0.6.1.dist-info/RECORD +0 -18
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/WHEEL +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/entry_points.txt +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.1.dist-info → debase-0.7.0.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -30,6 +30,7 @@ from __future__ import annotations
|
|
30
30
|
|
31
31
|
import argparse
|
32
32
|
import csv
|
33
|
+
import difflib
|
33
34
|
import json
|
34
35
|
import logging
|
35
36
|
import os
|
@@ -212,8 +213,8 @@ class VariantRecord:
|
|
212
213
|
return result
|
213
214
|
|
214
215
|
|
215
|
-
def
|
216
|
-
for col in ("ttn", "yield"):
|
216
|
+
def get_fitness_value(self) -> Optional[float]:
|
217
|
+
for col in ("ttn", "tof", "yield"):
|
217
218
|
val = self.row.get(col)
|
218
219
|
if val is not None and pd.notna(val):
|
219
220
|
try:
|
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
|
|
726
727
|
gemini_matched_count += 1
|
727
728
|
log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
|
728
729
|
else:
|
729
|
-
|
730
|
+
# Try fuzzy matching when exact match fails
|
731
|
+
best_match = None
|
732
|
+
best_score = 0
|
733
|
+
|
734
|
+
# Try all possible keys in seq_lookup
|
735
|
+
for key in seq_lookup.keys():
|
736
|
+
if campaign_id in key: # Only consider keys from same campaign
|
737
|
+
# Extract enzyme_id part from composite key
|
738
|
+
try:
|
739
|
+
_, key_enzyme_id = key.split('_', 1)
|
740
|
+
except ValueError:
|
741
|
+
continue
|
742
|
+
|
743
|
+
# Calculate similarity score
|
744
|
+
score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
|
745
|
+
|
746
|
+
# Always track the highest score
|
747
|
+
if score > best_score:
|
748
|
+
best_score = score
|
749
|
+
best_match = key
|
750
|
+
|
751
|
+
# Use the best match regardless of threshold (let user see the score)
|
752
|
+
if best_match and best_score > 0.5: # Lower threshold but log the score
|
753
|
+
idx = entry["idx"]
|
754
|
+
df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
|
755
|
+
df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
|
756
|
+
if seq_lookup[best_match]["nt_sequence"]:
|
757
|
+
df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
|
758
|
+
df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
|
759
|
+
|
760
|
+
# Also copy generation and parent_enzyme_id
|
761
|
+
df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
|
762
|
+
df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
|
763
|
+
|
764
|
+
# Store the match for later mutation copying
|
765
|
+
_, matched_enzyme = best_match.split('_', 1)
|
766
|
+
df.at[idx, "_matched_enzyme_id"] = matched_enzyme
|
767
|
+
df.at[idx, "_matched_campaign_id"] = campaign_id
|
768
|
+
|
769
|
+
gemini_matched_count += 1
|
770
|
+
log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
|
771
|
+
else:
|
772
|
+
log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
|
730
773
|
|
731
774
|
except Exception as e:
|
732
775
|
log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
|
@@ -1229,9 +1272,15 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
1229
1272
|
|
1230
1273
|
# Fitness type -------------------------------------------------------
|
1231
1274
|
fitness_type = ""
|
1232
|
-
if rec.
|
1275
|
+
if rec.get_fitness_value() is not None:
|
1233
1276
|
ttn_val = row.get("ttn")
|
1234
|
-
|
1277
|
+
tof_val = row.get("tof")
|
1278
|
+
if ttn_val is not None and pd.notna(ttn_val):
|
1279
|
+
fitness_type = "ttn"
|
1280
|
+
elif tof_val is not None and pd.notna(tof_val):
|
1281
|
+
fitness_type = "tof"
|
1282
|
+
else:
|
1283
|
+
fitness_type = "yield"
|
1235
1284
|
|
1236
1285
|
# Additional info -----------------------------------------------------
|
1237
1286
|
extra: Dict[str, str] = {
|
@@ -1252,7 +1301,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
1252
1301
|
amino_acid_substitutions=aa_muts,
|
1253
1302
|
nt_sequence=rec.nt_seq,
|
1254
1303
|
aa_sequence=rec.aa_seq,
|
1255
|
-
fitness_value=rec.
|
1304
|
+
fitness_value=rec.get_fitness_value(),
|
1256
1305
|
fitness_type=fitness_type,
|
1257
1306
|
cofactor=cofactor,
|
1258
1307
|
reaction_condition=reaction_condition,
|