debase 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/campaign_utils.py +146 -0
- debase/caption_pattern.py +44 -0
- debase/cleanup_sequence.py +34 -6
- debase/enzyme_lineage_extractor.py +481 -106
- debase/lineage_format.py +44 -1
- debase/reaction_info_extractor.py +479 -135
- debase/substrate_scope_extractor.py +207 -80
- debase/wrapper.py +3 -3
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/METADATA +1 -1
- debase-0.6.2.dist-info/RECORD +18 -0
- debase-0.6.0.dist-info/RECORD +0 -16
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/WHEEL +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/entry_points.txt +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.6.0.dist-info → debase-0.6.2.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -30,6 +30,7 @@ from __future__ import annotations
|
|
30
30
|
|
31
31
|
import argparse
|
32
32
|
import csv
|
33
|
+
import difflib
|
33
34
|
import json
|
34
35
|
import logging
|
35
36
|
import os
|
@@ -726,7 +727,49 @@ Only include matches you are confident about. If no match exists, omit that enzy
|
|
726
727
|
gemini_matched_count += 1
|
727
728
|
log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
|
728
729
|
else:
|
729
|
-
|
730
|
+
# Try fuzzy matching when exact match fails
|
731
|
+
best_match = None
|
732
|
+
best_score = 0
|
733
|
+
|
734
|
+
# Try all possible keys in seq_lookup
|
735
|
+
for key in seq_lookup.keys():
|
736
|
+
if campaign_id in key: # Only consider keys from same campaign
|
737
|
+
# Extract enzyme_id part from composite key
|
738
|
+
try:
|
739
|
+
_, key_enzyme_id = key.split('_', 1)
|
740
|
+
except ValueError:
|
741
|
+
continue
|
742
|
+
|
743
|
+
# Calculate similarity score
|
744
|
+
score = difflib.SequenceMatcher(None, matched_id.lower(), key_enzyme_id.lower()).ratio()
|
745
|
+
|
746
|
+
# Always track the highest score
|
747
|
+
if score > best_score:
|
748
|
+
best_score = score
|
749
|
+
best_match = key
|
750
|
+
|
751
|
+
# Accept the best match only when its score exceeds 0.5; the best score is logged either way
|
752
|
+
if best_match and best_score > 0.5: # Lower threshold but log the score
|
753
|
+
idx = entry["idx"]
|
754
|
+
df.at[idx, "protein_sequence"] = seq_lookup[best_match]["aa_sequence"]
|
755
|
+
df.at[idx, "aa_sequence"] = seq_lookup[best_match]["aa_sequence"]
|
756
|
+
if seq_lookup[best_match]["nt_sequence"]:
|
757
|
+
df.at[idx, "nucleotide_sequence"] = seq_lookup[best_match]["nt_sequence"]
|
758
|
+
df.at[idx, "nt_sequence"] = seq_lookup[best_match]["nt_sequence"]
|
759
|
+
|
760
|
+
# Also copy generation and parent_enzyme_id
|
761
|
+
df.at[idx, "generation"] = seq_lookup[best_match]["generation"]
|
762
|
+
df.at[idx, "parent_enzyme_id"] = seq_lookup[best_match]["parent_enzyme_id"]
|
763
|
+
|
764
|
+
# Store the match for later mutation copying
|
765
|
+
_, matched_enzyme = best_match.split('_', 1)
|
766
|
+
df.at[idx, "_matched_enzyme_id"] = matched_enzyme
|
767
|
+
df.at[idx, "_matched_campaign_id"] = campaign_id
|
768
|
+
|
769
|
+
gemini_matched_count += 1
|
770
|
+
log.info(f"Fuzzy matched '{substrate_id}' -> '{matched_enzyme}' (score: {best_score:.2f}) in campaign {campaign_id}")
|
771
|
+
else:
|
772
|
+
log.warning(f"No fuzzy match found for Gemini suggested '{matched_id}' in campaign {campaign_id} (best score: {best_score:.2f})")
|
730
773
|
|
731
774
|
except Exception as e:
|
732
775
|
log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
|