debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +985 -100
- debase/lineage_format.py +226 -13
- debase/reaction_info_extractor.py +178 -34
- debase/substrate_scope_extractor.py +52 -4
- debase/wrapper.py +155 -151
- debase-0.4.5.dist-info/METADATA +121 -0
- debase-0.4.5.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -553,6 +553,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
     This function:
     1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
     2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    3. Uses Gemini API for intelligent matching when exact matches fail
     """
     # Step 1: Clean up 3a data format
     log.info("Cleaning up reaction data (3a) format...")
@@ -564,6 +565,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:

     # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}
+    campaign_enzymes = {}  # Track enzymes by campaign for Gemini matching

     # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
     reaction_entries = df[df.get("data_type") == "lineage"]
@@ -584,7 +586,9 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             "aa_sequence": aa_seq,
             "nt_sequence": nt_seq if nt_seq != "nan" else "",
             "campaign_id": campaign_id,
-            "enzyme_id": eid
+            "enzyme_id": eid,
+            "generation": str(row.get("generation", "")),
+            "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
         }

         # Also keep simple enzyme_id lookup as fallback
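Reviewer note: after this change each seq_lookup entry also carries lineage metadata. A minimal illustrative sketch of one entry (the composite key format is inferred from the composite_key construction used later in this file; all values below are invented):

    # Hypothetical example entry, keyed by f"{campaign_id}_{enzyme_id}"
    seq_lookup["campaign_A_P411-HF"] = {
        "aa_sequence": "MTIKEMPQPK...",      # placeholder amino acid sequence
        "nt_sequence": "",                   # empty when the source value was "nan"
        "campaign_id": "campaign_A",
        "enzyme_id": "P411-HF",
        "generation": "3",                   # newly propagated by this diff
        "parent_enzyme_id": "P411-CIS",      # newly propagated by this diff
    }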
@@ -592,16 +596,41 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             "aa_sequence": aa_seq,
             "nt_sequence": nt_seq if nt_seq != "nan" else "",
             "campaign_id": campaign_id,
-            "enzyme_id": eid
+            "enzyme_id": eid,
+            "generation": str(row.get("generation", "")),
+            "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
         }
+
+        # Track enzymes by campaign for Gemini matching
+        if campaign_id not in campaign_enzymes:
+            campaign_enzymes[campaign_id] = []
+        campaign_enzymes[campaign_id].append({
+            "enzyme_id": eid,
+            "has_sequence": True,
+            "generation": str(row.get("generation", "")),
+            "parent_id": str(row.get("parent_enzyme_id", ""))
+        })

     log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")

+    # Setup Gemini if available
+    gemini_model = None
+    if GEMINI_OK and GEMINI_API_KEY:
+        try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+            log.info("Gemini API configured for intelligent enzyme matching")
+        except Exception as e:
+            log.warning(f"Failed to configure Gemini API: {e}")
+
     # Step 3: Fill missing sequences in substrate scope entries (3b)
     substrate_entries = df[df.get("data_type") == "substrate_scope"]
     log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")

     filled_count = 0
+    gemini_matched_count = 0
+    unmatched_enzymes = []  # Track enzymes that need Gemini matching
+
     for idx, row in df.iterrows():
         if row.get("data_type") != "substrate_scope":
             continue
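Reviewer note: the lookup built above is consulted in two ways in the following hunks — a campaign-scoped composite key first, then a bare enzyme_id fallback, with Gemini matching only for what remains. A standalone sketch of that resolution order (the helper name and exact nesting are hypothetical; the key formats match the diff):

    def resolve_sequence(seq_lookup, campaign_id, enzyme_id):
        """Illustrative only: exact campaign-scoped match, then id-only fallback."""
        composite_key = f"{campaign_id}_{enzyme_id}"
        if composite_key in seq_lookup:
            return seq_lookup[composite_key]   # exact match within the campaign
        if enzyme_id in seq_lookup:
            return seq_lookup[enzyme_id]       # fallback: enzyme_id-only lookup
        return None                            # left for Gemini matching (Step 4)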
@@ -620,6 +649,8 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             if seq_lookup[composite_key]["nt_sequence"]:
                 df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
                 df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
             filled_count += 1
             log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")

@@ -630,18 +661,182 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
             if seq_lookup[eid]["nt_sequence"]:
                 df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
                 df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+            df.at[idx, "generation"] = seq_lookup[eid]["generation"]
+            df.at[idx, "parent_enzyme_id"] = seq_lookup[eid]["parent_enzyme_id"]
             filled_count += 1
             log.debug(f"Filled sequence for {eid} (fallback lookup)")

         else:
-
+            # Collect for Gemini matching
+            unmatched_enzymes.append({
+                "idx": idx,
+                "enzyme_id": eid,
+                "campaign_id": campaign_id
+            })
+
+    # Step 4: Use Gemini for intelligent matching of unmatched enzymes
+    if unmatched_enzymes and gemini_model:
+        log.info(f"Using Gemini to intelligently match {len(unmatched_enzymes)} unmatched enzymes")
+
+        # Group unmatched enzymes by campaign
+        unmatched_by_campaign = {}
+        for entry in unmatched_enzymes:
+            cid = entry["campaign_id"]
+            if cid not in unmatched_by_campaign:
+                unmatched_by_campaign[cid] = []
+            unmatched_by_campaign[cid].append(entry)
+
+        # Process each campaign
+        for campaign_id, entries in unmatched_by_campaign.items():
+            if campaign_id not in campaign_enzymes or not campaign_enzymes[campaign_id]:
+                log.warning(f"No enzymes with sequences found in campaign {campaign_id}")
+                continue
+
+            # Get enzyme IDs that need matching
+            unmatched_ids = [e["enzyme_id"] for e in entries]
+
+            # Get available enzymes in this campaign
+            available_ids = [e["enzyme_id"] for e in campaign_enzymes[campaign_id] if e["has_sequence"]]
+
+            if not available_ids:
+                log.warning(f"No enzymes with sequences available in campaign {campaign_id}")
+                continue
+
+            # Create prompt for Gemini
+            prompt = f"""Match enzyme variant IDs from substrate scope data to their corresponding sequences in reaction data.
+These are from the same campaign ({campaign_id}) but may use slightly different naming conventions.
+
+Enzymes needing sequences (from substrate scope):
+{json.dumps(unmatched_ids, indent=2)}
+
+Enzymes with sequences available (from reaction data):
+{json.dumps(available_ids, indent=2)}
+
+Match each enzyme from the first list to its corresponding enzyme in the second list.
+Consider variations like:
+- Case differences (p411-hf vs P411-HF)
+- Underscore vs hyphen (p411_hf vs p411-hf)
+- Additional prefixes/suffixes
+- Similar naming patterns within the campaign
+
+Return ONLY a JSON object mapping substrate scope IDs to reaction data IDs:
+{{"substrate_scope_id": "reaction_data_id", ...}}
+
+Only include matches you are confident about. If no match exists, omit that enzyme.
+"""
+
+            try:
+                response = gemini_model.generate_content(prompt)
+                mapping_text = response.text.strip()
+
+                # Extract JSON from response
+                if '```json' in mapping_text:
+                    mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+                elif '```' in mapping_text:
+                    mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+
+                mapping = json.loads(mapping_text)
+
+                # Apply the matches
+                for entry in entries:
+                    substrate_id = entry["enzyme_id"]
+                    if substrate_id in mapping:
+                        matched_id = mapping[substrate_id]
+                        composite_key = f"{campaign_id}_{matched_id}"
+
+                        if composite_key in seq_lookup:
+                            idx = entry["idx"]
+                            df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            if seq_lookup[composite_key]["nt_sequence"]:
+                                df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                                df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+
+                            # Also copy generation and parent_enzyme_id
+                            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
+
+                            # Store the match for later mutation copying
+                            df.at[idx, "_matched_enzyme_id"] = matched_id
+                            df.at[idx, "_matched_campaign_id"] = campaign_id
+
+                            gemini_matched_count += 1
+                            log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
+                        else:
+                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+
+            except Exception as e:
+                log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
+
+    # Final logging
+    total_filled = filled_count + gemini_matched_count
+    if total_filled > 0:
+        log.info(f"Successfully filled sequences for {total_filled} substrate scope entries "
+                 f"({filled_count} exact matches, {gemini_matched_count} Gemini matches)")

-
-
+    # Log any remaining unmatched
+    for entry in unmatched_enzymes:
+        if not any(df.at[entry["idx"], col] for col in ["protein_sequence", "aa_sequence"]
+                   if col in df.columns and df.at[entry["idx"], col]):
+            log.warning(f"No sequence found for enzyme_id={entry['enzyme_id']} in campaign {entry['campaign_id']}")

     return df


+def _copy_mutations_from_matched_enzymes(out_df: pd.DataFrame, orig_df: pd.DataFrame) -> pd.DataFrame:
+    """Copy nucleotide_mutation and amino_acid_substitutions from matched enzymes.
+
+    This function looks for entries that were matched by Gemini and copies their
+    mutation information from the corresponding matched enzyme.
+    """
+    # Look for entries with _matched_enzyme_id (these were matched by Gemini)
+    if "_matched_enzyme_id" not in orig_df.columns:
+        return out_df
+
+    matched_entries = orig_df[orig_df["_matched_enzyme_id"].notna()]
+
+    if len(matched_entries) == 0:
+        return out_df
+
+    log.info(f"Copying mutations for {len(matched_entries)} Gemini-matched entries")
+
+    # Create a lookup of mutations from the output dataframe
+    mutation_lookup = {}
+    for idx, row in out_df.iterrows():
+        key = f"{row['campaign_id']}_{row['id']}"  # 'id' is the enzyme_id in output
+        mutation_lookup[key] = {
+            "nucleotide_mutation": row.get("nucleotide_mutation", ""),
+            "amino_acid_substitutions": row.get("amino_acid_substitutions", "")
+        }
+
+    # Copy mutations for matched entries
+    mutations_copied = 0
+    for idx, row in out_df.iterrows():
+        # Check if this row needs mutation copying
+        # Find the original row in orig_df with the same enzyme_id and campaign_id
+        orig_mask = (orig_df["enzyme_id"] == row["id"]) & (orig_df["campaign_id"] == row["campaign_id"])
+        orig_rows = orig_df[orig_mask]
+
+        if len(orig_rows) > 0 and "_matched_enzyme_id" in orig_rows.columns:
+            orig_row = orig_rows.iloc[0]
+            if pd.notna(orig_row.get("_matched_enzyme_id")):
+                # This was a Gemini-matched entry
+                matched_id = orig_row["_matched_enzyme_id"]
+                matched_campaign = orig_row["_matched_campaign_id"]
+                lookup_key = f"{matched_campaign}_{matched_id}"
+
+                if lookup_key in mutation_lookup:
+                    out_df.at[idx, "nucleotide_mutation"] = mutation_lookup[lookup_key]["nucleotide_mutation"]
+                    out_df.at[idx, "amino_acid_substitutions"] = mutation_lookup[lookup_key]["amino_acid_substitutions"]
+                    mutations_copied += 1
+                    log.debug(f"Copied mutations for {row['id']} from {matched_id}")
+
+    if mutations_copied > 0:
+        log.info(f"Successfully copied mutations for {mutations_copied} entries")
+
+    return out_df
+
+
 def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
     """Use Gemini API to identify parent enzymes for entries with missing parent information."""
     if not GEMINI_OK:
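Reviewer note: the Gemini reply is expected to be a bare JSON object mapping substrate-scope IDs to reaction-data IDs (optionally wrapped in a code fence, which the parser above strips). A hypothetical round trip, with invented enzyme and campaign IDs, looks like this:

    import json

    # Illustrative only - all IDs are invented
    mapping_text = '{"P411 HF-G8": "p411_hf_g8"}'     # what Gemini might return
    mapping = json.loads(mapping_text)
    matched_id = mapping["P411 HF-G8"]                 # "p411_hf_g8"
    composite_key = f"campaign_A_{matched_id}"         # "campaign_A_p411_hf_g8"
    # If composite_key is present in seq_lookup, the sequences, generation and
    # parent_enzyme_id are copied onto the substrate scope row, and the match is
    # recorded in _matched_enzyme_id / _matched_campaign_id so that
    # _copy_mutations_from_matched_enzymes can reuse it after flattening.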
@@ -885,11 +1080,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)

-    #
-    df = _identify_parents_with_gemini(df)
-
-    # Fill sequences again after parent identification to propagate sequences from identified parents
-    df = _fill_missing_sequences(df)
+    # Note: Removed parent identification - we only want exact variant matching

     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)
@@ -992,15 +1183,33 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         if generation != "0":
             for cid, cmap in campaign_idmap.items():
                 if cid == campaign_id:
+                    # First try to find generation 0
                     for enzyme_id, enzyme_row in cmap.items():
                         enzyme_gen = str(enzyme_row.get("generation", "")).strip()
                         if enzyme_gen == "0" or enzyme_gen == "0.0":
                             reference_row = enzyme_row
                             log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
                             break
+
+                    # If no generation 0 found, find the earliest generation
+                    if not reference_row:
+                        earliest_gen = float('inf')
+                        earliest_enzyme = None
+                        for enzyme_id, enzyme_row in cmap.items():
+                            try:
+                                enzyme_gen = float(str(enzyme_row.get("generation", "")).strip())
+                                if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
+                                    earliest_gen = enzyme_gen
+                                    earliest_enzyme = enzyme_id
+                                    reference_row = enzyme_row
+                            except (ValueError, AttributeError):
+                                continue
+
+                        if reference_row:
+                            log.info(f"No generation 0 found in campaign {campaign_id}, using generation {earliest_gen} enzyme {earliest_enzyme} as reference for {eid}")
+                        else:
+                            log.warning(f"No suitable reference enzyme found in campaign {campaign_id} for {eid}")
                     break
-            if not reference_row:
-                log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")

         reference_aa = ""
         reference_nt = ""
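Reviewer note: a small worked example of the new fallback (enzyme IDs and generations are invented). If the campaign map holds no generation-0 entry and the current variant is generation 3, the scan selects the earliest generation strictly below 3:

    # Illustrative only: current variant is generation "3"
    cmap = {"varA": {"generation": "1"}, "varB": {"generation": "2"}, "varC": {"generation": "4"}}
    generation = "3"
    earliest_gen, reference = float('inf'), None
    for enzyme_id, enzyme_row in cmap.items():
        enzyme_gen = float(enzyme_row["generation"])
        if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
            earliest_gen, reference = enzyme_gen, enzyme_id
    # reference == "varA" (generation 1); varC is skipped because it is not earlier than 3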
@@ -1095,6 +1304,10 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:

     log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
+
+    # Post-process: Copy mutations from matched enzymes for Gemini-matched substrate scope entries
+    out_df = _copy_mutations_from_matched_enzymes(out_df, df)
+
     return out_df


@@ -1137,7 +1350,7 @@ def run_pipeline(reaction_csv: str | Path | None = None,
     if not dfs:
         raise ValueError("At least one input CSV must be provided")

-    # Combine dataframes
+    # Combine dataframes without deduplication
     if len(dfs) > 1:
         df_in = pd.concat(dfs, ignore_index=True)
         log.info("Combined data: %d total entries", len(df_in))