debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/lineage_format.py CHANGED
@@ -553,6 +553,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
553
553
  This function:
554
554
  1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
555
555
  2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
556
+ 3. Uses Gemini API for intelligent matching when exact matches fail
556
557
  """
557
558
  # Step 1: Clean up 3a data format
558
559
  log.info("Cleaning up reaction data (3a) format...")
@@ -564,6 +565,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
564
565
 
565
566
  # Step 2: Create sequence lookup from cleaned 3a data
566
567
  seq_lookup = {}
568
+ campaign_enzymes = {} # Track enzymes by campaign for Gemini matching
567
569
 
568
570
  # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
569
571
  reaction_entries = df[df.get("data_type") == "lineage"]
@@ -584,7 +586,9 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
584
586
  "aa_sequence": aa_seq,
585
587
  "nt_sequence": nt_seq if nt_seq != "nan" else "",
586
588
  "campaign_id": campaign_id,
587
- "enzyme_id": eid
589
+ "enzyme_id": eid,
590
+ "generation": str(row.get("generation", "")),
591
+ "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
588
592
  }
589
593
 
590
594
  # Also keep simple enzyme_id lookup as fallback
@@ -592,16 +596,41 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
592
596
  "aa_sequence": aa_seq,
593
597
  "nt_sequence": nt_seq if nt_seq != "nan" else "",
594
598
  "campaign_id": campaign_id,
595
- "enzyme_id": eid
599
+ "enzyme_id": eid,
600
+ "generation": str(row.get("generation", "")),
601
+ "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
596
602
  }
603
+
604
+ # Track enzymes by campaign for Gemini matching
605
+ if campaign_id not in campaign_enzymes:
606
+ campaign_enzymes[campaign_id] = []
607
+ campaign_enzymes[campaign_id].append({
608
+ "enzyme_id": eid,
609
+ "has_sequence": True,
610
+ "generation": str(row.get("generation", "")),
611
+ "parent_id": str(row.get("parent_enzyme_id", ""))
612
+ })
597
613
 
598
614
  log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
599
615
 
616
+ # Setup Gemini if available
617
+ gemini_model = None
618
+ if GEMINI_OK and GEMINI_API_KEY:
619
+ try:
620
+ genai.configure(api_key=GEMINI_API_KEY)
621
+ gemini_model = genai.GenerativeModel('gemini-1.5-flash')
622
+ log.info("Gemini API configured for intelligent enzyme matching")
623
+ except Exception as e:
624
+ log.warning(f"Failed to configure Gemini API: {e}")
625
+
600
626
  # Step 3: Fill missing sequences in substrate scope entries (3b)
601
627
  substrate_entries = df[df.get("data_type") == "substrate_scope"]
602
628
  log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
603
629
 
604
630
  filled_count = 0
631
+ gemini_matched_count = 0
632
+ unmatched_enzymes = [] # Track enzymes that need Gemini matching
633
+
605
634
  for idx, row in df.iterrows():
606
635
  if row.get("data_type") != "substrate_scope":
607
636
  continue
@@ -620,6 +649,8 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
620
649
  if seq_lookup[composite_key]["nt_sequence"]:
621
650
  df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
622
651
  df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
652
+ df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
653
+ df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
623
654
  filled_count += 1
624
655
  log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
625
656
 
@@ -630,18 +661,182 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
630
661
  if seq_lookup[eid]["nt_sequence"]:
631
662
  df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
632
663
  df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
664
+ df.at[idx, "generation"] = seq_lookup[eid]["generation"]
665
+ df.at[idx, "parent_enzyme_id"] = seq_lookup[eid]["parent_enzyme_id"]
633
666
  filled_count += 1
634
667
  log.debug(f"Filled sequence for {eid} (fallback lookup)")
635
668
 
636
669
  else:
637
- log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")
670
+ # Collect for Gemini matching
671
+ unmatched_enzymes.append({
672
+ "idx": idx,
673
+ "enzyme_id": eid,
674
+ "campaign_id": campaign_id
675
+ })
676
+
677
+ # Step 4: Use Gemini for intelligent matching of unmatched enzymes
678
+ if unmatched_enzymes and gemini_model:
679
+ log.info(f"Using Gemini to intelligently match {len(unmatched_enzymes)} unmatched enzymes")
680
+
681
+ # Group unmatched enzymes by campaign
682
+ unmatched_by_campaign = {}
683
+ for entry in unmatched_enzymes:
684
+ cid = entry["campaign_id"]
685
+ if cid not in unmatched_by_campaign:
686
+ unmatched_by_campaign[cid] = []
687
+ unmatched_by_campaign[cid].append(entry)
688
+
689
+ # Process each campaign
690
+ for campaign_id, entries in unmatched_by_campaign.items():
691
+ if campaign_id not in campaign_enzymes or not campaign_enzymes[campaign_id]:
692
+ log.warning(f"No enzymes with sequences found in campaign {campaign_id}")
693
+ continue
694
+
695
+ # Get enzyme IDs that need matching
696
+ unmatched_ids = [e["enzyme_id"] for e in entries]
697
+
698
+ # Get available enzymes in this campaign
699
+ available_ids = [e["enzyme_id"] for e in campaign_enzymes[campaign_id] if e["has_sequence"]]
700
+
701
+ if not available_ids:
702
+ log.warning(f"No enzymes with sequences available in campaign {campaign_id}")
703
+ continue
704
+
705
+ # Create prompt for Gemini
706
+ prompt = f"""Match enzyme variant IDs from substrate scope data to their corresponding sequences in reaction data.
707
+ These are from the same campaign ({campaign_id}) but may use slightly different naming conventions.
708
+
709
+ Enzymes needing sequences (from substrate scope):
710
+ {json.dumps(unmatched_ids, indent=2)}
711
+
712
+ Enzymes with sequences available (from reaction data):
713
+ {json.dumps(available_ids, indent=2)}
714
+
715
+ Match each enzyme from the first list to its corresponding enzyme in the second list.
716
+ Consider variations like:
717
+ - Case differences (p411-hf vs P411-HF)
718
+ - Underscore vs hyphen (p411_hf vs p411-hf)
719
+ - Additional prefixes/suffixes
720
+ - Similar naming patterns within the campaign
721
+
722
+ Return ONLY a JSON object mapping substrate scope IDs to reaction data IDs:
723
+ {{"substrate_scope_id": "reaction_data_id", ...}}
724
+
725
+ Only include matches you are confident about. If no match exists, omit that enzyme.
726
+ """
727
+
728
+ try:
729
+ response = gemini_model.generate_content(prompt)
730
+ mapping_text = response.text.strip()
731
+
732
+ # Extract JSON from response
733
+ if '```json' in mapping_text:
734
+ mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
735
+ elif '```' in mapping_text:
736
+ mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
737
+
738
+ mapping = json.loads(mapping_text)
739
+
740
+ # Apply the matches
741
+ for entry in entries:
742
+ substrate_id = entry["enzyme_id"]
743
+ if substrate_id in mapping:
744
+ matched_id = mapping[substrate_id]
745
+ composite_key = f"{campaign_id}_{matched_id}"
746
+
747
+ if composite_key in seq_lookup:
748
+ idx = entry["idx"]
749
+ df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
750
+ df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
751
+ if seq_lookup[composite_key]["nt_sequence"]:
752
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
753
+ df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
754
+
755
+ # Also copy generation and parent_enzyme_id
756
+ df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
757
+ df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
758
+
759
+ # Store the match for later mutation copying
760
+ df.at[idx, "_matched_enzyme_id"] = matched_id
761
+ df.at[idx, "_matched_campaign_id"] = campaign_id
762
+
763
+ gemini_matched_count += 1
764
+ log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
765
+ else:
766
+ log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
767
+
768
+ except Exception as e:
769
+ log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
770
+
771
+ # Final logging
772
+ total_filled = filled_count + gemini_matched_count
773
+ if total_filled > 0:
774
+ log.info(f"Successfully filled sequences for {total_filled} substrate scope entries "
775
+ f"({filled_count} exact matches, {gemini_matched_count} Gemini matches)")
638
776
 
639
- if filled_count > 0:
640
- log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
777
+ # Log any remaining unmatched
778
+ for entry in unmatched_enzymes:
779
+ if not any(df.at[entry["idx"], col] for col in ["protein_sequence", "aa_sequence"]
780
+ if col in df.columns and df.at[entry["idx"], col]):
781
+ log.warning(f"No sequence found for enzyme_id={entry['enzyme_id']} in campaign {entry['campaign_id']}")
641
782
 
642
783
  return df
643
784
 
644
785
 
786
def _copy_mutations_from_matched_enzymes(out_df: pd.DataFrame, orig_df: pd.DataFrame) -> pd.DataFrame:
    """Copy nucleotide_mutation and amino_acid_substitutions from matched enzymes.

    Rows of ``orig_df`` that carry a non-null ``_matched_enzyme_id`` were
    paired with another enzyme during sequence filling. For every row of
    ``out_df`` whose ``(campaign_id, id)`` corresponds to such a row, the
    mutation columns are overwritten with those of the output row that
    describes the matched enzyme.

    Args:
        out_df: Flattened output frame. Must contain ``campaign_id`` and
            ``id`` (the enzyme identifier in the output schema). It is
            modified in place.
        orig_df: Input frame with ``enzyme_id`` and ``campaign_id`` and,
            optionally, ``_matched_enzyme_id`` / ``_matched_campaign_id``.

    Returns:
        ``out_df`` (the same object), with mutation columns updated for the
        matched entries. Returned untouched when ``orig_df`` has no
        ``_matched_enzyme_id`` column or no non-null values in it.
    """
    # Nothing to do unless the matching step recorded at least one match.
    if "_matched_enzyme_id" not in orig_df.columns:
        return out_df

    matched_entries = orig_df[orig_df["_matched_enzyme_id"].notna()]
    if len(matched_entries) == 0:
        return out_df

    log.info(f"Copying mutations for {len(matched_entries)} Gemini-matched entries")

    # Snapshot the mutations of every output row *before* any row is
    # modified, keyed by "<campaign_id>_<id>". Later duplicates of a key
    # overwrite earlier ones.
    mutation_lookup = {}
    for _, row in out_df.iterrows():
        key = f"{row['campaign_id']}_{row['id']}"  # 'id' is the enzyme_id in output
        mutation_lookup[key] = {
            "nucleotide_mutation": row.get("nucleotide_mutation", ""),
            "amino_acid_substitutions": row.get("amino_acid_substitutions", ""),
        }

    # Index the original rows once instead of building a boolean mask over
    # orig_df for every output row. Only the FIRST original row of each
    # (campaign_id, enzyme_id) pair is kept, which is the row the per-row
    # mask + iloc[0] lookup would have selected.
    if "_matched_campaign_id" in orig_df.columns:
        matched_campaigns = orig_df["_matched_campaign_id"]
    else:
        # Without an explicit matched campaign, fall back to the row's own
        # campaign rather than failing with a KeyError.
        matched_campaigns = orig_df["campaign_id"]

    match_index = {}
    for enzyme_id, campaign_id, matched_id, matched_campaign in zip(
        orig_df["enzyme_id"],
        orig_df["campaign_id"],
        orig_df["_matched_enzyme_id"],
        matched_campaigns,
    ):
        # Missing identifiers never compare equal to anything, so they can
        # never be the target of a lookup.
        if pd.isna(enzyme_id) or pd.isna(campaign_id):
            continue
        match_index.setdefault((campaign_id, enzyme_id), (matched_campaign, matched_id))

    # Copy mutations for matched entries.
    mutations_copied = 0
    for idx, row in out_df.iterrows():
        match = match_index.get((row["campaign_id"], row["id"]))
        if match is None:
            continue

        matched_campaign, matched_id = match
        if pd.isna(matched_id):
            # The first original row for this enzyme was not a matched entry.
            continue

        source = mutation_lookup.get(f"{matched_campaign}_{matched_id}")
        if source is None:
            continue

        out_df.at[idx, "nucleotide_mutation"] = source["nucleotide_mutation"]
        out_df.at[idx, "amino_acid_substitutions"] = source["amino_acid_substitutions"]
        mutations_copied += 1
        log.debug(f"Copied mutations for {row['id']} from {matched_id}")

    if mutations_copied > 0:
        log.info(f"Successfully copied mutations for {mutations_copied} entries")

    return out_df
838
+
839
+
645
840
  def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
646
841
  """Use Gemini API to identify parent enzymes for entries with missing parent information."""
647
842
  if not GEMINI_OK:
@@ -885,11 +1080,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
885
1080
  # Fill missing sequences in substrate scope entries from lineage data
886
1081
  df = _fill_missing_sequences(df)
887
1082
 
888
- # Use Gemini API to identify parent enzymes for entries with missing sequences
889
- df = _identify_parents_with_gemini(df)
890
-
891
- # Fill sequences again after parent identification to propagate sequences from identified parents
892
- df = _fill_missing_sequences(df)
1083
+ # Note: Removed parent identification - we only want exact variant matching
893
1084
 
894
1085
  # 1. Generate lineage roots once -----------------------------------------
895
1086
  lineage_roots = _generate_lineage_roots(df)
@@ -992,15 +1183,33 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
992
1183
  if generation != "0":
993
1184
  for cid, cmap in campaign_idmap.items():
994
1185
  if cid == campaign_id:
1186
+ # First try to find generation 0
995
1187
  for enzyme_id, enzyme_row in cmap.items():
996
1188
  enzyme_gen = str(enzyme_row.get("generation", "")).strip()
997
1189
  if enzyme_gen == "0" or enzyme_gen == "0.0":
998
1190
  reference_row = enzyme_row
999
1191
  log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
1000
1192
  break
1193
+
1194
+ # If no generation 0 found, find the earliest generation
1195
+ if not reference_row:
1196
+ earliest_gen = float('inf')
1197
+ earliest_enzyme = None
1198
+ for enzyme_id, enzyme_row in cmap.items():
1199
+ try:
1200
+ enzyme_gen = float(str(enzyme_row.get("generation", "")).strip())
1201
+ if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
1202
+ earliest_gen = enzyme_gen
1203
+ earliest_enzyme = enzyme_id
1204
+ reference_row = enzyme_row
1205
+ except (ValueError, AttributeError):
1206
+ continue
1207
+
1208
+ if reference_row:
1209
+ log.info(f"No generation 0 found in campaign {campaign_id}, using generation {earliest_gen} enzyme {earliest_enzyme} as reference for {eid}")
1210
+ else:
1211
+ log.warning(f"No suitable reference enzyme found in campaign {campaign_id} for {eid}")
1001
1212
  break
1002
- if not reference_row:
1003
- log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
1004
1213
 
1005
1214
  reference_aa = ""
1006
1215
  reference_nt = ""
@@ -1095,6 +1304,10 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
1095
1304
 
1096
1305
  log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
1097
1306
  out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
1307
+
1308
+ # Post-process: Copy mutations from matched enzymes for Gemini-matched substrate scope entries
1309
+ out_df = _copy_mutations_from_matched_enzymes(out_df, df)
1310
+
1098
1311
  return out_df
1099
1312
 
1100
1313
 
@@ -1137,7 +1350,7 @@ def run_pipeline(reaction_csv: str | Path | None = None,
1137
1350
  if not dfs:
1138
1351
  raise ValueError("At least one input CSV must be provided")
1139
1352
 
1140
- # Combine dataframes
1353
+ # Combine dataframes without deduplication
1141
1354
  if len(dfs) > 1:
1142
1355
  df_in = pd.concat(dfs, ignore_index=True)
1143
1356
  log.info("Combined data: %d total entries", len(df_in))