debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,7 @@ Usage:
11
11
 
12
12
  import argparse
13
13
  import logging
14
+ import os
14
15
  import re
15
16
  import sys
16
17
  from dataclasses import dataclass, field
@@ -19,11 +20,20 @@ from typing import Dict, List, Optional, Set, Tuple, Union
19
20
 
20
21
  import pandas as pd
21
22
 
23
+ try:
24
+ import google.generativeai as genai # type: ignore
25
+ GEMINI_OK = True
26
+ except ImportError: # pragma: no cover
27
+ GEMINI_OK = False
28
+
22
29
 
23
30
  # === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
24
31
 
25
32
  VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*") # Include * for stop codons
26
33
 
34
+ # Gemini API configuration
35
+ GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
36
+
27
37
  # Configure module logger
28
38
  log = logging.getLogger(__name__)
29
39
 
@@ -193,12 +203,17 @@ class SequenceManipulator:
193
203
  return 0 if zero_matches >= one_matches else 1
194
204
 
195
205
  @classmethod
196
- def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
197
- """Apply mutations to a parent sequence."""
206
+ def apply_mutations(cls, parent_seq: str, mutation_str: str) -> Tuple[str, bool]:
207
+ """Apply mutations to a parent sequence.
208
+
209
+ Returns:
210
+ Tuple[str, bool]: (resulting_sequence, all_mutations_applied_successfully)
211
+ """
198
212
  if not parent_seq:
199
- return ""
213
+ return "", True
200
214
 
201
215
  seq = list(parent_seq)
216
+ all_mutations_successful = True
202
217
 
203
218
  # Apply point mutations
204
219
  mutations = MutationParser.parse_mutations(mutation_str)
@@ -207,19 +222,26 @@ class SequenceManipulator:
207
222
 
208
223
  for mut in mutations:
209
224
  idx = mut.position - idx_offset
225
+ mutation_applied = False
226
+
210
227
  # Try primary index
211
228
  if 0 <= idx < len(seq) and seq[idx].upper() == mut.original.upper():
212
229
  seq[idx] = mut.replacement
230
+ mutation_applied = True
213
231
  else:
214
232
  # Try alternate index
215
233
  alt_idx = mut.position - (1 - idx_offset)
216
234
  if 0 <= alt_idx < len(seq) and seq[alt_idx].upper() == mut.original.upper():
217
235
  seq[alt_idx] = mut.replacement
218
- else:
219
- log.warning(
220
- f"Mutation {mut} does not match parent sequence at "
221
- f"position {mut.position} (tried both 0- and 1-based indexing)"
222
- )
236
+ mutation_applied = True
237
+
238
+ if not mutation_applied:
239
+ log.error(
240
+ f"MUTATION MISMATCH: {mut} does not match parent sequence at "
241
+ f"position {mut.position} (tried both 0- and 1-based indexing). "
242
+ f"Parent has {seq[idx] if 0 <= idx < len(seq) else 'out-of-bounds'} at position {mut.position}"
243
+ )
244
+ all_mutations_successful = False
223
245
 
224
246
  # Apply complex C-terminal mutations
225
247
  complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
@@ -242,12 +264,13 @@ class SequenceManipulator:
242
264
  if complex_mut.extension_seq:
243
265
  seq.extend(list(complex_mut.extension_seq))
244
266
  else:
245
- log.warning(
246
- f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
267
+ log.error(
268
+ f"COMPLEX MUTATION MISMATCH: Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
247
269
  f"{complex_mut.end_pos} for sequence of length {len(seq)}"
248
270
  )
271
+ all_mutations_successful = False
249
272
 
250
- return "".join(seq)
273
+ return "".join(seq), all_mutations_successful
251
274
 
252
275
  @classmethod
253
276
  def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
@@ -390,10 +413,11 @@ class LineageNavigator:
390
413
  class SequenceGenerator:
391
414
  """Main class for generating protein sequences from mutations."""
392
415
 
393
- def __init__(self, df: pd.DataFrame):
416
+ def __init__(self, df: pd.DataFrame, strict_mutation_validation: bool = True):
394
417
  self.df = df
395
418
  self.navigator = LineageNavigator(df)
396
419
  self.manipulator = SequenceManipulator()
420
+ self.strict_mutation_validation = strict_mutation_validation
397
421
  self._update_ground_truths()
398
422
 
399
423
  def _update_ground_truths(self) -> None:
@@ -464,19 +488,62 @@ class SequenceGenerator:
464
488
  parent_id: str
465
489
  ) -> Optional[SequenceGenerationResult]:
466
490
  """Generate sequence by applying mutations to parent."""
467
- parent_row = self.df[self.df["enzyme_id"] == parent_id].iloc[0]
468
- parent_seq = parent_row.get("protein_sequence", "")
469
-
470
- if not parent_seq:
491
+ # Get the variant to find its campaign
492
+ variant_rows = self.df[self.df["enzyme_id"] == variant_id]
493
+ if variant_rows.empty:
471
494
  return None
472
495
 
473
- variant_row = self.df[self.df["enzyme_id"] == variant_id].iloc[0]
496
+ variant_row = variant_rows.iloc[0]
497
+ variant_campaign = variant_row.get("campaign_id", "")
474
498
  mutations = variant_row.get("mutations", "")
475
499
 
476
500
  if not mutations:
477
501
  return None
478
502
 
479
- sequence = self.manipulator.apply_mutations(parent_seq, mutations)
503
+ # Find parent in the same campaign first
504
+ parent_rows = self.df[
505
+ (self.df["enzyme_id"] == parent_id) &
506
+ (self.df["campaign_id"] == variant_campaign)
507
+ ]
508
+
509
+ # If not found in same campaign, fall back to any parent with that ID
510
+ if parent_rows.empty:
511
+ parent_rows = self.df[self.df["enzyme_id"] == parent_id]
512
+ if not parent_rows.empty:
513
+ log.warning(f"Parent {parent_id} not found in same campaign {variant_campaign} for variant {variant_id}, using parent from different campaign")
514
+
515
+ if parent_rows.empty:
516
+ log.error(f"Parent {parent_id} not found for variant {variant_id}")
517
+ return None
518
+
519
+ parent_row = parent_rows.iloc[0]
520
+ parent_seq = parent_row.get("protein_sequence", "")
521
+ parent_campaign = parent_row.get("campaign_id", "")
522
+
523
+ if not parent_seq:
524
+ return None
525
+
526
+ # Log which parent sequence is being used
527
+ if parent_campaign != variant_campaign:
528
+ log.info(f"Using parent {parent_id} from campaign {parent_campaign} for variant {variant_id} in campaign {variant_campaign}")
529
+ else:
530
+ log.info(f"Using parent {parent_id} from same campaign {variant_campaign} for variant {variant_id}")
531
+
532
+ sequence, mutations_successful = self.manipulator.apply_mutations(parent_seq, mutations)
533
+
534
+ if not mutations_successful:
535
+ # Check if this might be an exact match case (mutations already present in parent)
536
+ # This happens when an enzyme from another campaign is identified as both parent and exact match
537
+ if parent_id == variant_id or (mutations and parent_seq == sequence):
538
+ log.info(f"Detected exact match scenario for {variant_id} - using parent sequence directly")
539
+ sequence = parent_seq
540
+ mutations_successful = True
541
+ elif self.strict_mutation_validation:
542
+ log.error(f"STRICT MODE: Failed to apply mutations for {variant_id}: mutation mismatch detected. Not populating sequence to prevent incorrect data.")
543
+ return None
544
+ else:
545
+ log.warning(f"Mutation mismatch for {variant_id}, but proceeding with generated sequence (strict_mutation_validation=False)")
546
+ # Continue with the sequence even if mutations failed
480
547
 
481
548
  return SequenceGenerationResult(
482
549
  sequence=sequence,
@@ -538,10 +605,14 @@ class SequenceGenerator:
538
605
 
539
606
  # Generate based on direction
540
607
  if direction == "up" and parent_id and mutations:
541
- if gt_id == parent_id:
542
- return self.generate_from_parent(variant_id, parent_id)
543
- else:
544
- # Non-direct ancestor - less reliable
608
+ # Always try the declared parent first
609
+ result = self.generate_from_parent(variant_id, parent_id)
610
+ if result:
611
+ return result
612
+
613
+ # If declared parent fails, try the ground truth (if different)
614
+ if gt_id != parent_id:
615
+ log.info(f"Declared parent {parent_id} failed for {variant_id}, trying ground truth {gt_id}")
545
616
  result = self.generate_from_parent(variant_id, gt_id)
546
617
  if result:
547
618
  result.confidence = 0.7
@@ -565,14 +636,501 @@ class SequenceGenerator:
565
636
  return None
566
637
 
567
638
 
568
- # === 7. MAIN PROCESSOR === ---------------------------------------------------
639
+ # === 7. GEMINI PARENT IDENTIFICATION === ------------------------------------
640
+
641
# Replies (lower-cased, after stripping decoration) that mean "no parent".
_NO_PARENT_REPLIES = ("", "unknown", "no parent identified")


def _cell_text(value) -> str:
    """Return a DataFrame cell as stripped text, mapping NaN/None markers to ""."""
    text = str(value).strip()
    return "" if text.lower() in ("nan", "none") else text


def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
    """Use Gemini API to identify parent enzymes for entries with missing parent information.

    Only rows that have neither a protein sequence nor a ``parent_enzyme_id``
    are sent to Gemini. For each such row the model is shown the enzyme's ID,
    campaign and generation plus up to ten other enzymes from the same
    campaign, and is asked to answer ``Parent: <enzyme_id>``.

    A suggestion is written to ``parent_enzyme_id`` only if it names an enzyme
    that exists in ``df`` and is not the queried enzyme itself.

    Args:
        df: Lineage table. Reads the ``enzyme_id``, ``campaign_id``,
            ``generation``, ``protein_sequence`` and ``parent_enzyme_id``
            columns.

    Returns:
        The same DataFrame object, updated in place. It is returned unchanged
        when the Gemini client is unavailable, no API key is set, client
        configuration fails, or no row needs a parent.
    """
    if not GEMINI_OK:
        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
        return df

    if not GEMINI_API_KEY:
        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
        return df

    try:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-1.5-flash')
    except Exception as e:
        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
        return df

    # Single pass: build the lookup of known enzymes and collect the rows that
    # have neither a sequence nor a parent.
    available_enzymes = {}
    entries_needing_parents = []
    for idx, row in df.iterrows():
        enzyme_id = _cell_text(row.get("enzyme_id", ""))
        if not enzyme_id:
            # A row without an ID can be neither a candidate parent nor a
            # meaningful question for the model.
            continue

        campaign_id = str(row.get("campaign_id", ""))
        generation = _cell_text(row.get("generation", ""))
        has_sequence = bool(_cell_text(row.get("protein_sequence", "")))

        available_enzymes[enzyme_id] = {
            "campaign_id": campaign_id,
            "has_sequence": has_sequence,
            "generation": generation,
        }

        if not has_sequence and not _cell_text(row.get("parent_enzyme_id", "")):
            entries_needing_parents.append({
                "idx": idx,
                "enzyme_id": enzyme_id,
                "campaign_id": campaign_id,
                "generation": generation,
            })

    if not entries_needing_parents:
        log.info("No entries need parent identification from Gemini")
        return df

    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")

    identified_count = 0
    for entry in entries_needing_parents:
        enzyme_id = entry["enzyme_id"]
        campaign_id = entry["campaign_id"]
        generation = entry["generation"]

        # Create context for Gemini
        context_info = [f"Enzyme ID: {enzyme_id}", f"Campaign ID: {campaign_id}"]
        if generation:
            context_info.append(f"Generation: {generation}")

        # Candidate parents: enzymes of the same campaign, excluding the
        # enzyme itself so the model is never offered a self-parent.
        campaign_enzymes = []
        for enz_id, enz_data in available_enzymes.items():
            if enz_id == enzyme_id or enz_data["campaign_id"] != campaign_id:
                continue
            status = "with sequence" if enz_data["has_sequence"] else "without sequence"
            gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
            campaign_enzymes.append(f" - {enz_id} {status} {gen_info}".rstrip())

        if campaign_enzymes:
            context_info.append("Available enzymes in same campaign:")
            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context

        context_text = "\n".join(context_info)

        prompt = (
            "\n"
            "Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?\n"
            "\n"
            f"{context_text}\n"
            "\n"
            "This enzyme currently has no sequence data and no parent information. "
            "Based on the enzyme ID and the available enzymes in the same campaign, "
            "can you identify which enzyme is likely the parent?\n"
            "\n"
            "Please provide your response in this format:\n"
            'Parent: [parent_enzyme_id or "Unknown"]\n'
            "\n"
            'If you cannot identify a parent enzyme, just respond with "Parent: Unknown".\n'
        )

        # Best-effort call: any API failure skips this entry only.
        try:
            response = model.generate_content(prompt)
            response_text = response.text.strip()
        except Exception as e:
            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
            continue

        # Parse the response
        parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
        if not parent_match:
            continue

        raw_parent = parent_match.group(1).strip()
        # The requested format shows the answer in brackets, and models often
        # add quotes or markdown emphasis; strip that decoration.
        cleaned_parent = raw_parent.strip("[]\"'`* ")
        if cleaned_parent.lower() in _NO_PARENT_REPLIES:
            continue

        # Prefer the literal reply in case an ID really contains such characters.
        parent = raw_parent if raw_parent in available_enzymes else cleaned_parent

        if parent == enzyme_id:
            log.warning(f"Gemini suggested {enzyme_id} as its own parent; ignoring")
            continue

        # Verify the parent exists in our available enzymes
        if parent not in available_enzymes:
            log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
            continue

        df.at[entry["idx"], "parent_enzyme_id"] = parent
        identified_count += 1
        log.info(f"Identified parent for {enzyme_id}: {parent}")

    if identified_count > 0:
        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
    else:
        log.info("No parent enzymes were identified using Gemini API")

    return df
766
+
767
+
768
+ # === 8. SEQUENCE SOURCE IDENTIFICATION === -----------------------------------
769
+
770
def _non_missing_text(value) -> str:
    """Return a DataFrame cell as stripped text, mapping NaN/None markers to ""."""
    text = str(value).strip()
    return "" if text.lower() in ("nan", "none") else text


def _append_row_flag(df: pd.DataFrame, idx, flag: str) -> None:
    """Append ``flag`` to the row's ``flag`` cell, tolerating a NaN or absent cell."""
    current = df.at[idx, "flag"] if "flag" in df.columns else ""
    if not isinstance(current, str):
        current = ""
    df.at[idx, "flag"] = f"{current} {flag}"


def _dump_gemini_debug(debug_dir, name: str, text: str) -> None:
    """Write a prompt/response to ``debug_dir``; a failure here never blocks the query."""
    if not debug_dir:
        return
    try:
        debug_dir.mkdir(parents=True, exist_ok=True)
        # Campaign IDs come from data and may contain path separators.
        safe_name = re.sub(r"[^\w.\-]+", "_", name)
        (debug_dir / f"{safe_name}.txt").write_text(text, encoding="utf-8")
    except OSError as e:
        log.warning(f"Could not write Gemini debug file {name}: {e}")


def _strip_code_fence(text: str) -> str:
    """Return the payload of a markdown code fence, or ``text`` if it has none."""
    if '```json' in text:
        return text.split('```json')[1].split('```')[0].strip()
    if '```' in text:
        return text.split('```')[1].split('```')[0].strip()
    return text


def _split_campaign_by_sequence(campaign_df: pd.DataFrame):
    """Split a campaign's rows into (entries without a sequence, entries with one)."""
    empty_seq_entries = []
    available_seq_entries = []
    for idx, row in campaign_df.iterrows():
        enzyme_id = str(row.get("enzyme_id", ""))
        generation = str(row.get("generation", ""))
        protein_sequence = _non_missing_text(row.get("protein_sequence", ""))

        if protein_sequence:
            available_seq_entries.append({
                "enzyme_id": enzyme_id,
                "generation": generation,
                "seq_length": len(protein_sequence),
            })
        else:
            empty_seq_entries.append({
                "idx": idx,
                "enzyme_id": enzyme_id,
                # None (not "") marks "no value" for the prompt builders below.
                "parent_id": _non_missing_text(row.get("parent_enzyme_id", "")) or None,
                "mutations": _non_missing_text(row.get("mutations", "")) or None,
                "generation": generation,
            })
    return empty_seq_entries, available_seq_entries


def _describe_entry_without_sequence(entry: dict, *, for_context: bool) -> str:
    """Render one sequence-less enzyme for a prompt.

    The context form is a bullet that truncates long mutation strings and
    states when parent information is missing; the request form is compact
    and keeps the full mutation string.
    """
    if entry['parent_id']:
        parent_info = f", parent: {entry['parent_id']}"
    else:
        parent_info = ", no parent info" if for_context else ""

    mutations = entry['mutations']
    if not mutations:
        mut_info = ""
    elif for_context and len(mutations) > 50:
        mut_info = f", mutations: {mutations[:50]}..."
    else:
        mut_info = f", mutations: {mutations}"

    prefix = " - " if for_context else ""
    return f"{prefix}{entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})"


def _seed_campaign_from_other_campaigns(df, model, campaign_id, campaign_id_str, empty_seq_entries, debug_dir):
    """Ask Gemini for ONE cross-campaign seed and write its sequence into ``df``.

    Returns the seeded entry (with ``seq_length`` added) or ``None`` when no
    seed could be set.
    """
    import json
    import time

    # Collect every sequenced enzyme of the OTHER campaigns, keeping the
    # sequence itself so later look-ups cannot resolve to the wrong row.
    donors = []
    for other_campaign in df['campaign_id'].unique():
        if pd.isna(other_campaign) or other_campaign == campaign_id:
            continue
        for _, row in df[df['campaign_id'] == other_campaign].iterrows():
            sequence = _non_missing_text(row.get("protein_sequence", ""))
            if sequence:
                donors.append({
                    "enzyme_id": str(row.get("enzyme_id", "")),
                    "campaign_id": str(other_campaign),
                    "generation": str(row.get("generation", "")),
                    "sequence": sequence,
                })

    if not donors:
        log.info(f"Campaign {campaign_id_str}: No sequences found in other campaigns to use as cross-campaign parents")
        return None

    # Create context for cross-campaign analysis
    context_lines = [
        f"Empty Campaign: {campaign_id_str} (all {len(empty_seq_entries)} enzymes need sequences)",
        "\nEnzymes in empty campaign:",
    ]
    context_lines.extend(
        _describe_entry_without_sequence(entry, for_context=True)
        for entry in empty_seq_entries[:10]  # Limit for context
    )
    context_lines.append(f"\nEnzymes with sequences from OTHER campaigns ({len(donors)}):")
    context_lines.extend(
        f" - {donor['enzyme_id']} from {donor['campaign_id']} "
        f"(gen {donor['generation']}, sequence: {donor['sequence']})"
        for donor in donors[:15]  # Limit for context
    )
    context_text = "\n".join(context_lines)

    log.info(f"Campaign {campaign_id_str}: Looking for ONE cross-campaign seed to bootstrap sequences...")

    prompt = "\n".join([
        "",
        "Based on enzyme names, identify the SINGLE BEST seed enzyme from other campaigns to bootstrap the empty campaign.",
        "",
        context_text,
        "",
        "From the enzymes in the EMPTY campaign, identify which ONE has the clearest match in OTHER campaigns.",
        "Prioritize:",
        "1. EXACT name matches (highest priority)",
        "2. Simplest parent relationships (e.g., an enzyme that differs by only 1-2 mutations)",
        "3. Earliest generation enzymes (lower generation numbers are better seeds)",
        "",
        "Return your response as a JSON dictionary with this exact format:",
        "{",
        '"seed_enzyme": {',
        '"target_enzyme_id": "the enzyme ID in the empty campaign",',
        '"relationship_type": "EXACT_MATCH" or "BEST_PARENT",',
        '"source": {',
        '"campaign_id": "the campaign ID",',
        '"enzyme_id": "the enzyme ID WITHOUT campaign suffix"',
        "},",
        '"confidence": 0.1 to 1.0,',
        '"reason": "brief explanation of why this is the best seed"',
        "}",
        "}",
        "",
        "Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.",
        "",
    ])

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    _dump_gemini_debug(debug_dir, f"cross_campaign_seed_{campaign_id_str}_prompt_{timestamp}", prompt)

    # Best-effort call: an API failure only means this campaign gets no seed.
    try:
        response_text = model.generate_content(prompt).text.strip()
    except Exception as e:
        log.warning(f"Failed to identify cross-campaign seed for {campaign_id_str}: {e}")
        return None

    _dump_gemini_debug(debug_dir, f"cross_campaign_seed_{campaign_id_str}_response_{timestamp}", response_text)

    # Parse the JSON response
    try:
        seed_data = json.loads(_strip_code_fence(response_text))
    except json.JSONDecodeError as e:
        log.warning(f"Failed to parse JSON response for cross-campaign seed: {e}")
        log.debug(f"Response text: {response_text}")
        return None

    seed_info = seed_data.get('seed_enzyme') if isinstance(seed_data, dict) else None
    if not isinstance(seed_info, dict) or not seed_info:
        log.warning(f"Campaign {campaign_id_str}: Gemini response contained no seed_enzyme entry")
        return None

    target_enzyme_id = str(seed_info.get('target_enzyme_id', '')).strip()
    relationship_type = str(seed_info.get('relationship_type', '')).upper()
    source_info = seed_info.get('source')
    if not isinstance(source_info, dict):
        source_info = {}
    source_enzyme_id = str(source_info.get('enzyme_id', '')).strip()
    source_campaign_id = str(source_info.get('campaign_id', '')).strip()
    # Confidence is only logged, so a malformed value must not abort the seed.
    try:
        confidence = float(seed_info.get('confidence', 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    reason = seed_info.get('reason', '')

    log.info(f"Campaign {campaign_id_str}: Found seed - {target_enzyme_id} from {source_enzyme_id} ({relationship_type}, confidence: {confidence})")
    log.info(f"Reason: {reason}")

    if not source_enzyme_id:
        return None

    # Resolve the source among sequenced enzymes of other campaigns only. For
    # an exact match the source shares its ID with the (empty) target row, so
    # a plain enzyme_id look-up over the whole frame could hit the target.
    matching_donors = [donor for donor in donors if donor['enzyme_id'] == source_enzyme_id]
    if not matching_donors:
        log.warning(f"Source enzyme {source_enzyme_id} not found with a sequence in another campaign")
        return None
    donor = next(
        (d for d in matching_donors if d['campaign_id'] == source_campaign_id),
        matching_donors[0],
    )
    source_sequence = donor['sequence']

    # Find the target enzyme in our empty list
    entry = next((e for e in empty_seq_entries if e['enzyme_id'] == target_enzyme_id), None)
    if entry is None:
        log.warning(f"Campaign {campaign_id_str}: Could not find target enzyme {target_enzyme_id} in empty list")
        return None

    if relationship_type == "EXACT_MATCH":
        # Exact match - copy sequence directly
        seed_sequence = source_sequence
        flag = "gemini_cross_campaign_seed_exact"
        log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(seed_sequence)})")
    elif relationship_type == "BEST_PARENT":
        # 'mutations' is stored as None when absent, so never call .strip() on it directly.
        target_mutations = (entry.get('mutations') or '').strip()
        if target_mutations:
            # Parent relationship - apply mutations to get the target sequence
            seed_sequence, success = SequenceManipulator.apply_mutations(source_sequence, target_mutations)
            if not success:
                log.warning(f"Failed to apply mutations {target_mutations} to parent {source_enzyme_id} for {target_enzyme_id}")
                return None
            flag = "gemini_cross_campaign_seed_parent"
            log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(seed_sequence)})")
        else:
            # No mutations - use parent sequence directly
            seed_sequence = source_sequence
            flag = "gemini_cross_campaign_seed_parent_no_mutations"
            log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(seed_sequence)})")
    else:
        log.warning(f"Campaign {campaign_id_str}: Unsupported relationship type {relationship_type!r} for seed {target_enzyme_id}")
        return None

    df.at[entry['idx'], 'protein_sequence'] = seed_sequence
    _append_row_flag(df, entry['idx'], flag)
    entry['seq_length'] = len(seed_sequence)
    log.info(f"Campaign {campaign_id_str}: Successfully set cross-campaign seed. Local processing will handle the rest.")
    return entry


def _suggest_parents_within_campaign(df, model, campaign_id_str, empty_seq_entries, available_seq_entries, debug_dir) -> int:
    """Ask Gemini, in batches, which sequenced enzyme each sequence-less one derives from.

    Fills ``parent_enzyme_id`` only where it was missing. Returns the number
    of parents set.
    """
    import time

    # Create context for Gemini
    context_lines = [
        f"Campaign: {campaign_id_str}",
        f"\nEnzymes WITH sequences ({len(available_seq_entries)}):",
    ]
    context_lines.extend(
        f" - {entry['enzyme_id']} (gen {entry['generation']}, {entry['seq_length']} aa)"
        for entry in available_seq_entries[:15]  # Limit to first 15 for context
    )
    context_lines.append(f"\nEnzymes WITHOUT sequences ({len(empty_seq_entries)}):")
    context_lines.extend(
        _describe_entry_without_sequence(entry, for_context=True)
        for entry in empty_seq_entries[:15]  # Limit to first 15 for context
    )
    context_text = "\n".join(context_lines)

    # IDs (in any campaign) that currently have a sequence; a suggestion is
    # accepted only if it names one of these.
    has_sequence = df['protein_sequence'].map(_non_missing_text).astype(bool)
    sequenced_ids = set(df.loc[has_sequence, 'enzyme_id'].astype(str))

    # Process in batches if there are many empty sequences
    batch_size = 10
    identified_count = 0

    for start in range(0, len(empty_seq_entries), batch_size):
        batch = empty_seq_entries[start:start + batch_size]
        batch_number = start // batch_size + 1

        batch_request = "\n".join(
            _describe_entry_without_sequence(entry, for_context=False) for entry in batch
        )

        prompt = "\n".join([
            "",
            "Based on the enzyme lineage information provided, identify which enzyme sequences should be used as the source to calculate sequences for the enzymes without sequences.",
            "",
            context_text,
            "",
            "For each of these enzymes without sequences, identify which enzyme WITH a sequence should be used as the source:",
            batch_request,
            "",
            "Instructions:",
            "1. If an enzyme has a parent_id and mutations, suggest using the parent's sequence",
            "2. If an enzyme has no parent_id, look for the most logical ancestor or related enzyme with a sequence",
            "3. Consider the generation numbers and enzyme naming patterns",
            "4. Only suggest enzymes that actually have sequences",
            "",
            "Please provide your response in this format:",
            "enzyme_id -> source_enzyme_id",
            "enzyme_id -> source_enzyme_id",
            "...",
            "",
            'If you cannot identify a suitable source, use "None" as the source_enzyme_id.',
            "",
        ])

        # The batch number keeps files from batches sent within the same second apart.
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        debug_stem = f"sequence_source_{campaign_id_str}_batch{batch_number}"
        _dump_gemini_debug(debug_dir, f"{debug_stem}_prompt_{timestamp}", prompt)

        # Best-effort call: an API failure skips this batch only.
        try:
            response_text = model.generate_content(prompt).text.strip()
        except Exception as e:
            log.warning(f"Failed to identify sequence sources for batch {batch_number}: {e}")
            continue

        _dump_gemini_debug(debug_dir, f"{debug_stem}_response_{timestamp}", response_text)

        # First entry wins when an enzyme ID occurs more than once in a batch.
        batch_by_id = {}
        for entry in batch:
            batch_by_id.setdefault(entry['enzyme_id'], entry)

        # Parse the response: one "target -> source" pair per line
        for line in response_text.split('\n'):
            target_part, arrow, source_part = line.partition('->')
            if not arrow or '->' in source_part:
                continue

            # Tolerate list bullets and quoting around the IDs.
            target_enzyme = target_part.strip().lstrip("-*").strip().strip("`'\"")
            source_enzyme = source_part.strip().strip("`'\"")
            if not source_enzyme or source_enzyme.lower() == "none":
                continue

            entry = batch_by_id.get(target_part.strip()) or batch_by_id.get(target_enzyme)
            if entry is None or source_enzyme == entry['enzyme_id']:
                continue

            # Verify the source enzyme exists and has a sequence
            if source_enzyme not in sequenced_ids:
                continue

            if not entry['parent_id']:
                # Update the parent_enzyme_id if it's missing
                df.at[entry['idx'], 'parent_enzyme_id'] = source_enzyme
                _append_row_flag(df, entry['idx'], "gemini_suggested_parent")
                # Remember the assignment so a repeated reply line is not counted twice.
                entry['parent_id'] = source_enzyme
                identified_count += 1
                log.info(f"Set {source_enzyme} as parent for {entry['enzyme_id']} (Gemini suggestion)")
            elif entry['parent_id'] != source_enzyme:
                # Log if Gemini suggests a different parent than what's recorded
                log.info(f"Gemini suggests {source_enzyme} as source for {entry['enzyme_id']}, "
                         f"but parent is recorded as {entry['parent_id']}")

    return identified_count


def identify_sequence_sources_with_gemini(df: pd.DataFrame, debug_dir: Optional[Path] = None) -> pd.DataFrame:
    """Use Gemini API to identify which parent sequences to use for entries with missing sequences.

    Each campaign (rows sharing a ``campaign_id``; missing IDs form one
    "unknown" group) is handled on its own:

    * If no enzyme in the campaign has a sequence, Gemini is asked for ONE
      seed from the sequenced enzymes of other campaigns. The seed's sequence
      is copied, or the target's mutations are applied to it, and the row is
      flagged ``gemini_cross_campaign_seed_*``.
    * For the enzymes that still lack a sequence, Gemini is asked in batches
      of ten which sequenced enzyme each derives from. A valid suggestion
      fills a missing ``parent_enzyme_id`` and adds the
      ``gemini_suggested_parent`` flag; a recorded parent is never overwritten.

    Args:
        df: Lineage table. Reads ``campaign_id``, ``enzyme_id``,
            ``protein_sequence``, ``parent_enzyme_id``, ``mutations`` and
            ``generation``; writes ``protein_sequence``, ``parent_enzyme_id``
            and ``flag``.
        debug_dir: Optional directory for the prompts and responses. It is
            created if needed, and write failures are logged, not raised.

    Returns:
        The same DataFrame object, updated in place. It is returned unchanged
        when the Gemini client is unavailable, no API key is set, or client
        configuration fails.
    """
    if not GEMINI_OK:
        log.warning("Gemini API not available (missing google.generativeai). Skipping sequence source identification.")
        return df

    if not GEMINI_API_KEY:
        log.warning("GEMINI_API_KEY not set. Skipping sequence source identification.")
        return df

    try:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-1.5-flash')
    except Exception as e:
        log.warning(f"Failed to configure Gemini API: {e}. Skipping sequence source identification.")
        return df

    # Group by campaign to process each campaign separately
    for campaign_id in df['campaign_id'].unique():
        if pd.isna(campaign_id):
            campaign_mask = df['campaign_id'].isna()
            campaign_id_str = "unknown"
        else:
            campaign_mask = df['campaign_id'] == campaign_id
            campaign_id_str = str(campaign_id)

        campaign_df = df[campaign_mask]
        empty_seq_entries, available_seq_entries = _split_campaign_by_sequence(campaign_df)

        # Skip if no empty sequences
        if not empty_seq_entries:
            continue

        total_entries = len(campaign_df)
        empty_count = len(empty_seq_entries)
        log.info(f"Campaign {campaign_id_str}: {empty_count}/{total_entries} entries have empty sequences")

        if not available_seq_entries:
            # All sequences are empty - try to find cross-campaign relationships
            log.info(f"Campaign {campaign_id_str}: All sequences are empty ({empty_count}/{total_entries}). "
                     f"Searching for cross-campaign parent relationships...")
            seeded = _seed_campaign_from_other_campaigns(
                df, model, campaign_id, campaign_id_str, empty_seq_entries, debug_dir
            )
            if seeded is None:
                # Without any sequenced enzyme the in-campaign prompt has no
                # valid source to offer, so querying further is pointless.
                log.info(f"Campaign {campaign_id_str}: No seed sequence available; skipping sequence source query")
                continue

            # The seeded enzyme now counts as a sequenced member of the campaign.
            empty_seq_entries.remove(seeded)
            available_seq_entries.append({
                "enzyme_id": seeded["enzyme_id"],
                "generation": seeded["generation"],
                "seq_length": seeded["seq_length"],
            })
            if not empty_seq_entries:
                continue

        log.info(f"Campaign {campaign_id_str}: Found {len(empty_seq_entries)}/{total_entries} entries with empty sequences. "
                 f"Querying Gemini for sequence sources...")

        identified_count = _suggest_parents_within_campaign(
            df, model, campaign_id_str, empty_seq_entries, available_seq_entries, debug_dir
        )

        if identified_count > 0:
            log.info(f"Campaign {campaign_id_str}: Successfully identified {identified_count} sequence sources using Gemini")

    return df
1122
+
1123
+
1124
+ # === 9. MAIN PROCESSOR === ---------------------------------------------------
569
1125
 
570
1126
  class SequenceProcessor:
571
1127
  """Main processor for handling the complete workflow."""
572
1128
 
573
- def __init__(self, input_csv: Path, output_csv: Path):
1129
+ def __init__(self, input_csv: Path, output_csv: Path, debug_dir: Optional[Path] = None, strict_mutation_validation: bool = True):
574
1130
  self.input_csv = input_csv
575
1131
  self.output_csv = output_csv
1132
+ self.debug_dir = debug_dir
1133
+ self.strict_mutation_validation = strict_mutation_validation
576
1134
  self.df = None
577
1135
  self.generator = None
578
1136
 
@@ -593,7 +1151,7 @@ class SequenceProcessor:
593
1151
  self.df["flag"] = ""
594
1152
 
595
1153
  # Initialize generator
596
- self.generator = SequenceGenerator(self.df)
1154
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
597
1155
 
598
1156
  def _normalize_columns(self) -> None:
599
1157
  """Automatically detect and normalize column names from different formats."""
@@ -855,7 +1413,7 @@ class SequenceProcessor:
855
1413
  self.df = self.df[campaign_mask].copy()
856
1414
 
857
1415
  # Rebuild relationships for this campaign
858
- self.generator = SequenceGenerator(self.df)
1416
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
859
1417
 
860
1418
  # Flag complex mutations
861
1419
  self.flag_complex_mutations()
@@ -866,6 +1424,17 @@ class SequenceProcessor:
866
1424
  self.process_remaining()
867
1425
  self.backward_pass()
868
1426
 
1427
+ # Use Gemini to identify parent enzymes for entries with missing sequences
1428
+ log.info(f"Identifying parents with Gemini for campaign: {campaign_id}")
1429
+ self.df = identify_parents_with_gemini(self.df)
1430
+
1431
+ # Rebuild relationships after parent identification
1432
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
1433
+
1434
+ # Try to fill sequences again after parent identification
1435
+ log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
1436
+ self.process_remaining()
1437
+
869
1438
  # Update the original dataframe with results
870
1439
  original_df.loc[campaign_mask, :] = self.df
871
1440
 
@@ -874,6 +1443,50 @@ class SequenceProcessor:
874
1443
 
875
1444
  log.info(f"Completed campaign: {campaign_id}")
876
1445
 
1446
+ # After processing all campaigns, check for any remaining empty sequences
1447
+ # and use Gemini to identify sequence sources (including cross-campaign relationships)
1448
+ empty_count = sum(self.df["protein_sequence"].str.strip() == "")
1449
+ total_count = len(self.df)
1450
+
1451
+ if empty_count > 0:
1452
+ log.info(f"Found {empty_count}/{total_count} empty sequences after initial processing. "
1453
+ "Using Gemini to identify sequence sources (including cross-campaign relationships)...")
1454
+ self.df = identify_sequence_sources_with_gemini(self.df, self.debug_dir)
1455
+
1456
+ # Process campaigns again after identifying new parent relationships
1457
+ log.info("Reprocessing campaigns after sequence source identification...")
1458
+
1459
+ for campaign_id in campaigns:
1460
+ if pd.isna(campaign_id):
1461
+ campaign_id = "unknown"
1462
+
1463
+ log.info(f"Reprocessing campaign: {campaign_id}")
1464
+
1465
+ # Filter data for this campaign
1466
+ campaign_mask = self.df['campaign_id'] == campaign_id
1467
+ if pd.isna(campaign_id):
1468
+ campaign_mask = self.df['campaign_id'].isna()
1469
+
1470
+ # Store original dataframe
1471
+ original_df = self.df
1472
+
1473
+ # Process only this campaign's data
1474
+ self.df = self.df[campaign_mask].copy()
1475
+
1476
+ # Rebuild relationships for this campaign
1477
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
1478
+
1479
+ # Try to fill sequences again
1480
+ self.process_remaining()
1481
+
1482
+ # Update the original dataframe with results
1483
+ original_df.loc[campaign_mask, :] = self.df
1484
+
1485
+ # Restore original dataframe
1486
+ self.df = original_df
1487
+
1488
+ log.info(f"Completed reprocessing campaign: {campaign_id}")
1489
+
877
1490
  # Save results
878
1491
  self.save_results()
879
1492
 
@@ -922,6 +1535,16 @@ def main(argv: Optional[List[str]] = None) -> None:
922
1535
  default=0,
923
1536
  help="Increase verbosity (use -vv for debug output)"
924
1537
  )
1538
+ parser.add_argument(
1539
+ "--debug-dir",
1540
+ type=Path,
1541
+ help="Directory to save debug information (Gemini prompts and responses)"
1542
+ )
1543
+ parser.add_argument(
1544
+ "--allow-mutation-mismatches",
1545
+ action="store_true",
1546
+ help="Allow sequence generation even when mutations don't match (default: strict validation)"
1547
+ )
925
1548
 
926
1549
  args = parser.parse_args(argv)
927
1550
 
@@ -929,7 +1552,13 @@ def main(argv: Optional[List[str]] = None) -> None:
929
1552
  setup_logging(args.verbose)
930
1553
 
931
1554
  # Process the data (format detection is automatic)
932
- processor = SequenceProcessor(args.input_csv, args.output_csv)
1555
+ strict_validation = not args.allow_mutation_mismatches
1556
+ processor = SequenceProcessor(
1557
+ args.input_csv,
1558
+ args.output_csv,
1559
+ getattr(args, 'debug_dir', None),
1560
+ strict_mutation_validation=strict_validation
1561
+ )
933
1562
  processor.run()
934
1563
 
935
1564