debase 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -203,12 +203,17 @@ class SequenceManipulator:
203
203
  return 0 if zero_matches >= one_matches else 1
204
204
 
205
205
  @classmethod
206
- def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
207
- """Apply mutations to a parent sequence."""
206
+ def apply_mutations(cls, parent_seq: str, mutation_str: str) -> Tuple[str, bool]:
207
+ """Apply mutations to a parent sequence.
208
+
209
+ Returns:
210
+ Tuple[str, bool]: (resulting_sequence, all_mutations_applied_successfully)
211
+ """
208
212
  if not parent_seq:
209
- return ""
213
+ return "", True
210
214
 
211
215
  seq = list(parent_seq)
216
+ all_mutations_successful = True
212
217
 
213
218
  # Apply point mutations
214
219
  mutations = MutationParser.parse_mutations(mutation_str)
@@ -217,19 +222,26 @@ class SequenceManipulator:
217
222
 
218
223
  for mut in mutations:
219
224
  idx = mut.position - idx_offset
225
+ mutation_applied = False
226
+
220
227
  # Try primary index
221
228
  if 0 <= idx < len(seq) and seq[idx].upper() == mut.original.upper():
222
229
  seq[idx] = mut.replacement
230
+ mutation_applied = True
223
231
  else:
224
232
  # Try alternate index
225
233
  alt_idx = mut.position - (1 - idx_offset)
226
234
  if 0 <= alt_idx < len(seq) and seq[alt_idx].upper() == mut.original.upper():
227
235
  seq[alt_idx] = mut.replacement
228
- else:
229
- log.warning(
230
- f"Mutation {mut} does not match parent sequence at "
231
- f"position {mut.position} (tried both 0- and 1-based indexing)"
232
- )
236
+ mutation_applied = True
237
+
238
+ if not mutation_applied:
239
+ log.error(
240
+ f"MUTATION MISMATCH: {mut} does not match parent sequence at "
241
+ f"position {mut.position} (tried both 0- and 1-based indexing). "
242
+ f"Parent has {seq[idx] if 0 <= idx < len(seq) else 'out-of-bounds'} at position {mut.position}"
243
+ )
244
+ all_mutations_successful = False
233
245
 
234
246
  # Apply complex C-terminal mutations
235
247
  complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
@@ -252,12 +264,13 @@ class SequenceManipulator:
252
264
  if complex_mut.extension_seq:
253
265
  seq.extend(list(complex_mut.extension_seq))
254
266
  else:
255
- log.warning(
256
- f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
267
+ log.error(
268
+ f"COMPLEX MUTATION MISMATCH: Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
257
269
  f"{complex_mut.end_pos} for sequence of length {len(seq)}"
258
270
  )
271
+ all_mutations_successful = False
259
272
 
260
- return "".join(seq)
273
+ return "".join(seq), all_mutations_successful
261
274
 
262
275
  @classmethod
263
276
  def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
@@ -400,10 +413,11 @@ class LineageNavigator:
400
413
  class SequenceGenerator:
401
414
  """Main class for generating protein sequences from mutations."""
402
415
 
403
- def __init__(self, df: pd.DataFrame):
416
+ def __init__(self, df: pd.DataFrame, strict_mutation_validation: bool = True):
404
417
  self.df = df
405
418
  self.navigator = LineageNavigator(df)
406
419
  self.manipulator = SequenceManipulator()
420
+ self.strict_mutation_validation = strict_mutation_validation
407
421
  self._update_ground_truths()
408
422
 
409
423
  def _update_ground_truths(self) -> None:
@@ -474,19 +488,62 @@ class SequenceGenerator:
474
488
  parent_id: str
475
489
  ) -> Optional[SequenceGenerationResult]:
476
490
  """Generate sequence by applying mutations to parent."""
477
- parent_row = self.df[self.df["enzyme_id"] == parent_id].iloc[0]
478
- parent_seq = parent_row.get("protein_sequence", "")
479
-
480
- if not parent_seq:
491
+ # Get the variant to find its campaign
492
+ variant_rows = self.df[self.df["enzyme_id"] == variant_id]
493
+ if variant_rows.empty:
481
494
  return None
482
495
 
483
- variant_row = self.df[self.df["enzyme_id"] == variant_id].iloc[0]
496
+ variant_row = variant_rows.iloc[0]
497
+ variant_campaign = variant_row.get("campaign_id", "")
484
498
  mutations = variant_row.get("mutations", "")
485
499
 
486
500
  if not mutations:
487
501
  return None
488
502
 
489
- sequence = self.manipulator.apply_mutations(parent_seq, mutations)
503
+ # Find parent in the same campaign first
504
+ parent_rows = self.df[
505
+ (self.df["enzyme_id"] == parent_id) &
506
+ (self.df["campaign_id"] == variant_campaign)
507
+ ]
508
+
509
+ # If not found in same campaign, fall back to any parent with that ID
510
+ if parent_rows.empty:
511
+ parent_rows = self.df[self.df["enzyme_id"] == parent_id]
512
+ if not parent_rows.empty:
513
+ log.warning(f"Parent {parent_id} not found in same campaign {variant_campaign} for variant {variant_id}, using parent from different campaign")
514
+
515
+ if parent_rows.empty:
516
+ log.error(f"Parent {parent_id} not found for variant {variant_id}")
517
+ return None
518
+
519
+ parent_row = parent_rows.iloc[0]
520
+ parent_seq = parent_row.get("protein_sequence", "")
521
+ parent_campaign = parent_row.get("campaign_id", "")
522
+
523
+ if not parent_seq:
524
+ return None
525
+
526
+ # Log which parent sequence is being used
527
+ if parent_campaign != variant_campaign:
528
+ log.info(f"Using parent {parent_id} from campaign {parent_campaign} for variant {variant_id} in campaign {variant_campaign}")
529
+ else:
530
+ log.info(f"Using parent {parent_id} from same campaign {variant_campaign} for variant {variant_id}")
531
+
532
+ sequence, mutations_successful = self.manipulator.apply_mutations(parent_seq, mutations)
533
+
534
+ if not mutations_successful:
535
+ # Check if this might be an exact match case (mutations already present in parent)
536
+ # This happens when an enzyme from another campaign is identified as both parent and exact match
537
+ if parent_id == variant_id or (mutations and parent_seq == sequence):
538
+ log.info(f"Detected exact match scenario for {variant_id} - using parent sequence directly")
539
+ sequence = parent_seq
540
+ mutations_successful = True
541
+ elif self.strict_mutation_validation:
542
+ log.error(f"STRICT MODE: Failed to apply mutations for {variant_id}: mutation mismatch detected. Not populating sequence to prevent incorrect data.")
543
+ return None
544
+ else:
545
+ log.warning(f"Mutation mismatch for {variant_id}, but proceeding with generated sequence (strict_mutation_validation=False)")
546
+ # Continue with the sequence even if mutations failed
490
547
 
491
548
  return SequenceGenerationResult(
492
549
  sequence=sequence,
@@ -548,10 +605,14 @@ class SequenceGenerator:
548
605
 
549
606
  # Generate based on direction
550
607
  if direction == "up" and parent_id and mutations:
551
- if gt_id == parent_id:
552
- return self.generate_from_parent(variant_id, parent_id)
553
- else:
554
- # Non-direct ancestor - less reliable
608
+ # Always try the declared parent first
609
+ result = self.generate_from_parent(variant_id, parent_id)
610
+ if result:
611
+ return result
612
+
613
+ # If declared parent fails, try the ground truth (if different)
614
+ if gt_id != parent_id:
615
+ log.info(f"Declared parent {parent_id} failed for {variant_id}, trying ground truth {gt_id}")
555
616
  result = self.generate_from_parent(variant_id, gt_id)
556
617
  if result:
557
618
  result.confidence = 0.7
@@ -597,11 +658,11 @@ def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
597
658
  # Find entries with empty sequences but missing parent information
598
659
  entries_needing_parents = []
599
660
  for idx, row in df.iterrows():
600
- protein_seq = str(row.get("protein_sequence", "")).strip()
661
+ protein_sequence = str(row.get("protein_sequence", "")).strip()
601
662
  parent_id = str(row.get("parent_enzyme_id", "")).strip()
602
663
 
603
664
  # Only process entries that have empty sequences AND no parent info
604
- if (not protein_seq or protein_seq == "nan") and (not parent_id or parent_id == "nan"):
665
+ if (not protein_sequence or protein_sequence.lower() in ["nan", "none", ""]) and (not parent_id or parent_id.lower() in ["nan", "none", ""]):
605
666
  enzyme_id = str(row.get("enzyme_id", ""))
606
667
  campaign_id = str(row.get("campaign_id", ""))
607
668
  generation = str(row.get("generation", ""))
@@ -624,13 +685,13 @@ def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
624
685
  for idx, row in df.iterrows():
625
686
  enzyme_id = str(row.get("enzyme_id", ""))
626
687
  campaign_id = str(row.get("campaign_id", ""))
627
- protein_seq = str(row.get("protein_sequence", "")).strip()
688
+ protein_sequence = str(row.get("protein_sequence", "")).strip()
628
689
  generation = str(row.get("generation", ""))
629
690
 
630
- if enzyme_id and enzyme_id != "nan":
691
+ if enzyme_id and enzyme_id.lower() != "nan":
631
692
  available_enzymes[enzyme_id] = {
632
693
  "campaign_id": campaign_id,
633
- "has_sequence": bool(protein_seq and protein_seq != "nan"),
694
+ "has_sequence": bool(protein_sequence and protein_sequence.lower() not in ["nan", "none", ""]),
634
695
  "generation": generation
635
696
  }
636
697
 
@@ -704,14 +765,372 @@ If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
704
765
  return df
705
766
 
706
767
 
707
- # === 8. MAIN PROCESSOR === ---------------------------------------------------
768
+ # === 8. SEQUENCE SOURCE IDENTIFICATION === -----------------------------------
769
+
770
+ def identify_sequence_sources_with_gemini(df: pd.DataFrame, debug_dir: Optional[Path] = None) -> pd.DataFrame:
771
+ """Use Gemini API to identify which parent sequences to use for entries with missing sequences."""
772
+ if not GEMINI_OK:
773
+ log.warning("Gemini API not available (missing google.generativeai). Skipping sequence source identification.")
774
+ return df
775
+
776
+ if not GEMINI_API_KEY:
777
+ log.warning("GEMINI_API_KEY not set. Skipping sequence source identification.")
778
+ return df
779
+
780
+ try:
781
+ genai.configure(api_key=GEMINI_API_KEY)
782
+ model = genai.GenerativeModel('gemini-1.5-flash')
783
+ except Exception as e:
784
+ log.warning(f"Failed to configure Gemini API: {e}. Skipping sequence source identification.")
785
+ return df
786
+
787
+ # Group by campaign to process each campaign separately
788
+ campaigns = df['campaign_id'].unique()
789
+
790
+ for campaign_id in campaigns:
791
+ if pd.isna(campaign_id):
792
+ campaign_mask = df['campaign_id'].isna()
793
+ campaign_id_str = "unknown"
794
+ else:
795
+ campaign_mask = df['campaign_id'] == campaign_id
796
+ campaign_id_str = str(campaign_id)
797
+
798
+ campaign_df = df[campaign_mask]
799
+
800
+ # Find entries with empty sequences in this campaign
801
+ empty_seq_entries = []
802
+ available_seq_entries = []
803
+
804
+ for idx, row in campaign_df.iterrows():
805
+ enzyme_id = str(row.get("enzyme_id", ""))
806
+ protein_sequence = str(row.get("protein_sequence", "")).strip()
807
+ parent_id = str(row.get("parent_enzyme_id", "")).strip()
808
+ mutations = str(row.get("mutations", "")).strip()
809
+ generation = str(row.get("generation", ""))
810
+
811
+ if not protein_sequence or protein_sequence.lower() in ["nan", "none", ""]:
812
+ empty_seq_entries.append({
813
+ "idx": idx,
814
+ "enzyme_id": enzyme_id,
815
+ "parent_id": parent_id if parent_id != "nan" else None,
816
+ "mutations": mutations if mutations != "nan" else None,
817
+ "generation": generation
818
+ })
819
+ else:
820
+ available_seq_entries.append({
821
+ "enzyme_id": enzyme_id,
822
+ "generation": generation,
823
+ "seq_length": len(protein_sequence)
824
+ })
825
+
826
+ # Skip if no empty sequences
827
+ if not empty_seq_entries:
828
+ continue
829
+
830
+ # Check if this is a partially empty situation (some have sequences, some don't)
831
+ total_entries = len(campaign_df)
832
+ empty_count = len(empty_seq_entries)
833
+
834
+ log.info(f"Campaign {campaign_id_str}: {empty_count}/{total_entries} entries have empty sequences")
835
+
836
+ if empty_count == total_entries:
837
+ # All sequences are empty - try to find cross-campaign relationships
838
+ log.info(f"Campaign {campaign_id_str}: All sequences are empty ({empty_count}/{total_entries}). "
839
+ f"Searching for cross-campaign parent relationships...")
840
+
841
+ # Get all enzymes with sequences from OTHER campaigns
842
+ other_campaigns_with_seqs = []
843
+ for other_campaign in campaigns:
844
+ if other_campaign == campaign_id or pd.isna(other_campaign):
845
+ continue
846
+ other_mask = df['campaign_id'] == other_campaign
847
+ other_df = df[other_mask]
848
+
849
+ for idx, row in other_df.iterrows():
850
+ protein_sequence = str(row.get("protein_sequence", "")).strip()
851
+ if protein_sequence and protein_sequence.lower() not in ["nan", "none", ""]:
852
+ enzyme_id = str(row.get("enzyme_id", ""))
853
+ generation = str(row.get("generation", ""))
854
+ other_campaigns_with_seqs.append({
855
+ "enzyme_id": enzyme_id,
856
+ "campaign_id": str(other_campaign),
857
+ "generation": generation,
858
+ "seq_length": len(protein_sequence)
859
+ })
860
+
861
+ if not other_campaigns_with_seqs:
862
+ log.info(f"Campaign {campaign_id_str}: No sequences found in other campaigns to use as cross-campaign parents")
863
+ continue
864
+
865
+ # Create context for cross-campaign analysis
866
+ context_lines = []
867
+ context_lines.append(f"Empty Campaign: {campaign_id_str} (all {empty_count} enzymes need sequences)")
868
+ context_lines.append(f"\nEnzymes in empty campaign:")
869
+ for entry in empty_seq_entries[:10]: # Limit for context
870
+ parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ", no parent info"
871
+ mut_info = f", mutations: {entry['mutations'][:50]}..." if entry['mutations'] and len(entry['mutations']) > 50 else f", mutations: {entry['mutations']}" if entry['mutations'] else ""
872
+ context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
873
+
874
+ context_lines.append(f"\nEnzymes with sequences from OTHER campaigns ({len(other_campaigns_with_seqs)}):")
875
+ for entry in other_campaigns_with_seqs[:15]: # Limit for context
876
+ # Get the actual sequence for this enzyme
877
+ enzyme_rows = df[df['enzyme_id'] == entry['enzyme_id']]
878
+ if not enzyme_rows.empty:
879
+ sequence = str(enzyme_rows.iloc[0]['protein_sequence'])
880
+ context_lines.append(f" - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, sequence: {sequence})")
881
+ else:
882
+ context_lines.append(f" - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")
883
+
884
+ context_text = "\n".join(context_lines)
885
+
886
+ # Find ONE good cross-campaign seed to bootstrap this campaign
887
+ log.info(f"Campaign {campaign_id_str}: Looking for ONE cross-campaign seed to bootstrap sequences...")
888
+
889
+ # Create a prompt to find the BEST single seed
890
+ prompt = f"""
891
+ Based on enzyme names, identify the SINGLE BEST seed enzyme from other campaigns to bootstrap the empty campaign.
892
+
893
+ {context_text}
894
+
895
+ From the enzymes in the EMPTY campaign, identify which ONE has the clearest match in OTHER campaigns.
896
+ Prioritize:
897
+ 1. EXACT name matches (highest priority)
898
+ 2. Simplest parent relationships (e.g., an enzyme that differs by only 1-2 mutations)
899
+ 3. Earliest generation enzymes (lower generation numbers are better seeds)
900
+
901
+ Return your response as a JSON dictionary with this exact format:
902
+ {{
903
+ "seed_enzyme": {{
904
+ "target_enzyme_id": "the enzyme ID in the empty campaign",
905
+ "relationship_type": "EXACT_MATCH" or "BEST_PARENT",
906
+ "source": {{
907
+ "campaign_id": "the campaign ID",
908
+ "enzyme_id": "the enzyme ID WITHOUT campaign suffix"
909
+ }},
910
+ "confidence": 0.1 to 1.0,
911
+ "reason": "brief explanation of why this is the best seed"
912
+ }}
913
+ }}
914
+
915
+ Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
916
+ """
917
+
918
+ try:
919
+ # Save debug information if debug_dir is provided
920
+ if debug_dir:
921
+ import time
922
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
923
+ prompt_file = debug_dir / f"cross_campaign_seed_{campaign_id_str}_prompt_{timestamp}.txt"
924
+ prompt_file.write_text(prompt)
925
+
926
+ response = model.generate_content(prompt)
927
+ response_text = response.text.strip()
928
+
929
+ # Save response if debug_dir is provided
930
+ if debug_dir:
931
+ response_file = debug_dir / f"cross_campaign_seed_{campaign_id_str}_response_{timestamp}.txt"
932
+ response_file.write_text(response_text)
933
+
934
+ # Parse the JSON response
935
+ import json
936
+ try:
937
+ # Clean the response text if it contains markdown
938
+ if '```json' in response_text:
939
+ response_text = response_text.split('```json')[1].split('```')[0].strip()
940
+ elif '```' in response_text:
941
+ response_text = response_text.split('```')[1].split('```')[0].strip()
942
+
943
+ seed_data = json.loads(response_text)
944
+ seed_info = seed_data.get('seed_enzyme', {})
945
+
946
+ if seed_info:
947
+ target_enzyme_id = seed_info.get('target_enzyme_id', '')
948
+ relationship_type = seed_info.get('relationship_type', '').upper()
949
+ source_info = seed_info.get('source', {})
950
+ source_enzyme_id = source_info.get('enzyme_id', '')
951
+ source_campaign_id = source_info.get('campaign_id', '')
952
+ confidence = float(seed_info.get('confidence', 0.5))
953
+ reason = seed_info.get('reason', '')
954
+
955
+ log.info(f"Campaign {campaign_id_str}: Found seed - {target_enzyme_id} from {source_enzyme_id} ({relationship_type}, confidence: {confidence})")
956
+ log.info(f"Reason: {reason}")
957
+
958
+ if source_enzyme_id:
959
+ # Find the source enzyme's sequence in the dataframe
960
+ source_rows = df[df['enzyme_id'] == source_enzyme_id]
961
+ if source_rows.empty:
962
+ log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
963
+ else:
964
+ source_sequence = str(source_rows.iloc[0]['protein_sequence']).strip()
965
+ if not source_sequence or source_sequence.lower() in ["nan", "none", ""]:
966
+ log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
967
+ else:
968
+ # Find the target enzyme in our empty list
969
+ seed_found = False
970
+ for entry in empty_seq_entries:
971
+ if entry['enzyme_id'] == target_enzyme_id:
972
+ if relationship_type == "EXACT_MATCH":
973
+ # Exact match - copy sequence directly
974
+ df.at[entry['idx'], 'protein_sequence'] = source_sequence
975
+ df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_exact"
976
+ log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
977
+ seed_found = True
978
+
979
+ elif relationship_type == "BEST_PARENT":
980
+ # Parent relationship - apply mutations to get the target sequence
981
+ target_mutations = entry.get('mutations', '').strip()
982
+ if target_mutations:
983
+ # Apply mutations using SequenceManipulator
984
+ manipulator = SequenceManipulator()
985
+ mutated_sequence, success = manipulator.apply_mutations(source_sequence, target_mutations)
986
+
987
+ if success:
988
+ df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
989
+ df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent"
990
+ log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
991
+ seed_found = True
992
+ else:
993
+ log.warning(f"Failed to apply mutations {target_mutations} to parent {source_enzyme_id} for {target_enzyme_id}")
994
+ else:
995
+ # No mutations - use parent sequence directly
996
+ df.at[entry['idx'], 'protein_sequence'] = source_sequence
997
+ df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent_no_mutations"
998
+ log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
999
+ seed_found = True
1000
+ break
1001
+
1002
+ if seed_found:
1003
+ log.info(f"Campaign {campaign_id_str}: Successfully set cross-campaign seed. Local processing will handle the rest.")
1004
+ else:
1005
+ log.warning(f"Campaign {campaign_id_str}: Could not find target enzyme {target_enzyme_id} in empty list")
1006
+
1007
+ except json.JSONDecodeError as e:
1008
+ log.warning(f"Failed to parse JSON response for cross-campaign seed: {e}")
1009
+ log.debug(f"Response text: {response_text}")
1010
+
1011
+ except Exception as e:
1012
+ log.warning(f"Failed to identify cross-campaign seed for {campaign_id_str}: {e}")
1013
+ continue
1014
+
1015
+ log.info(f"Campaign {campaign_id_str}: Found {empty_count}/{total_entries} entries with empty sequences. "
1016
+ f"Querying Gemini for sequence sources...")
1017
+
1018
+ # Create context for Gemini
1019
+ context_lines = []
1020
+ context_lines.append(f"Campaign: {campaign_id_str}")
1021
+ context_lines.append(f"\nEnzymes WITH sequences ({len(available_seq_entries)}):")
1022
+ for entry in available_seq_entries[:15]: # Limit to first 15 for context
1023
+ context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")
1024
+
1025
+ context_lines.append(f"\nEnzymes WITHOUT sequences ({len(empty_seq_entries)}):")
1026
+ for entry in empty_seq_entries[:15]: # Limit to first 15 for context
1027
+ parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ", no parent info"
1028
+ mut_info = f", mutations: {entry['mutations'][:50]}..." if entry['mutations'] and len(entry['mutations']) > 50 else f", mutations: {entry['mutations']}" if entry['mutations'] else ""
1029
+ context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
1030
+
1031
+ context_text = "\n".join(context_lines)
1032
+
1033
+ # Process in batches if there are many empty sequences
1034
+ batch_size = 10
1035
+ identified_count = 0
1036
+
1037
+ for i in range(0, len(empty_seq_entries), batch_size):
1038
+ batch = empty_seq_entries[i:i+batch_size]
1039
+
1040
+ # Create batch request
1041
+ batch_request = []
1042
+ for entry in batch:
1043
+ parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ""
1044
+ mut_info = f", mutations: {entry['mutations']}" if entry['mutations'] else ""
1045
+ batch_request.append(f"{entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
1046
+
1047
+ prompt = f"""
1048
+ Based on the enzyme lineage information provided, identify which enzyme sequences should be used as the source to calculate sequences for the enzymes without sequences.
1049
+
1050
+ {context_text}
1051
+
1052
+ For each of these enzymes without sequences, identify which enzyme WITH a sequence should be used as the source:
1053
+ {chr(10).join(batch_request)}
1054
+
1055
+ Instructions:
1056
+ 1. If an enzyme has a parent_id and mutations, suggest using the parent's sequence
1057
+ 2. If an enzyme has no parent_id, look for the most logical ancestor or related enzyme with a sequence
1058
+ 3. Consider the generation numbers and enzyme naming patterns
1059
+ 4. Only suggest enzymes that actually have sequences
1060
+
1061
+ Please provide your response in this format:
1062
+ enzyme_id -> source_enzyme_id
1063
+ enzyme_id -> source_enzyme_id
1064
+ ...
1065
+
1066
+ If you cannot identify a suitable source, use "None" as the source_enzyme_id.
1067
+ """
1068
+
1069
+ try:
1070
+ # Save debug information if debug_dir is provided
1071
+ if debug_dir:
1072
+ import time
1073
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
1074
+ prompt_file = debug_dir / f"sequence_source_{campaign_id_str}_prompt_{timestamp}.txt"
1075
+ prompt_file.write_text(prompt)
1076
+
1077
+ response = model.generate_content(prompt)
1078
+ response_text = response.text.strip()
1079
+
1080
+ # Save response if debug_dir is provided
1081
+ if debug_dir:
1082
+ response_file = debug_dir / f"sequence_source_{campaign_id_str}_response_{timestamp}.txt"
1083
+ response_file.write_text(response_text)
1084
+
1085
+ # Parse the response
1086
+ for line in response_text.split('\n'):
1087
+ if '->' in line:
1088
+ parts = line.split('->')
1089
+ if len(parts) == 2:
1090
+ target_enzyme = parts[0].strip()
1091
+ source_enzyme = parts[1].strip()
1092
+
1093
+ if source_enzyme and source_enzyme != "None":
1094
+ # Find the target enzyme in our batch
1095
+ for entry in batch:
1096
+ if entry['enzyme_id'] == target_enzyme:
1097
+ # Verify the source enzyme exists and has a sequence
1098
+ source_rows = df[df['enzyme_id'] == source_enzyme]
1099
+ if not source_rows.empty:
1100
+ source_seq = source_rows.iloc[0]['protein_sequence']
1101
+ if source_seq and str(source_seq).strip() and str(source_seq) != "nan":
1102
+ # Update the parent_enzyme_id if it's missing
1103
+ if not entry['parent_id']:
1104
+ df.at[entry['idx'], 'parent_enzyme_id'] = source_enzyme
1105
+ df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_suggested_parent"
1106
+ identified_count += 1
1107
+ log.info(f"Set {source_enzyme} as parent for {target_enzyme} (Gemini suggestion)")
1108
+ elif entry['parent_id'] != source_enzyme:
1109
+ # Log if Gemini suggests a different parent than what's recorded
1110
+ log.info(f"Gemini suggests {source_enzyme} as source for {target_enzyme}, "
1111
+ f"but parent is recorded as {entry['parent_id']}")
1112
+ break
1113
+
1114
+ except Exception as e:
1115
+ log.warning(f"Failed to identify sequence sources for batch {i//batch_size + 1}: {e}")
1116
+ continue
1117
+
1118
+ if identified_count > 0:
1119
+ log.info(f"Campaign {campaign_id_str}: Successfully identified {identified_count} sequence sources using Gemini")
1120
+
1121
+ return df
1122
+
1123
+
1124
+ # === 9. MAIN PROCESSOR === ---------------------------------------------------
708
1125
 
709
1126
  class SequenceProcessor:
710
1127
  """Main processor for handling the complete workflow."""
711
1128
 
712
- def __init__(self, input_csv: Path, output_csv: Path):
1129
+ def __init__(self, input_csv: Path, output_csv: Path, debug_dir: Optional[Path] = None, strict_mutation_validation: bool = True):
713
1130
  self.input_csv = input_csv
714
1131
  self.output_csv = output_csv
1132
+ self.debug_dir = debug_dir
1133
+ self.strict_mutation_validation = strict_mutation_validation
715
1134
  self.df = None
716
1135
  self.generator = None
717
1136
 
@@ -732,7 +1151,7 @@ class SequenceProcessor:
732
1151
  self.df["flag"] = ""
733
1152
 
734
1153
  # Initialize generator
735
- self.generator = SequenceGenerator(self.df)
1154
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
736
1155
 
737
1156
  def _normalize_columns(self) -> None:
738
1157
  """Automatically detect and normalize column names from different formats."""
@@ -994,7 +1413,7 @@ class SequenceProcessor:
994
1413
  self.df = self.df[campaign_mask].copy()
995
1414
 
996
1415
  # Rebuild relationships for this campaign
997
- self.generator = SequenceGenerator(self.df)
1416
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
998
1417
 
999
1418
  # Flag complex mutations
1000
1419
  self.flag_complex_mutations()
@@ -1010,7 +1429,7 @@ class SequenceProcessor:
1010
1429
  self.df = identify_parents_with_gemini(self.df)
1011
1430
 
1012
1431
  # Rebuild relationships after parent identification
1013
- self.generator = SequenceGenerator(self.df)
1432
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
1014
1433
 
1015
1434
  # Try to fill sequences again after parent identification
1016
1435
  log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
@@ -1024,6 +1443,50 @@ class SequenceProcessor:
1024
1443
 
1025
1444
  log.info(f"Completed campaign: {campaign_id}")
1026
1445
 
1446
+ # After processing all campaigns, check for any remaining empty sequences
1447
+ # and use Gemini to identify sequence sources (including cross-campaign relationships)
1448
+ empty_count = sum(self.df["protein_sequence"].str.strip() == "")
1449
+ total_count = len(self.df)
1450
+
1451
+ if empty_count > 0:
1452
+ log.info(f"Found {empty_count}/{total_count} empty sequences after initial processing. "
1453
+ "Using Gemini to identify sequence sources (including cross-campaign relationships)...")
1454
+ self.df = identify_sequence_sources_with_gemini(self.df, self.debug_dir)
1455
+
1456
+ # Process campaigns again after identifying new parent relationships
1457
+ log.info("Reprocessing campaigns after sequence source identification...")
1458
+
1459
+ for campaign_id in campaigns:
1460
+ if pd.isna(campaign_id):
1461
+ campaign_id = "unknown"
1462
+
1463
+ log.info(f"Reprocessing campaign: {campaign_id}")
1464
+
1465
+ # Filter data for this campaign
1466
+ campaign_mask = self.df['campaign_id'] == campaign_id
1467
+ if pd.isna(campaign_id):
1468
+ campaign_mask = self.df['campaign_id'].isna()
1469
+
1470
+ # Store original dataframe
1471
+ original_df = self.df
1472
+
1473
+ # Process only this campaign's data
1474
+ self.df = self.df[campaign_mask].copy()
1475
+
1476
+ # Rebuild relationships for this campaign
1477
+ self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
1478
+
1479
+ # Try to fill sequences again
1480
+ self.process_remaining()
1481
+
1482
+ # Update the original dataframe with results
1483
+ original_df.loc[campaign_mask, :] = self.df
1484
+
1485
+ # Restore original dataframe
1486
+ self.df = original_df
1487
+
1488
+ log.info(f"Completed reprocessing campaign: {campaign_id}")
1489
+
1027
1490
  # Save results
1028
1491
  self.save_results()
1029
1492
 
@@ -1072,6 +1535,16 @@ def main(argv: Optional[List[str]] = None) -> None:
1072
1535
  default=0,
1073
1536
  help="Increase verbosity (use -vv for debug output)"
1074
1537
  )
1538
+ parser.add_argument(
1539
+ "--debug-dir",
1540
+ type=Path,
1541
+ help="Directory to save debug information (Gemini prompts and responses)"
1542
+ )
1543
+ parser.add_argument(
1544
+ "--allow-mutation-mismatches",
1545
+ action="store_true",
1546
+ help="Allow sequence generation even when mutations don't match (default: strict validation)"
1547
+ )
1075
1548
 
1076
1549
  args = parser.parse_args(argv)
1077
1550
 
@@ -1079,7 +1552,13 @@ def main(argv: Optional[List[str]] = None) -> None:
1079
1552
  setup_logging(args.verbose)
1080
1553
 
1081
1554
  # Process the data (format detection is automatic)
1082
- processor = SequenceProcessor(args.input_csv, args.output_csv)
1555
+ strict_validation = not args.allow_mutation_mismatches
1556
+ processor = SequenceProcessor(
1557
+ args.input_csv,
1558
+ args.output_csv,
1559
+ getattr(args, 'debug_dir', None),
1560
+ strict_mutation_validation=strict_validation
1561
+ )
1083
1562
  processor.run()
1084
1563
 
1085
1564