debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +512 -33
- debase/enzyme_lineage_extractor.py +985 -100
- debase/lineage_format.py +226 -13
- debase/reaction_info_extractor.py +178 -34
- debase/substrate_scope_extractor.py +52 -4
- debase/wrapper.py +155 -151
- debase-0.4.5.dist-info/METADATA +121 -0
- debase-0.4.5.dist-info/RECORD +16 -0
- debase-0.4.3.dist-info/METADATA +0 -296
- debase-0.4.3.dist-info/RECORD +0 -16
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.3.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0
debase/cleanup_sequence.py
CHANGED
@@ -203,12 +203,17 @@ class SequenceManipulator:
|
|
203
203
|
return 0 if zero_matches >= one_matches else 1
|
204
204
|
|
205
205
|
@classmethod
|
206
|
-
def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
|
207
|
-
"""Apply mutations to a parent sequence.
|
206
|
+
def apply_mutations(cls, parent_seq: str, mutation_str: str) -> Tuple[str, bool]:
|
207
|
+
"""Apply mutations to a parent sequence.
|
208
|
+
|
209
|
+
Returns:
|
210
|
+
Tuple[str, bool]: (resulting_sequence, all_mutations_applied_successfully)
|
211
|
+
"""
|
208
212
|
if not parent_seq:
|
209
|
-
return ""
|
213
|
+
return "", True
|
210
214
|
|
211
215
|
seq = list(parent_seq)
|
216
|
+
all_mutations_successful = True
|
212
217
|
|
213
218
|
# Apply point mutations
|
214
219
|
mutations = MutationParser.parse_mutations(mutation_str)
|
@@ -217,19 +222,26 @@ class SequenceManipulator:
|
|
217
222
|
|
218
223
|
for mut in mutations:
|
219
224
|
idx = mut.position - idx_offset
|
225
|
+
mutation_applied = False
|
226
|
+
|
220
227
|
# Try primary index
|
221
228
|
if 0 <= idx < len(seq) and seq[idx].upper() == mut.original.upper():
|
222
229
|
seq[idx] = mut.replacement
|
230
|
+
mutation_applied = True
|
223
231
|
else:
|
224
232
|
# Try alternate index
|
225
233
|
alt_idx = mut.position - (1 - idx_offset)
|
226
234
|
if 0 <= alt_idx < len(seq) and seq[alt_idx].upper() == mut.original.upper():
|
227
235
|
seq[alt_idx] = mut.replacement
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
236
|
+
mutation_applied = True
|
237
|
+
|
238
|
+
if not mutation_applied:
|
239
|
+
log.error(
|
240
|
+
f"MUTATION MISMATCH: {mut} does not match parent sequence at "
|
241
|
+
f"position {mut.position} (tried both 0- and 1-based indexing). "
|
242
|
+
f"Parent has {seq[idx] if 0 <= idx < len(seq) else 'out-of-bounds'} at position {mut.position}"
|
243
|
+
)
|
244
|
+
all_mutations_successful = False
|
233
245
|
|
234
246
|
# Apply complex C-terminal mutations
|
235
247
|
complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
|
@@ -252,12 +264,13 @@ class SequenceManipulator:
|
|
252
264
|
if complex_mut.extension_seq:
|
253
265
|
seq.extend(list(complex_mut.extension_seq))
|
254
266
|
else:
|
255
|
-
log.
|
256
|
-
f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
|
267
|
+
log.error(
|
268
|
+
f"COMPLEX MUTATION MISMATCH: Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
|
257
269
|
f"{complex_mut.end_pos} for sequence of length {len(seq)}"
|
258
270
|
)
|
271
|
+
all_mutations_successful = False
|
259
272
|
|
260
|
-
return "".join(seq)
|
273
|
+
return "".join(seq), all_mutations_successful
|
261
274
|
|
262
275
|
@classmethod
|
263
276
|
def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
|
@@ -400,10 +413,11 @@ class LineageNavigator:
|
|
400
413
|
class SequenceGenerator:
|
401
414
|
"""Main class for generating protein sequences from mutations."""
|
402
415
|
|
403
|
-
def __init__(self, df: pd.DataFrame):
|
416
|
+
def __init__(self, df: pd.DataFrame, strict_mutation_validation: bool = True):
|
404
417
|
self.df = df
|
405
418
|
self.navigator = LineageNavigator(df)
|
406
419
|
self.manipulator = SequenceManipulator()
|
420
|
+
self.strict_mutation_validation = strict_mutation_validation
|
407
421
|
self._update_ground_truths()
|
408
422
|
|
409
423
|
def _update_ground_truths(self) -> None:
|
@@ -474,19 +488,62 @@ class SequenceGenerator:
|
|
474
488
|
parent_id: str
|
475
489
|
) -> Optional[SequenceGenerationResult]:
|
476
490
|
"""Generate sequence by applying mutations to parent."""
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
if not parent_seq:
|
491
|
+
# Get the variant to find its campaign
|
492
|
+
variant_rows = self.df[self.df["enzyme_id"] == variant_id]
|
493
|
+
if variant_rows.empty:
|
481
494
|
return None
|
482
495
|
|
483
|
-
variant_row =
|
496
|
+
variant_row = variant_rows.iloc[0]
|
497
|
+
variant_campaign = variant_row.get("campaign_id", "")
|
484
498
|
mutations = variant_row.get("mutations", "")
|
485
499
|
|
486
500
|
if not mutations:
|
487
501
|
return None
|
488
502
|
|
489
|
-
|
503
|
+
# Find parent in the same campaign first
|
504
|
+
parent_rows = self.df[
|
505
|
+
(self.df["enzyme_id"] == parent_id) &
|
506
|
+
(self.df["campaign_id"] == variant_campaign)
|
507
|
+
]
|
508
|
+
|
509
|
+
# If not found in same campaign, fall back to any parent with that ID
|
510
|
+
if parent_rows.empty:
|
511
|
+
parent_rows = self.df[self.df["enzyme_id"] == parent_id]
|
512
|
+
if not parent_rows.empty:
|
513
|
+
log.warning(f"Parent {parent_id} not found in same campaign {variant_campaign} for variant {variant_id}, using parent from different campaign")
|
514
|
+
|
515
|
+
if parent_rows.empty:
|
516
|
+
log.error(f"Parent {parent_id} not found for variant {variant_id}")
|
517
|
+
return None
|
518
|
+
|
519
|
+
parent_row = parent_rows.iloc[0]
|
520
|
+
parent_seq = parent_row.get("protein_sequence", "")
|
521
|
+
parent_campaign = parent_row.get("campaign_id", "")
|
522
|
+
|
523
|
+
if not parent_seq:
|
524
|
+
return None
|
525
|
+
|
526
|
+
# Log which parent sequence is being used
|
527
|
+
if parent_campaign != variant_campaign:
|
528
|
+
log.info(f"Using parent {parent_id} from campaign {parent_campaign} for variant {variant_id} in campaign {variant_campaign}")
|
529
|
+
else:
|
530
|
+
log.info(f"Using parent {parent_id} from same campaign {variant_campaign} for variant {variant_id}")
|
531
|
+
|
532
|
+
sequence, mutations_successful = self.manipulator.apply_mutations(parent_seq, mutations)
|
533
|
+
|
534
|
+
if not mutations_successful:
|
535
|
+
# Check if this might be an exact match case (mutations already present in parent)
|
536
|
+
# This happens when an enzyme from another campaign is identified as both parent and exact match
|
537
|
+
if parent_id == variant_id or (mutations and parent_seq == sequence):
|
538
|
+
log.info(f"Detected exact match scenario for {variant_id} - using parent sequence directly")
|
539
|
+
sequence = parent_seq
|
540
|
+
mutations_successful = True
|
541
|
+
elif self.strict_mutation_validation:
|
542
|
+
log.error(f"STRICT MODE: Failed to apply mutations for {variant_id}: mutation mismatch detected. Not populating sequence to prevent incorrect data.")
|
543
|
+
return None
|
544
|
+
else:
|
545
|
+
log.warning(f"Mutation mismatch for {variant_id}, but proceeding with generated sequence (strict_mutation_validation=False)")
|
546
|
+
# Continue with the sequence even if mutations failed
|
490
547
|
|
491
548
|
return SequenceGenerationResult(
|
492
549
|
sequence=sequence,
|
@@ -548,10 +605,14 @@ class SequenceGenerator:
|
|
548
605
|
|
549
606
|
# Generate based on direction
|
550
607
|
if direction == "up" and parent_id and mutations:
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
608
|
+
# Always try the declared parent first
|
609
|
+
result = self.generate_from_parent(variant_id, parent_id)
|
610
|
+
if result:
|
611
|
+
return result
|
612
|
+
|
613
|
+
# If declared parent fails, try the ground truth (if different)
|
614
|
+
if gt_id != parent_id:
|
615
|
+
log.info(f"Declared parent {parent_id} failed for {variant_id}, trying ground truth {gt_id}")
|
555
616
|
result = self.generate_from_parent(variant_id, gt_id)
|
556
617
|
if result:
|
557
618
|
result.confidence = 0.7
|
@@ -597,11 +658,11 @@ def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
|
|
597
658
|
# Find entries with empty sequences but missing parent information
|
598
659
|
entries_needing_parents = []
|
599
660
|
for idx, row in df.iterrows():
|
600
|
-
|
661
|
+
protein_sequence = str(row.get("protein_sequence", "")).strip()
|
601
662
|
parent_id = str(row.get("parent_enzyme_id", "")).strip()
|
602
663
|
|
603
664
|
# Only process entries that have empty sequences AND no parent info
|
604
|
-
if (not
|
665
|
+
if (not protein_sequence or protein_sequence.lower() in ["nan", "none", ""]) and (not parent_id or parent_id.lower() in ["nan", "none", ""]):
|
605
666
|
enzyme_id = str(row.get("enzyme_id", ""))
|
606
667
|
campaign_id = str(row.get("campaign_id", ""))
|
607
668
|
generation = str(row.get("generation", ""))
|
@@ -624,13 +685,13 @@ def identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
|
|
624
685
|
for idx, row in df.iterrows():
|
625
686
|
enzyme_id = str(row.get("enzyme_id", ""))
|
626
687
|
campaign_id = str(row.get("campaign_id", ""))
|
627
|
-
|
688
|
+
protein_sequence = str(row.get("protein_sequence", "")).strip()
|
628
689
|
generation = str(row.get("generation", ""))
|
629
690
|
|
630
|
-
if enzyme_id and enzyme_id != "nan":
|
691
|
+
if enzyme_id and enzyme_id.lower() != "nan":
|
631
692
|
available_enzymes[enzyme_id] = {
|
632
693
|
"campaign_id": campaign_id,
|
633
|
-
"has_sequence": bool(
|
694
|
+
"has_sequence": bool(protein_sequence and protein_sequence.lower() not in ["nan", "none", ""]),
|
634
695
|
"generation": generation
|
635
696
|
}
|
636
697
|
|
@@ -704,14 +765,372 @@ If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
|
|
704
765
|
return df
|
705
766
|
|
706
767
|
|
707
|
-
# === 8.
|
768
|
+
# === 8. SEQUENCE SOURCE IDENTIFICATION === -----------------------------------
|
769
|
+
|
770
|
+
def identify_sequence_sources_with_gemini(df: pd.DataFrame, debug_dir: Optional[Path] = None) -> pd.DataFrame:
|
771
|
+
"""Use Gemini API to identify which parent sequences to use for entries with missing sequences."""
|
772
|
+
if not GEMINI_OK:
|
773
|
+
log.warning("Gemini API not available (missing google.generativeai). Skipping sequence source identification.")
|
774
|
+
return df
|
775
|
+
|
776
|
+
if not GEMINI_API_KEY:
|
777
|
+
log.warning("GEMINI_API_KEY not set. Skipping sequence source identification.")
|
778
|
+
return df
|
779
|
+
|
780
|
+
try:
|
781
|
+
genai.configure(api_key=GEMINI_API_KEY)
|
782
|
+
model = genai.GenerativeModel('gemini-1.5-flash')
|
783
|
+
except Exception as e:
|
784
|
+
log.warning(f"Failed to configure Gemini API: {e}. Skipping sequence source identification.")
|
785
|
+
return df
|
786
|
+
|
787
|
+
# Group by campaign to process each campaign separately
|
788
|
+
campaigns = df['campaign_id'].unique()
|
789
|
+
|
790
|
+
for campaign_id in campaigns:
|
791
|
+
if pd.isna(campaign_id):
|
792
|
+
campaign_mask = df['campaign_id'].isna()
|
793
|
+
campaign_id_str = "unknown"
|
794
|
+
else:
|
795
|
+
campaign_mask = df['campaign_id'] == campaign_id
|
796
|
+
campaign_id_str = str(campaign_id)
|
797
|
+
|
798
|
+
campaign_df = df[campaign_mask]
|
799
|
+
|
800
|
+
# Find entries with empty sequences in this campaign
|
801
|
+
empty_seq_entries = []
|
802
|
+
available_seq_entries = []
|
803
|
+
|
804
|
+
for idx, row in campaign_df.iterrows():
|
805
|
+
enzyme_id = str(row.get("enzyme_id", ""))
|
806
|
+
protein_sequence = str(row.get("protein_sequence", "")).strip()
|
807
|
+
parent_id = str(row.get("parent_enzyme_id", "")).strip()
|
808
|
+
mutations = str(row.get("mutations", "")).strip()
|
809
|
+
generation = str(row.get("generation", ""))
|
810
|
+
|
811
|
+
if not protein_sequence or protein_sequence.lower() in ["nan", "none", ""]:
|
812
|
+
empty_seq_entries.append({
|
813
|
+
"idx": idx,
|
814
|
+
"enzyme_id": enzyme_id,
|
815
|
+
"parent_id": parent_id if parent_id != "nan" else None,
|
816
|
+
"mutations": mutations if mutations != "nan" else None,
|
817
|
+
"generation": generation
|
818
|
+
})
|
819
|
+
else:
|
820
|
+
available_seq_entries.append({
|
821
|
+
"enzyme_id": enzyme_id,
|
822
|
+
"generation": generation,
|
823
|
+
"seq_length": len(protein_sequence)
|
824
|
+
})
|
825
|
+
|
826
|
+
# Skip if no empty sequences
|
827
|
+
if not empty_seq_entries:
|
828
|
+
continue
|
829
|
+
|
830
|
+
# Check if this is a partially empty situation (some have sequences, some don't)
|
831
|
+
total_entries = len(campaign_df)
|
832
|
+
empty_count = len(empty_seq_entries)
|
833
|
+
|
834
|
+
log.info(f"Campaign {campaign_id_str}: {empty_count}/{total_entries} entries have empty sequences")
|
835
|
+
|
836
|
+
if empty_count == total_entries:
|
837
|
+
# All sequences are empty - try to find cross-campaign relationships
|
838
|
+
log.info(f"Campaign {campaign_id_str}: All sequences are empty ({empty_count}/{total_entries}). "
|
839
|
+
f"Searching for cross-campaign parent relationships...")
|
840
|
+
|
841
|
+
# Get all enzymes with sequences from OTHER campaigns
|
842
|
+
other_campaigns_with_seqs = []
|
843
|
+
for other_campaign in campaigns:
|
844
|
+
if other_campaign == campaign_id or pd.isna(other_campaign):
|
845
|
+
continue
|
846
|
+
other_mask = df['campaign_id'] == other_campaign
|
847
|
+
other_df = df[other_mask]
|
848
|
+
|
849
|
+
for idx, row in other_df.iterrows():
|
850
|
+
protein_sequence = str(row.get("protein_sequence", "")).strip()
|
851
|
+
if protein_sequence and protein_sequence.lower() not in ["nan", "none", ""]:
|
852
|
+
enzyme_id = str(row.get("enzyme_id", ""))
|
853
|
+
generation = str(row.get("generation", ""))
|
854
|
+
other_campaigns_with_seqs.append({
|
855
|
+
"enzyme_id": enzyme_id,
|
856
|
+
"campaign_id": str(other_campaign),
|
857
|
+
"generation": generation,
|
858
|
+
"seq_length": len(protein_sequence)
|
859
|
+
})
|
860
|
+
|
861
|
+
if not other_campaigns_with_seqs:
|
862
|
+
log.info(f"Campaign {campaign_id_str}: No sequences found in other campaigns to use as cross-campaign parents")
|
863
|
+
continue
|
864
|
+
|
865
|
+
# Create context for cross-campaign analysis
|
866
|
+
context_lines = []
|
867
|
+
context_lines.append(f"Empty Campaign: {campaign_id_str} (all {empty_count} enzymes need sequences)")
|
868
|
+
context_lines.append(f"\nEnzymes in empty campaign:")
|
869
|
+
for entry in empty_seq_entries[:10]: # Limit for context
|
870
|
+
parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ", no parent info"
|
871
|
+
mut_info = f", mutations: {entry['mutations'][:50]}..." if entry['mutations'] and len(entry['mutations']) > 50 else f", mutations: {entry['mutations']}" if entry['mutations'] else ""
|
872
|
+
context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
|
873
|
+
|
874
|
+
context_lines.append(f"\nEnzymes with sequences from OTHER campaigns ({len(other_campaigns_with_seqs)}):")
|
875
|
+
for entry in other_campaigns_with_seqs[:15]: # Limit for context
|
876
|
+
# Get the actual sequence for this enzyme
|
877
|
+
enzyme_rows = df[df['enzyme_id'] == entry['enzyme_id']]
|
878
|
+
if not enzyme_rows.empty:
|
879
|
+
sequence = str(enzyme_rows.iloc[0]['protein_sequence'])
|
880
|
+
context_lines.append(f" - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, sequence: {sequence})")
|
881
|
+
else:
|
882
|
+
context_lines.append(f" - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")
|
883
|
+
|
884
|
+
context_text = "\n".join(context_lines)
|
885
|
+
|
886
|
+
# Find ONE good cross-campaign seed to bootstrap this campaign
|
887
|
+
log.info(f"Campaign {campaign_id_str}: Looking for ONE cross-campaign seed to bootstrap sequences...")
|
888
|
+
|
889
|
+
# Create a prompt to find the BEST single seed
|
890
|
+
prompt = f"""
|
891
|
+
Based on enzyme names, identify the SINGLE BEST seed enzyme from other campaigns to bootstrap the empty campaign.
|
892
|
+
|
893
|
+
{context_text}
|
894
|
+
|
895
|
+
From the enzymes in the EMPTY campaign, identify which ONE has the clearest match in OTHER campaigns.
|
896
|
+
Prioritize:
|
897
|
+
1. EXACT name matches (highest priority)
|
898
|
+
2. Simplest parent relationships (e.g., an enzyme that differs by only 1-2 mutations)
|
899
|
+
3. Earliest generation enzymes (lower generation numbers are better seeds)
|
900
|
+
|
901
|
+
Return your response as a JSON dictionary with this exact format:
|
902
|
+
{{
|
903
|
+
"seed_enzyme": {{
|
904
|
+
"target_enzyme_id": "the enzyme ID in the empty campaign",
|
905
|
+
"relationship_type": "EXACT_MATCH" or "BEST_PARENT",
|
906
|
+
"source": {{
|
907
|
+
"campaign_id": "the campaign ID",
|
908
|
+
"enzyme_id": "the enzyme ID WITHOUT campaign suffix"
|
909
|
+
}},
|
910
|
+
"confidence": 0.1 to 1.0,
|
911
|
+
"reason": "brief explanation of why this is the best seed"
|
912
|
+
}}
|
913
|
+
}}
|
914
|
+
|
915
|
+
Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
|
916
|
+
"""
|
917
|
+
|
918
|
+
try:
|
919
|
+
# Save debug information if debug_dir is provided
|
920
|
+
if debug_dir:
|
921
|
+
import time
|
922
|
+
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
923
|
+
prompt_file = debug_dir / f"cross_campaign_seed_{campaign_id_str}_prompt_{timestamp}.txt"
|
924
|
+
prompt_file.write_text(prompt)
|
925
|
+
|
926
|
+
response = model.generate_content(prompt)
|
927
|
+
response_text = response.text.strip()
|
928
|
+
|
929
|
+
# Save response if debug_dir is provided
|
930
|
+
if debug_dir:
|
931
|
+
response_file = debug_dir / f"cross_campaign_seed_{campaign_id_str}_response_{timestamp}.txt"
|
932
|
+
response_file.write_text(response_text)
|
933
|
+
|
934
|
+
# Parse the JSON response
|
935
|
+
import json
|
936
|
+
try:
|
937
|
+
# Clean the response text if it contains markdown
|
938
|
+
if '```json' in response_text:
|
939
|
+
response_text = response_text.split('```json')[1].split('```')[0].strip()
|
940
|
+
elif '```' in response_text:
|
941
|
+
response_text = response_text.split('```')[1].split('```')[0].strip()
|
942
|
+
|
943
|
+
seed_data = json.loads(response_text)
|
944
|
+
seed_info = seed_data.get('seed_enzyme', {})
|
945
|
+
|
946
|
+
if seed_info:
|
947
|
+
target_enzyme_id = seed_info.get('target_enzyme_id', '')
|
948
|
+
relationship_type = seed_info.get('relationship_type', '').upper()
|
949
|
+
source_info = seed_info.get('source', {})
|
950
|
+
source_enzyme_id = source_info.get('enzyme_id', '')
|
951
|
+
source_campaign_id = source_info.get('campaign_id', '')
|
952
|
+
confidence = float(seed_info.get('confidence', 0.5))
|
953
|
+
reason = seed_info.get('reason', '')
|
954
|
+
|
955
|
+
log.info(f"Campaign {campaign_id_str}: Found seed - {target_enzyme_id} from {source_enzyme_id} ({relationship_type}, confidence: {confidence})")
|
956
|
+
log.info(f"Reason: {reason}")
|
957
|
+
|
958
|
+
if source_enzyme_id:
|
959
|
+
# Find the source enzyme's sequence in the dataframe
|
960
|
+
source_rows = df[df['enzyme_id'] == source_enzyme_id]
|
961
|
+
if source_rows.empty:
|
962
|
+
log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
|
963
|
+
else:
|
964
|
+
source_sequence = str(source_rows.iloc[0]['protein_sequence']).strip()
|
965
|
+
if not source_sequence or source_sequence.lower() in ["nan", "none", ""]:
|
966
|
+
log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
|
967
|
+
else:
|
968
|
+
# Find the target enzyme in our empty list
|
969
|
+
seed_found = False
|
970
|
+
for entry in empty_seq_entries:
|
971
|
+
if entry['enzyme_id'] == target_enzyme_id:
|
972
|
+
if relationship_type == "EXACT_MATCH":
|
973
|
+
# Exact match - copy sequence directly
|
974
|
+
df.at[entry['idx'], 'protein_sequence'] = source_sequence
|
975
|
+
df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_exact"
|
976
|
+
log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
|
977
|
+
seed_found = True
|
978
|
+
|
979
|
+
elif relationship_type == "BEST_PARENT":
|
980
|
+
# Parent relationship - apply mutations to get the target sequence
|
981
|
+
target_mutations = entry.get('mutations', '').strip()
|
982
|
+
if target_mutations:
|
983
|
+
# Apply mutations using SequenceManipulator
|
984
|
+
manipulator = SequenceManipulator()
|
985
|
+
mutated_sequence, success = manipulator.apply_mutations(source_sequence, target_mutations)
|
986
|
+
|
987
|
+
if success:
|
988
|
+
df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
|
989
|
+
df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent"
|
990
|
+
log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
|
991
|
+
seed_found = True
|
992
|
+
else:
|
993
|
+
log.warning(f"Failed to apply mutations {target_mutations} to parent {source_enzyme_id} for {target_enzyme_id}")
|
994
|
+
else:
|
995
|
+
# No mutations - use parent sequence directly
|
996
|
+
df.at[entry['idx'], 'protein_sequence'] = source_sequence
|
997
|
+
df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_cross_campaign_seed_parent_no_mutations"
|
998
|
+
log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
|
999
|
+
seed_found = True
|
1000
|
+
break
|
1001
|
+
|
1002
|
+
if seed_found:
|
1003
|
+
log.info(f"Campaign {campaign_id_str}: Successfully set cross-campaign seed. Local processing will handle the rest.")
|
1004
|
+
else:
|
1005
|
+
log.warning(f"Campaign {campaign_id_str}: Could not find target enzyme {target_enzyme_id} in empty list")
|
1006
|
+
|
1007
|
+
except json.JSONDecodeError as e:
|
1008
|
+
log.warning(f"Failed to parse JSON response for cross-campaign seed: {e}")
|
1009
|
+
log.debug(f"Response text: {response_text}")
|
1010
|
+
|
1011
|
+
except Exception as e:
|
1012
|
+
log.warning(f"Failed to identify cross-campaign seed for {campaign_id_str}: {e}")
|
1013
|
+
continue
|
1014
|
+
|
1015
|
+
log.info(f"Campaign {campaign_id_str}: Found {empty_count}/{total_entries} entries with empty sequences. "
|
1016
|
+
f"Querying Gemini for sequence sources...")
|
1017
|
+
|
1018
|
+
# Create context for Gemini
|
1019
|
+
context_lines = []
|
1020
|
+
context_lines.append(f"Campaign: {campaign_id_str}")
|
1021
|
+
context_lines.append(f"\nEnzymes WITH sequences ({len(available_seq_entries)}):")
|
1022
|
+
for entry in available_seq_entries[:15]: # Limit to first 15 for context
|
1023
|
+
context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")
|
1024
|
+
|
1025
|
+
context_lines.append(f"\nEnzymes WITHOUT sequences ({len(empty_seq_entries)}):")
|
1026
|
+
for entry in empty_seq_entries[:15]: # Limit to first 15 for context
|
1027
|
+
parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ", no parent info"
|
1028
|
+
mut_info = f", mutations: {entry['mutations'][:50]}..." if entry['mutations'] and len(entry['mutations']) > 50 else f", mutations: {entry['mutations']}" if entry['mutations'] else ""
|
1029
|
+
context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
|
1030
|
+
|
1031
|
+
context_text = "\n".join(context_lines)
|
1032
|
+
|
1033
|
+
# Process in batches if there are many empty sequences
|
1034
|
+
batch_size = 10
|
1035
|
+
identified_count = 0
|
1036
|
+
|
1037
|
+
for i in range(0, len(empty_seq_entries), batch_size):
|
1038
|
+
batch = empty_seq_entries[i:i+batch_size]
|
1039
|
+
|
1040
|
+
# Create batch request
|
1041
|
+
batch_request = []
|
1042
|
+
for entry in batch:
|
1043
|
+
parent_info = f", parent: {entry['parent_id']}" if entry['parent_id'] else ""
|
1044
|
+
mut_info = f", mutations: {entry['mutations']}" if entry['mutations'] else ""
|
1045
|
+
batch_request.append(f"{entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})")
|
1046
|
+
|
1047
|
+
prompt = f"""
|
1048
|
+
Based on the enzyme lineage information provided, identify which enzyme sequences should be used as the source to calculate sequences for the enzymes without sequences.
|
1049
|
+
|
1050
|
+
{context_text}
|
1051
|
+
|
1052
|
+
For each of these enzymes without sequences, identify which enzyme WITH a sequence should be used as the source:
|
1053
|
+
{chr(10).join(batch_request)}
|
1054
|
+
|
1055
|
+
Instructions:
|
1056
|
+
1. If an enzyme has a parent_id and mutations, suggest using the parent's sequence
|
1057
|
+
2. If an enzyme has no parent_id, look for the most logical ancestor or related enzyme with a sequence
|
1058
|
+
3. Consider the generation numbers and enzyme naming patterns
|
1059
|
+
4. Only suggest enzymes that actually have sequences
|
1060
|
+
|
1061
|
+
Please provide your response in this format:
|
1062
|
+
enzyme_id -> source_enzyme_id
|
1063
|
+
enzyme_id -> source_enzyme_id
|
1064
|
+
...
|
1065
|
+
|
1066
|
+
If you cannot identify a suitable source, use "None" as the source_enzyme_id.
|
1067
|
+
"""
|
1068
|
+
|
1069
|
+
try:
|
1070
|
+
# Save debug information if debug_dir is provided
|
1071
|
+
if debug_dir:
|
1072
|
+
import time
|
1073
|
+
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
1074
|
+
prompt_file = debug_dir / f"sequence_source_{campaign_id_str}_prompt_{timestamp}.txt"
|
1075
|
+
prompt_file.write_text(prompt)
|
1076
|
+
|
1077
|
+
response = model.generate_content(prompt)
|
1078
|
+
response_text = response.text.strip()
|
1079
|
+
|
1080
|
+
# Save response if debug_dir is provided
|
1081
|
+
if debug_dir:
|
1082
|
+
response_file = debug_dir / f"sequence_source_{campaign_id_str}_response_{timestamp}.txt"
|
1083
|
+
response_file.write_text(response_text)
|
1084
|
+
|
1085
|
+
# Parse the response
|
1086
|
+
for line in response_text.split('\n'):
|
1087
|
+
if '->' in line:
|
1088
|
+
parts = line.split('->')
|
1089
|
+
if len(parts) == 2:
|
1090
|
+
target_enzyme = parts[0].strip()
|
1091
|
+
source_enzyme = parts[1].strip()
|
1092
|
+
|
1093
|
+
if source_enzyme and source_enzyme != "None":
|
1094
|
+
# Find the target enzyme in our batch
|
1095
|
+
for entry in batch:
|
1096
|
+
if entry['enzyme_id'] == target_enzyme:
|
1097
|
+
# Verify the source enzyme exists and has a sequence
|
1098
|
+
source_rows = df[df['enzyme_id'] == source_enzyme]
|
1099
|
+
if not source_rows.empty:
|
1100
|
+
source_seq = source_rows.iloc[0]['protein_sequence']
|
1101
|
+
if source_seq and str(source_seq).strip() and str(source_seq) != "nan":
|
1102
|
+
# Update the parent_enzyme_id if it's missing
|
1103
|
+
if not entry['parent_id']:
|
1104
|
+
df.at[entry['idx'], 'parent_enzyme_id'] = source_enzyme
|
1105
|
+
df.at[entry['idx'], 'flag'] = df.at[entry['idx'], 'flag'] + " gemini_suggested_parent"
|
1106
|
+
identified_count += 1
|
1107
|
+
log.info(f"Set {source_enzyme} as parent for {target_enzyme} (Gemini suggestion)")
|
1108
|
+
elif entry['parent_id'] != source_enzyme:
|
1109
|
+
# Log if Gemini suggests a different parent than what's recorded
|
1110
|
+
log.info(f"Gemini suggests {source_enzyme} as source for {target_enzyme}, "
|
1111
|
+
f"but parent is recorded as {entry['parent_id']}")
|
1112
|
+
break
|
1113
|
+
|
1114
|
+
except Exception as e:
|
1115
|
+
log.warning(f"Failed to identify sequence sources for batch {i//batch_size + 1}: {e}")
|
1116
|
+
continue
|
1117
|
+
|
1118
|
+
if identified_count > 0:
|
1119
|
+
log.info(f"Campaign {campaign_id_str}: Successfully identified {identified_count} sequence sources using Gemini")
|
1120
|
+
|
1121
|
+
return df
|
1122
|
+
|
1123
|
+
|
1124
|
+
# === 9. MAIN PROCESSOR === ---------------------------------------------------
|
708
1125
|
|
709
1126
|
class SequenceProcessor:
|
710
1127
|
"""Main processor for handling the complete workflow."""
|
711
1128
|
|
712
|
-
def __init__(self, input_csv: Path, output_csv: Path):
|
1129
|
+
def __init__(self, input_csv: Path, output_csv: Path, debug_dir: Optional[Path] = None, strict_mutation_validation: bool = True):
|
713
1130
|
self.input_csv = input_csv
|
714
1131
|
self.output_csv = output_csv
|
1132
|
+
self.debug_dir = debug_dir
|
1133
|
+
self.strict_mutation_validation = strict_mutation_validation
|
715
1134
|
self.df = None
|
716
1135
|
self.generator = None
|
717
1136
|
|
@@ -732,7 +1151,7 @@ class SequenceProcessor:
|
|
732
1151
|
self.df["flag"] = ""
|
733
1152
|
|
734
1153
|
# Initialize generator
|
735
|
-
self.generator = SequenceGenerator(self.df)
|
1154
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
736
1155
|
|
737
1156
|
def _normalize_columns(self) -> None:
|
738
1157
|
"""Automatically detect and normalize column names from different formats."""
|
@@ -994,7 +1413,7 @@ class SequenceProcessor:
|
|
994
1413
|
self.df = self.df[campaign_mask].copy()
|
995
1414
|
|
996
1415
|
# Rebuild relationships for this campaign
|
997
|
-
self.generator = SequenceGenerator(self.df)
|
1416
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
998
1417
|
|
999
1418
|
# Flag complex mutations
|
1000
1419
|
self.flag_complex_mutations()
|
@@ -1010,7 +1429,7 @@ class SequenceProcessor:
|
|
1010
1429
|
self.df = identify_parents_with_gemini(self.df)
|
1011
1430
|
|
1012
1431
|
# Rebuild relationships after parent identification
|
1013
|
-
self.generator = SequenceGenerator(self.df)
|
1432
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
1014
1433
|
|
1015
1434
|
# Try to fill sequences again after parent identification
|
1016
1435
|
log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
|
@@ -1024,6 +1443,50 @@ class SequenceProcessor:
|
|
1024
1443
|
|
1025
1444
|
log.info(f"Completed campaign: {campaign_id}")
|
1026
1445
|
|
1446
|
+
# After processing all campaigns, check for any remaining empty sequences
|
1447
|
+
# and use Gemini to identify sequence sources (including cross-campaign relationships)
|
1448
|
+
empty_count = sum(self.df["protein_sequence"].str.strip() == "")
|
1449
|
+
total_count = len(self.df)
|
1450
|
+
|
1451
|
+
if empty_count > 0:
|
1452
|
+
log.info(f"Found {empty_count}/{total_count} empty sequences after initial processing. "
|
1453
|
+
"Using Gemini to identify sequence sources (including cross-campaign relationships)...")
|
1454
|
+
self.df = identify_sequence_sources_with_gemini(self.df, self.debug_dir)
|
1455
|
+
|
1456
|
+
# Process campaigns again after identifying new parent relationships
|
1457
|
+
log.info("Reprocessing campaigns after sequence source identification...")
|
1458
|
+
|
1459
|
+
for campaign_id in campaigns:
|
1460
|
+
if pd.isna(campaign_id):
|
1461
|
+
campaign_id = "unknown"
|
1462
|
+
|
1463
|
+
log.info(f"Reprocessing campaign: {campaign_id}")
|
1464
|
+
|
1465
|
+
# Filter data for this campaign
|
1466
|
+
campaign_mask = self.df['campaign_id'] == campaign_id
|
1467
|
+
if pd.isna(campaign_id):
|
1468
|
+
campaign_mask = self.df['campaign_id'].isna()
|
1469
|
+
|
1470
|
+
# Store original dataframe
|
1471
|
+
original_df = self.df
|
1472
|
+
|
1473
|
+
# Process only this campaign's data
|
1474
|
+
self.df = self.df[campaign_mask].copy()
|
1475
|
+
|
1476
|
+
# Rebuild relationships for this campaign
|
1477
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
1478
|
+
|
1479
|
+
# Try to fill sequences again
|
1480
|
+
self.process_remaining()
|
1481
|
+
|
1482
|
+
# Update the original dataframe with results
|
1483
|
+
original_df.loc[campaign_mask, :] = self.df
|
1484
|
+
|
1485
|
+
# Restore original dataframe
|
1486
|
+
self.df = original_df
|
1487
|
+
|
1488
|
+
log.info(f"Completed reprocessing campaign: {campaign_id}")
|
1489
|
+
|
1027
1490
|
# Save results
|
1028
1491
|
self.save_results()
|
1029
1492
|
|
@@ -1072,6 +1535,16 @@ def main(argv: Optional[List[str]] = None) -> None:
|
|
1072
1535
|
default=0,
|
1073
1536
|
help="Increase verbosity (use -vv for debug output)"
|
1074
1537
|
)
|
1538
|
+
parser.add_argument(
|
1539
|
+
"--debug-dir",
|
1540
|
+
type=Path,
|
1541
|
+
help="Directory to save debug information (Gemini prompts and responses)"
|
1542
|
+
)
|
1543
|
+
parser.add_argument(
|
1544
|
+
"--allow-mutation-mismatches",
|
1545
|
+
action="store_true",
|
1546
|
+
help="Allow sequence generation even when mutations don't match (default: strict validation)"
|
1547
|
+
)
|
1075
1548
|
|
1076
1549
|
args = parser.parse_args(argv)
|
1077
1550
|
|
@@ -1079,7 +1552,13 @@ def main(argv: Optional[List[str]] = None) -> None:
|
|
1079
1552
|
setup_logging(args.verbose)
|
1080
1553
|
|
1081
1554
|
# Process the data (format detection is automatic)
|
1082
|
-
|
1555
|
+
strict_validation = not args.allow_mutation_mismatches
|
1556
|
+
processor = SequenceProcessor(
|
1557
|
+
args.input_csv,
|
1558
|
+
args.output_csv,
|
1559
|
+
getattr(args, 'debug_dir', None),
|
1560
|
+
strict_mutation_validation=strict_validation
|
1561
|
+
)
|
1083
1562
|
processor.run()
|
1084
1563
|
|
1085
1564
|
|