debase 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +656 -27
- debase/enzyme_lineage_extractor.py +1077 -109
- debase/lineage_format.py +221 -12
- debase/reaction_info_extractor.py +133 -23
- debase/substrate_scope_extractor.py +49 -2
- debase/wrapper.py +155 -151
- debase-0.4.4.dist-info/METADATA +121 -0
- debase-0.4.4.dist-info/RECORD +16 -0
- debase-0.4.2.dist-info/METADATA +0 -296
- debase-0.4.2.dist-info/RECORD +0 -16
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/WHEEL +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/entry_points.txt +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.2.dist-info → debase-0.4.4.dist-info}/top_level.txt +0 -0
debase/cleanup_sequence.py
CHANGED
@@ -11,6 +11,7 @@ Usage:
|
|
11
11
|
|
12
12
|
import argparse
|
13
13
|
import logging
|
14
|
+
import os
|
14
15
|
import re
|
15
16
|
import sys
|
16
17
|
from dataclasses import dataclass, field
|
@@ -19,11 +20,20 @@ from typing import Dict, List, Optional, Set, Tuple, Union
|
|
19
20
|
|
20
21
|
import pandas as pd
|
21
22
|
|
23
|
+
try:
|
24
|
+
import google.generativeai as genai # type: ignore
|
25
|
+
GEMINI_OK = True
|
26
|
+
except ImportError: # pragma: no cover
|
27
|
+
GEMINI_OK = False
|
28
|
+
|
22
29
|
|
23
30
|
# === 1. CONFIGURATION & CONSTANTS === ----------------------------------------
|
24
31
|
|
25
32
|
VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY*") # Include * for stop codons
|
26
33
|
|
34
|
+
# Gemini API configuration
|
35
|
+
GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
|
36
|
+
|
27
37
|
# Configure module logger
|
28
38
|
log = logging.getLogger(__name__)
|
29
39
|
|
@@ -193,12 +203,17 @@ class SequenceManipulator:
|
|
193
203
|
return 0 if zero_matches >= one_matches else 1
|
194
204
|
|
195
205
|
@classmethod
|
196
|
-
def apply_mutations(cls, parent_seq: str, mutation_str: str) -> str:
|
197
|
-
"""Apply mutations to a parent sequence.
|
206
|
+
def apply_mutations(cls, parent_seq: str, mutation_str: str) -> Tuple[str, bool]:
|
207
|
+
"""Apply mutations to a parent sequence.
|
208
|
+
|
209
|
+
Returns:
|
210
|
+
Tuple[str, bool]: (resulting_sequence, all_mutations_applied_successfully)
|
211
|
+
"""
|
198
212
|
if not parent_seq:
|
199
|
-
return ""
|
213
|
+
return "", True
|
200
214
|
|
201
215
|
seq = list(parent_seq)
|
216
|
+
all_mutations_successful = True
|
202
217
|
|
203
218
|
# Apply point mutations
|
204
219
|
mutations = MutationParser.parse_mutations(mutation_str)
|
@@ -207,19 +222,26 @@ class SequenceManipulator:
|
|
207
222
|
|
208
223
|
for mut in mutations:
|
209
224
|
idx = mut.position - idx_offset
|
225
|
+
mutation_applied = False
|
226
|
+
|
210
227
|
# Try primary index
|
211
228
|
if 0 <= idx < len(seq) and seq[idx].upper() == mut.original.upper():
|
212
229
|
seq[idx] = mut.replacement
|
230
|
+
mutation_applied = True
|
213
231
|
else:
|
214
232
|
# Try alternate index
|
215
233
|
alt_idx = mut.position - (1 - idx_offset)
|
216
234
|
if 0 <= alt_idx < len(seq) and seq[alt_idx].upper() == mut.original.upper():
|
217
235
|
seq[alt_idx] = mut.replacement
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
236
|
+
mutation_applied = True
|
237
|
+
|
238
|
+
if not mutation_applied:
|
239
|
+
log.error(
|
240
|
+
f"MUTATION MISMATCH: {mut} does not match parent sequence at "
|
241
|
+
f"position {mut.position} (tried both 0- and 1-based indexing). "
|
242
|
+
f"Parent has {seq[idx] if 0 <= idx < len(seq) else 'out-of-bounds'} at position {mut.position}"
|
243
|
+
)
|
244
|
+
all_mutations_successful = False
|
223
245
|
|
224
246
|
# Apply complex C-terminal mutations
|
225
247
|
complex_mut = MutationParser.parse_complex_c_terminal(mutation_str)
|
@@ -242,12 +264,13 @@ class SequenceManipulator:
|
|
242
264
|
if complex_mut.extension_seq:
|
243
265
|
seq.extend(list(complex_mut.extension_seq))
|
244
266
|
else:
|
245
|
-
log.
|
246
|
-
f"Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
|
267
|
+
log.error(
|
268
|
+
f"COMPLEX MUTATION MISMATCH: Invalid C-terminal mutation positions: {complex_mut.start_pos}-"
|
247
269
|
f"{complex_mut.end_pos} for sequence of length {len(seq)}"
|
248
270
|
)
|
271
|
+
all_mutations_successful = False
|
249
272
|
|
250
|
-
return "".join(seq)
|
273
|
+
return "".join(seq), all_mutations_successful
|
251
274
|
|
252
275
|
@classmethod
|
253
276
|
def reverse_mutations(cls, child_seq: str, mutation_str: str) -> str:
|
@@ -390,10 +413,11 @@ class LineageNavigator:
|
|
390
413
|
class SequenceGenerator:
|
391
414
|
"""Main class for generating protein sequences from mutations."""
|
392
415
|
|
393
|
-
def __init__(self, df: pd.DataFrame):
|
416
|
+
def __init__(self, df: pd.DataFrame, strict_mutation_validation: bool = True):
    """Bind the generator to *df* and prepare its helper objects.

    Args:
        df: Lineage table the generator reads from.
        strict_mutation_validation: Stored on the instance; when True,
            ``generate_from_parent`` refuses to return a sequence whose
            mutations did not all match the parent.
    """
    # Plain configuration first, collaborators second.
    self.strict_mutation_validation = strict_mutation_validation
    self.df = df
    self.manipulator = SequenceManipulator()
    self.navigator = LineageNavigator(df)
    # Must run last: it is called on the fully initialised instance.
    self._update_ground_truths()
|
398
422
|
|
399
423
|
def _update_ground_truths(self) -> None:
|
@@ -464,19 +488,62 @@ class SequenceGenerator:
|
|
464
488
|
parent_id: str
|
465
489
|
) -> Optional[SequenceGenerationResult]:
|
466
490
|
"""Generate sequence by applying mutations to parent."""
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
if not parent_seq:
|
491
|
+
# Get the variant to find its campaign
|
492
|
+
variant_rows = self.df[self.df["enzyme_id"] == variant_id]
|
493
|
+
if variant_rows.empty:
|
471
494
|
return None
|
472
495
|
|
473
|
-
variant_row =
|
496
|
+
variant_row = variant_rows.iloc[0]
|
497
|
+
variant_campaign = variant_row.get("campaign_id", "")
|
474
498
|
mutations = variant_row.get("mutations", "")
|
475
499
|
|
476
500
|
if not mutations:
|
477
501
|
return None
|
478
502
|
|
479
|
-
|
503
|
+
# Find parent in the same campaign first
|
504
|
+
parent_rows = self.df[
|
505
|
+
(self.df["enzyme_id"] == parent_id) &
|
506
|
+
(self.df["campaign_id"] == variant_campaign)
|
507
|
+
]
|
508
|
+
|
509
|
+
# If not found in same campaign, fall back to any parent with that ID
|
510
|
+
if parent_rows.empty:
|
511
|
+
parent_rows = self.df[self.df["enzyme_id"] == parent_id]
|
512
|
+
if not parent_rows.empty:
|
513
|
+
log.warning(f"Parent {parent_id} not found in same campaign {variant_campaign} for variant {variant_id}, using parent from different campaign")
|
514
|
+
|
515
|
+
if parent_rows.empty:
|
516
|
+
log.error(f"Parent {parent_id} not found for variant {variant_id}")
|
517
|
+
return None
|
518
|
+
|
519
|
+
parent_row = parent_rows.iloc[0]
|
520
|
+
parent_seq = parent_row.get("protein_sequence", "")
|
521
|
+
parent_campaign = parent_row.get("campaign_id", "")
|
522
|
+
|
523
|
+
if not parent_seq:
|
524
|
+
return None
|
525
|
+
|
526
|
+
# Log which parent sequence is being used
|
527
|
+
if parent_campaign != variant_campaign:
|
528
|
+
log.info(f"Using parent {parent_id} from campaign {parent_campaign} for variant {variant_id} in campaign {variant_campaign}")
|
529
|
+
else:
|
530
|
+
log.info(f"Using parent {parent_id} from same campaign {variant_campaign} for variant {variant_id}")
|
531
|
+
|
532
|
+
sequence, mutations_successful = self.manipulator.apply_mutations(parent_seq, mutations)
|
533
|
+
|
534
|
+
if not mutations_successful:
|
535
|
+
# Check if this might be an exact match case (mutations already present in parent)
|
536
|
+
# This happens when an enzyme from another campaign is identified as both parent and exact match
|
537
|
+
if parent_id == variant_id or (mutations and parent_seq == sequence):
|
538
|
+
log.info(f"Detected exact match scenario for {variant_id} - using parent sequence directly")
|
539
|
+
sequence = parent_seq
|
540
|
+
mutations_successful = True
|
541
|
+
elif self.strict_mutation_validation:
|
542
|
+
log.error(f"STRICT MODE: Failed to apply mutations for {variant_id}: mutation mismatch detected. Not populating sequence to prevent incorrect data.")
|
543
|
+
return None
|
544
|
+
else:
|
545
|
+
log.warning(f"Mutation mismatch for {variant_id}, but proceeding with generated sequence (strict_mutation_validation=False)")
|
546
|
+
# Continue with the sequence even if mutations failed
|
480
547
|
|
481
548
|
return SequenceGenerationResult(
|
482
549
|
sequence=sequence,
|
@@ -538,10 +605,14 @@ class SequenceGenerator:
|
|
538
605
|
|
539
606
|
# Generate based on direction
|
540
607
|
if direction == "up" and parent_id and mutations:
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
608
|
+
# Always try the declared parent first
|
609
|
+
result = self.generate_from_parent(variant_id, parent_id)
|
610
|
+
if result:
|
611
|
+
return result
|
612
|
+
|
613
|
+
# If declared parent fails, try the ground truth (if different)
|
614
|
+
if gt_id != parent_id:
|
615
|
+
log.info(f"Declared parent {parent_id} failed for {variant_id}, trying ground truth {gt_id}")
|
545
616
|
result = self.generate_from_parent(variant_id, gt_id)
|
546
617
|
if result:
|
547
618
|
result.confidence = 0.7
|
@@ -565,14 +636,501 @@ class SequenceGenerator:
|
|
565
636
|
return None
|
566
637
|
|
567
638
|
|
568
|
-
# === 7.
|
639
|
+
# === 7. GEMINI PARENT IDENTIFICATION === ------------------------------------
|
640
|
+
|
641
|
+
def _normalize_gemini_parent_answer(raw_answer: str, known_ids: Set[str]) -> str:
    """Return the enzyme ID named in a ``Parent: ...`` answer line.

    The verbatim answer wins when it is a known enzyme ID; otherwise the
    decoration models commonly add (brackets, quotes, backticks, asterisks)
    is stripped from both ends.
    """
    answer = raw_answer.strip()
    if answer in known_ids:
        return answer
    return answer.strip(" \t[]\"'`*")


def identify_parents_with_gemini(df: pd.DataFrame, model_name: str = "gemini-1.5-flash") -> pd.DataFrame:
    """Use Gemini API to identify parent enzymes for entries with missing parent information.

    Only rows that have an enzyme ID, no protein sequence and no
    ``parent_enzyme_id`` are sent to the model. A suggestion is accepted only
    when it names a different enzyme that exists in ``df``.

    Args:
        df: Lineage table; read via the ``enzyme_id``, ``campaign_id``,
            ``generation``, ``protein_sequence`` and ``parent_enzyme_id``
            columns. ``parent_enzyme_id`` is updated in place.
        model_name: Gemini model to query.

    Returns:
        The same ``df`` object (returned unchanged when Gemini is unavailable,
        unconfigured, or nothing needs identification).
    """
    if not GEMINI_OK:
        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
        return df

    if not GEMINI_API_KEY:
        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
        return df

    try:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(model_name)
    except Exception as e:
        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
        return df

    def is_blank(value: object) -> bool:
        # NaN/None cells arrive here as the strings "nan"/"None" after str().
        text = str(value).strip()
        return not text or text.lower() in ("nan", "none")

    # One pass builds both the work list and a per-campaign candidate index.
    # Indexing per campaign keeps an enzyme ID that is reused in several
    # campaigns visible in each of them.
    entries_needing_parents = []
    enzymes_by_campaign: Dict[str, Dict[str, dict]] = {}
    known_ids: Set[str] = set()

    for idx, row in df.iterrows():
        enzyme_id = str(row.get("enzyme_id", ""))
        if not enzyme_id or enzyme_id.lower() == "nan":
            # A row without an ID can neither be a parent nor be asked about.
            continue

        campaign_id = str(row.get("campaign_id", ""))
        generation = str(row.get("generation", ""))
        has_sequence = not is_blank(row.get("protein_sequence", ""))

        known_ids.add(enzyme_id)
        enzymes_by_campaign.setdefault(campaign_id, {})[enzyme_id] = {
            "has_sequence": has_sequence,
            "generation": generation,
        }

        # Only process entries that have empty sequences AND no parent info
        if not has_sequence and is_blank(row.get("parent_enzyme_id", "")):
            entries_needing_parents.append({
                "idx": idx,
                "enzyme_id": enzyme_id,
                "campaign_id": campaign_id,
                "generation": generation,
            })

    if not entries_needing_parents:
        log.info("No entries need parent identification from Gemini")
        return df

    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")

    identified_count = 0
    for entry in entries_needing_parents:
        enzyme_id = entry["enzyme_id"]
        campaign_id = entry["campaign_id"]
        generation = entry["generation"]

        # Create context for Gemini
        context_info = [f"Enzyme ID: {enzyme_id}", f"Campaign ID: {campaign_id}"]
        if not is_blank(generation):
            context_info.append(f"Generation: {generation}")

        # Candidates from the same campaign; the enzyme itself is left out so
        # the model is not invited to name it as its own parent.
        campaign_enzymes = []
        for enz_id, enz_data in enzymes_by_campaign.get(campaign_id, {}).items():
            if enz_id == enzyme_id:
                continue
            status = "with sequence" if enz_data["has_sequence"] else "without sequence"
            gen_info = "" if is_blank(enz_data["generation"]) else f"(gen {enz_data['generation']})"
            campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")

        if campaign_enzymes:
            context_info.append("Available enzymes in same campaign:")
            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context

        context_text = "\n".join(context_info)

        prompt = f"""
Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?

{context_text}

This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?

Please provide your response in this format:
Parent: [parent_enzyme_id or "Unknown"]

If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
"""

        try:
            response = model.generate_content(prompt)
            response_text = response.text.strip()

            # Parse the response
            parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
            if not parent_match:
                continue

            parent = _normalize_gemini_parent_answer(parent_match.group(1), known_ids)
            if not parent or parent.lower() in ("unknown", "no parent identified"):
                continue

            if parent == enzyme_id:
                # Accepting this would create a one-node cycle in the lineage.
                log.warning(f"Gemini suggested {enzyme_id} as its own parent; ignoring")
                continue

            # Verify the parent exists in our available enzymes
            if parent in known_ids:
                df.at[entry["idx"], "parent_enzyme_id"] = parent
                identified_count += 1
                log.info(f"Identified parent for {enzyme_id}: {parent}")
            else:
                log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")

        except Exception as e:
            # Best effort: one failed query must not stop the remaining ones.
            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
            continue

    if identified_count > 0:
        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
    else:
        log.info("No parent enzymes were identified using Gemini API")

    return df
|
766
|
+
|
767
|
+
|
768
|
+
# === 8. SEQUENCE SOURCE IDENTIFICATION === -----------------------------------
|
769
|
+
|
770
|
+
def _seqsrc_is_blank(value: object) -> bool:
    """Return True when *value* is empty or a stringified NaN/None placeholder."""
    text = str(value).strip()
    return not text or text.lower() in ("nan", "none")


def _seqsrc_append_flag(df: pd.DataFrame, idx, tag: str) -> None:
    """Append ``" " + tag`` to the row's ``flag`` cell, tolerating a missing column or NaN cell."""
    if 'flag' not in df.columns:
        df['flag'] = ""
    current = df.at[idx, 'flag']
    prefix = "" if current is None or pd.isna(current) else str(current)
    df.at[idx, 'flag'] = f"{prefix} {tag}"


def _seqsrc_write_debug(debug_dir: Optional[Path], filename: str, text: str) -> None:
    """Write *text* under *debug_dir*; a failed debug write is logged, never raised."""
    if not debug_dir:
        return
    # Campaign IDs are interpolated into the name; keep them from acting as path separators.
    safe_name = re.sub(r'[\\/]+', '_', filename)
    try:
        (debug_dir / safe_name).write_text(text)
    except OSError as e:
        log.warning(f"Could not write debug file {safe_name}: {e}")


def _seqsrc_find_sequence(df: pd.DataFrame, enzyme_id: str, preferred_campaign: str = "") -> Optional[str]:
    """Look up a usable protein sequence for *enzyme_id*.

    Returns None when no row carries that ID, "" when rows exist but none has a
    sequence, otherwise the first non-blank sequence (rows from
    *preferred_campaign* are tried first).
    """
    rows = df[df['enzyme_id'] == enzyme_id]
    if rows.empty:
        return None

    preferred: List[str] = []
    others: List[str] = []
    for campaign, sequence in zip(rows['campaign_id'], rows['protein_sequence']):
        text = str(sequence).strip()
        if _seqsrc_is_blank(text):
            continue
        if preferred_campaign and str(campaign) == str(preferred_campaign):
            preferred.append(text)
        else:
            others.append(text)

    if preferred:
        return preferred[0]
    return others[0] if others else ""


def _seqsrc_describe_entry(entry: dict, for_context: bool) -> str:
    """Render one sequence-less entry as ``id (gen G, parent: P, mutations: M)``.

    With *for_context* a missing parent is spelled out and long mutation
    strings are truncated to 50 characters; without it both are left as is.
    """
    parent_id = entry['parent_id']
    mutations = entry['mutations']

    if parent_id:
        parent_info = f", parent: {parent_id}"
    else:
        parent_info = ", no parent info" if for_context else ""

    if not mutations:
        mut_info = ""
    elif for_context and len(mutations) > 50:
        mut_info = f", mutations: {mutations[:50]}..."
    else:
        mut_info = f", mutations: {mutations}"

    return f"{entry['enzyme_id']} (gen {entry['generation']}{parent_info}{mut_info})"


def _seqsrc_strip_code_fence(text: str) -> str:
    """Return the payload of the first Markdown code fence in *text*, or *text* itself."""
    if '```json' in text:
        return text.split('```json')[1].split('```')[0].strip()
    if '```' in text:
        return text.split('```')[1].split('```')[0].strip()
    return text


def _seqsrc_apply_seed(df: pd.DataFrame, seed_info: dict, empty_seq_entries: List[dict], campaign_id_str: str) -> None:
    """Write the sequence described by Gemini's ``seed_enzyme`` answer into ``df``."""
    target_enzyme_id = seed_info.get('target_enzyme_id', '')
    relationship_type = str(seed_info.get('relationship_type', '')).upper()
    source_info = seed_info.get('source', {}) or {}
    source_enzyme_id = source_info.get('enzyme_id', '')
    source_campaign_id = source_info.get('campaign_id', '')
    reason = seed_info.get('reason', '')
    try:
        confidence = float(seed_info.get('confidence', 0.5))
    except (TypeError, ValueError):
        # The value is only logged; a malformed one must not discard the seed.
        confidence = 0.5

    log.info(f"Campaign {campaign_id_str}: Found seed - {target_enzyme_id} from {source_enzyme_id} ({relationship_type}, confidence: {confidence})")
    log.info(f"Reason: {reason}")

    if not source_enzyme_id:
        return

    # Find the source enzyme's sequence, preferring the campaign Gemini named.
    source_sequence = _seqsrc_find_sequence(df, source_enzyme_id, str(source_campaign_id or ""))
    if source_sequence is None:
        log.warning(f"Source enzyme {source_enzyme_id} not found in dataframe")
        return
    if not source_sequence:
        log.warning(f"Source enzyme {source_enzyme_id} has no sequence")
        return

    # Find the target enzyme in our empty list
    seed_found = False
    for entry in empty_seq_entries:
        if entry['enzyme_id'] != target_enzyme_id:
            continue

        if relationship_type == "EXACT_MATCH":
            # Exact match - copy sequence directly
            df.at[entry['idx'], 'protein_sequence'] = source_sequence
            _seqsrc_append_flag(df, entry['idx'], "gemini_cross_campaign_seed_exact")
            log.info(f"Set seed sequence for {target_enzyme_id} from exact match {source_enzyme_id} (length: {len(source_sequence)})")
            seed_found = True

        elif relationship_type == "BEST_PARENT":
            # Parent relationship - apply mutations to get the target sequence.
            # 'mutations' is None for NaN cells, hence the `or ''`.
            target_mutations = (entry.get('mutations') or '').strip()
            if target_mutations:
                manipulator = SequenceManipulator()
                mutated_sequence, success = manipulator.apply_mutations(source_sequence, target_mutations)

                if success:
                    df.at[entry['idx'], 'protein_sequence'] = mutated_sequence
                    _seqsrc_append_flag(df, entry['idx'], "gemini_cross_campaign_seed_parent")
                    log.info(f"Set seed sequence for {target_enzyme_id} by applying mutations {target_mutations} to parent {source_enzyme_id} (length: {len(mutated_sequence)})")
                    seed_found = True
                else:
                    log.warning(f"Failed to apply mutations {target_mutations} to parent {source_enzyme_id} for {target_enzyme_id}")
            else:
                # No mutations - use parent sequence directly
                df.at[entry['idx'], 'protein_sequence'] = source_sequence
                _seqsrc_append_flag(df, entry['idx'], "gemini_cross_campaign_seed_parent_no_mutations")
                log.info(f"Set seed sequence for {target_enzyme_id} from parent {source_enzyme_id} (no mutations, length: {len(source_sequence)})")
                seed_found = True
        break

    if seed_found:
        log.info(f"Campaign {campaign_id_str}: Successfully set cross-campaign seed. Local processing will handle the rest.")
    else:
        log.warning(f"Campaign {campaign_id_str}: Could not find target enzyme {target_enzyme_id} in empty list")


def _seqsrc_seed_empty_campaign(df: pd.DataFrame, model, campaigns, campaign_id, campaign_id_str: str,
                                empty_seq_entries: List[dict], debug_dir: Optional[Path]) -> bool:
    """Try to bootstrap a campaign that has no sequences from ONE enzyme of another campaign.

    Returns False when the caller should move on to the next campaign (no
    donor sequences exist, or the query failed), True when it should go on to
    the per-campaign source query.
    """
    import json
    import time

    empty_count = len(empty_seq_entries)

    # Get all enzymes with sequences from OTHER campaigns
    other_campaigns_with_seqs = []
    for other_campaign in campaigns:
        if other_campaign == campaign_id or pd.isna(other_campaign):
            continue
        for _, row in df[df['campaign_id'] == other_campaign].iterrows():
            protein_sequence = str(row.get("protein_sequence", "")).strip()
            if _seqsrc_is_blank(protein_sequence):
                continue
            other_campaigns_with_seqs.append({
                "enzyme_id": str(row.get("enzyme_id", "")),
                "campaign_id": str(other_campaign),
                "generation": str(row.get("generation", "")),
                # Keep the sequence of THIS row; a later lookup by ID alone
                # could land on a same-named row from another campaign.
                "sequence": protein_sequence,
            })

    if not other_campaigns_with_seqs:
        log.info(f"Campaign {campaign_id_str}: No sequences found in other campaigns to use as cross-campaign parents")
        return False

    # Create context for cross-campaign analysis
    context_lines = [
        f"Empty Campaign: {campaign_id_str} (all {empty_count} enzymes need sequences)",
        "\nEnzymes in empty campaign:",
    ]
    for entry in empty_seq_entries[:10]:  # Limit for context
        context_lines.append(f" - {_seqsrc_describe_entry(entry, for_context=True)}")

    context_lines.append(f"\nEnzymes with sequences from OTHER campaigns ({len(other_campaigns_with_seqs)}):")
    for entry in other_campaigns_with_seqs[:15]:  # Limit for context
        context_lines.append(f" - {entry['enzyme_id']} from {entry['campaign_id']} (gen {entry['generation']}, sequence: {entry['sequence']})")

    context_text = "\n".join(context_lines)

    # Find ONE good cross-campaign seed to bootstrap this campaign
    log.info(f"Campaign {campaign_id_str}: Looking for ONE cross-campaign seed to bootstrap sequences...")

    prompt = f"""
Based on enzyme names, identify the SINGLE BEST seed enzyme from other campaigns to bootstrap the empty campaign.

{context_text}

From the enzymes in the EMPTY campaign, identify which ONE has the clearest match in OTHER campaigns.
Prioritize:
1. EXACT name matches (highest priority)
2. Simplest parent relationships (e.g., an enzyme that differs by only 1-2 mutations)
3. Earliest generation enzymes (lower generation numbers are better seeds)

Return your response as a JSON dictionary with this exact format:
{{
"seed_enzyme": {{
"target_enzyme_id": "the enzyme ID in the empty campaign",
"relationship_type": "EXACT_MATCH" or "BEST_PARENT",
"source": {{
"campaign_id": "the campaign ID",
"enzyme_id": "the enzyme ID WITHOUT campaign suffix"
}},
"confidence": 0.1 to 1.0,
"reason": "brief explanation of why this is the best seed"
}}
}}

Return ONLY valid JSON with information about the SINGLE BEST seed enzyme.
"""

    try:
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        _seqsrc_write_debug(debug_dir, f"cross_campaign_seed_{campaign_id_str}_prompt_{timestamp}.txt", prompt)

        response = model.generate_content(prompt)
        response_text = response.text.strip()

        _seqsrc_write_debug(debug_dir, f"cross_campaign_seed_{campaign_id_str}_response_{timestamp}.txt", response_text)

        # Parse the JSON response (models often wrap it in a Markdown fence)
        response_text = _seqsrc_strip_code_fence(response_text)
        try:
            seed_data = json.loads(response_text)
        except json.JSONDecodeError as e:
            log.warning(f"Failed to parse JSON response for cross-campaign seed: {e}")
            log.debug(f"Response text: {response_text}")
            return True

        seed_info = seed_data.get('seed_enzyme', {})
        if seed_info:
            _seqsrc_apply_seed(df, seed_info, empty_seq_entries, campaign_id_str)

    except Exception as e:
        log.warning(f"Failed to identify cross-campaign seed for {campaign_id_str}: {e}")
        return False

    return True


def identify_sequence_sources_with_gemini(df: pd.DataFrame, debug_dir: Optional[Path] = None,
                                          model_name: str = "gemini-1.5-flash") -> pd.DataFrame:
    """Use Gemini API to identify which parent sequences to use for entries with missing sequences.

    Campaigns are processed one at a time. A campaign in which every sequence
    is missing is first seeded from a single enzyme of another campaign; then,
    in batches of 10, Gemini is asked which enzyme should act as sequence
    source for each sequence-less entry. A suggestion only fills in
    ``parent_enzyme_id`` when the entry has none and the suggested source has a
    sequence in ``df``.

    Args:
        df: Lineage table; modified in place (``protein_sequence``,
            ``parent_enzyme_id`` and ``flag``).
        debug_dir: Optional directory receiving every prompt/response pair.
        model_name: Gemini model to query.

    Returns:
        The same ``df`` object.
    """
    import time

    if not GEMINI_OK:
        log.warning("Gemini API not available (missing google.generativeai). Skipping sequence source identification.")
        return df

    if not GEMINI_API_KEY:
        log.warning("GEMINI_API_KEY not set. Skipping sequence source identification.")
        return df

    try:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(model_name)
    except Exception as e:
        log.warning(f"Failed to configure Gemini API: {e}. Skipping sequence source identification.")
        return df

    # Group by campaign to process each campaign separately
    campaigns = df['campaign_id'].unique()

    for campaign_id in campaigns:
        if pd.isna(campaign_id):
            campaign_mask = df['campaign_id'].isna()
            campaign_id_str = "unknown"
            preferred_campaign = ""
        else:
            campaign_mask = df['campaign_id'] == campaign_id
            campaign_id_str = str(campaign_id)
            preferred_campaign = campaign_id_str

        campaign_df = df[campaign_mask]

        # Split the campaign into entries lacking a sequence and entries having one
        empty_seq_entries = []
        available_seq_entries = []

        for idx, row in campaign_df.iterrows():
            enzyme_id = str(row.get("enzyme_id", ""))
            protein_sequence = str(row.get("protein_sequence", "")).strip()
            generation = str(row.get("generation", ""))

            if _seqsrc_is_blank(protein_sequence):
                parent_id = str(row.get("parent_enzyme_id", "")).strip()
                mutations = str(row.get("mutations", "")).strip()
                empty_seq_entries.append({
                    "idx": idx,
                    "enzyme_id": enzyme_id,
                    "parent_id": None if _seqsrc_is_blank(parent_id) else parent_id,
                    "mutations": None if _seqsrc_is_blank(mutations) else mutations,
                    "generation": generation,
                })
            else:
                available_seq_entries.append({
                    "enzyme_id": enzyme_id,
                    "generation": generation,
                    "seq_length": len(protein_sequence),
                })

        # Skip if no empty sequences
        if not empty_seq_entries:
            continue

        total_entries = len(campaign_df)
        empty_count = len(empty_seq_entries)

        log.info(f"Campaign {campaign_id_str}: {empty_count}/{total_entries} entries have empty sequences")

        if empty_count == total_entries:
            # All sequences are empty - try to find cross-campaign relationships
            log.info(f"Campaign {campaign_id_str}: All sequences are empty ({empty_count}/{total_entries}). "
                     f"Searching for cross-campaign parent relationships...")
            if not _seqsrc_seed_empty_campaign(df, model, campaigns, campaign_id, campaign_id_str,
                                               empty_seq_entries, debug_dir):
                continue

        log.info(f"Campaign {campaign_id_str}: Found {empty_count}/{total_entries} entries with empty sequences. "
                 f"Querying Gemini for sequence sources...")

        # Create context for Gemini
        context_lines = [f"Campaign: {campaign_id_str}"]
        context_lines.append(f"\nEnzymes WITH sequences ({len(available_seq_entries)}):")
        for entry in available_seq_entries[:15]:  # Limit to first 15 for context
            context_lines.append(f" - {entry['enzyme_id']} (gen {entry['generation']}, {entry['seq_length']} aa)")

        context_lines.append(f"\nEnzymes WITHOUT sequences ({len(empty_seq_entries)}):")
        for entry in empty_seq_entries[:15]:  # Limit to first 15 for context
            context_lines.append(f" - {_seqsrc_describe_entry(entry, for_context=True)}")

        context_text = "\n".join(context_lines)

        # Process in batches if there are many empty sequences
        batch_size = 10
        identified_count = 0

        for i in range(0, len(empty_seq_entries), batch_size):
            batch = empty_seq_entries[i:i+batch_size]
            batch_number = i // batch_size + 1

            batch_text = "\n".join(_seqsrc_describe_entry(entry, for_context=False) for entry in batch)

            prompt = f"""
Based on the enzyme lineage information provided, identify which enzyme sequences should be used as the source to calculate sequences for the enzymes without sequences.

{context_text}

For each of these enzymes without sequences, identify which enzyme WITH a sequence should be used as the source:
{batch_text}

Instructions:
1. If an enzyme has a parent_id and mutations, suggest using the parent's sequence
2. If an enzyme has no parent_id, look for the most logical ancestor or related enzyme with a sequence
3. Consider the generation numbers and enzyme naming patterns
4. Only suggest enzymes that actually have sequences

Please provide your response in this format:
enzyme_id -> source_enzyme_id
enzyme_id -> source_enzyme_id
...

If you cannot identify a suitable source, use "None" as the source_enzyme_id.
"""

            try:
                # The batch number keeps batches sent within the same second
                # from overwriting each other's debug files.
                timestamp = time.strftime("%Y%m%d_%H%M%S")
                _seqsrc_write_debug(debug_dir, f"sequence_source_{campaign_id_str}_batch{batch_number}_prompt_{timestamp}.txt", prompt)

                response = model.generate_content(prompt)
                response_text = response.text.strip()

                _seqsrc_write_debug(debug_dir, f"sequence_source_{campaign_id_str}_batch{batch_number}_response_{timestamp}.txt", response_text)

                # First entry wins for a repeated ID, like a linear scan of the batch.
                batch_by_id: Dict[str, dict] = {}
                for entry in batch:
                    batch_by_id.setdefault(entry['enzyme_id'], entry)

                # Parse the response
                for line in response_text.split('\n'):
                    if '->' not in line:
                        continue
                    parts = line.split('->')
                    if len(parts) != 2:
                        continue

                    target_enzyme = parts[0].strip()
                    source_enzyme = parts[1].strip()
                    if not source_enzyme or source_enzyme.lower() == "none":
                        continue

                    # Find the target enzyme in our batch (retry without list
                    # bullets/backticks the model may have added).
                    entry = batch_by_id.get(target_enzyme)
                    if entry is None:
                        target_enzyme = target_enzyme.strip(" \t-*`\"'")
                        entry = batch_by_id.get(target_enzyme)
                    if entry is None:
                        continue

                    # Verify the source enzyme exists and has a sequence
                    source_seq = _seqsrc_find_sequence(df, source_enzyme, preferred_campaign)
                    if source_seq is None:
                        source_enzyme = source_enzyme.strip(" \t*`\"'")
                        source_seq = _seqsrc_find_sequence(df, source_enzyme, preferred_campaign)
                    if not source_seq:
                        continue

                    if not entry['parent_id']:
                        # Update the parent_enzyme_id if it's missing
                        df.at[entry['idx'], 'parent_enzyme_id'] = source_enzyme
                        _seqsrc_append_flag(df, entry['idx'], "gemini_suggested_parent")
                        # Remember the assignment so a repeated line is not counted twice.
                        entry['parent_id'] = source_enzyme
                        identified_count += 1
                        log.info(f"Set {source_enzyme} as parent for {target_enzyme} (Gemini suggestion)")
                    elif entry['parent_id'] != source_enzyme:
                        # Log if Gemini suggests a different parent than what's recorded
                        log.info(f"Gemini suggests {source_enzyme} as source for {target_enzyme}, "
                                 f"but parent is recorded as {entry['parent_id']}")

            except Exception as e:
                # Best effort: a failed batch must not stop the remaining ones.
                log.warning(f"Failed to identify sequence sources for batch {batch_number}: {e}")
                continue

        if identified_count > 0:
            log.info(f"Campaign {campaign_id_str}: Successfully identified {identified_count} sequence sources using Gemini")

    return df
|
1122
|
+
|
1123
|
+
|
1124
|
+
# === 9. MAIN PROCESSOR === ---------------------------------------------------
|
569
1125
|
|
570
1126
|
class SequenceProcessor:
|
571
1127
|
"""Main processor for handling the complete workflow."""
|
572
1128
|
|
573
|
-
def __init__(self, input_csv: Path, output_csv: Path):
|
1129
|
+
def __init__(self, input_csv: Path, output_csv: Path, debug_dir: Optional[Path] = None, strict_mutation_validation: bool = True):
|
574
1130
|
self.input_csv = input_csv
|
575
1131
|
self.output_csv = output_csv
|
1132
|
+
self.debug_dir = debug_dir
|
1133
|
+
self.strict_mutation_validation = strict_mutation_validation
|
576
1134
|
self.df = None
|
577
1135
|
self.generator = None
|
578
1136
|
|
@@ -593,7 +1151,7 @@ class SequenceProcessor:
|
|
593
1151
|
self.df["flag"] = ""
|
594
1152
|
|
595
1153
|
# Initialize generator
|
596
|
-
self.generator = SequenceGenerator(self.df)
|
1154
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
597
1155
|
|
598
1156
|
def _normalize_columns(self) -> None:
|
599
1157
|
"""Automatically detect and normalize column names from different formats."""
|
@@ -855,7 +1413,7 @@ class SequenceProcessor:
|
|
855
1413
|
self.df = self.df[campaign_mask].copy()
|
856
1414
|
|
857
1415
|
# Rebuild relationships for this campaign
|
858
|
-
self.generator = SequenceGenerator(self.df)
|
1416
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
859
1417
|
|
860
1418
|
# Flag complex mutations
|
861
1419
|
self.flag_complex_mutations()
|
@@ -866,6 +1424,17 @@ class SequenceProcessor:
|
|
866
1424
|
self.process_remaining()
|
867
1425
|
self.backward_pass()
|
868
1426
|
|
1427
|
+
# Use Gemini to identify parent enzymes for entries with missing sequences
|
1428
|
+
log.info(f"Identifying parents with Gemini for campaign: {campaign_id}")
|
1429
|
+
self.df = identify_parents_with_gemini(self.df)
|
1430
|
+
|
1431
|
+
# Rebuild relationships after parent identification
|
1432
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
1433
|
+
|
1434
|
+
# Try to fill sequences again after parent identification
|
1435
|
+
log.info(f"Attempting to fill sequences after parent identification for campaign: {campaign_id}")
|
1436
|
+
self.process_remaining()
|
1437
|
+
|
869
1438
|
# Update the original dataframe with results
|
870
1439
|
original_df.loc[campaign_mask, :] = self.df
|
871
1440
|
|
@@ -874,6 +1443,50 @@ class SequenceProcessor:
|
|
874
1443
|
|
875
1444
|
log.info(f"Completed campaign: {campaign_id}")
|
876
1445
|
|
1446
|
+
# After processing all campaigns, check for any remaining empty sequences
|
1447
|
+
# and use Gemini to identify sequence sources (including cross-campaign relationships)
|
1448
|
+
empty_count = sum(self.df["protein_sequence"].str.strip() == "")
|
1449
|
+
total_count = len(self.df)
|
1450
|
+
|
1451
|
+
if empty_count > 0:
|
1452
|
+
log.info(f"Found {empty_count}/{total_count} empty sequences after initial processing. "
|
1453
|
+
"Using Gemini to identify sequence sources (including cross-campaign relationships)...")
|
1454
|
+
self.df = identify_sequence_sources_with_gemini(self.df, self.debug_dir)
|
1455
|
+
|
1456
|
+
# Process campaigns again after identifying new parent relationships
|
1457
|
+
log.info("Reprocessing campaigns after sequence source identification...")
|
1458
|
+
|
1459
|
+
for campaign_id in campaigns:
|
1460
|
+
if pd.isna(campaign_id):
|
1461
|
+
campaign_id = "unknown"
|
1462
|
+
|
1463
|
+
log.info(f"Reprocessing campaign: {campaign_id}")
|
1464
|
+
|
1465
|
+
# Filter data for this campaign
|
1466
|
+
campaign_mask = self.df['campaign_id'] == campaign_id
|
1467
|
+
if pd.isna(campaign_id):
|
1468
|
+
campaign_mask = self.df['campaign_id'].isna()
|
1469
|
+
|
1470
|
+
# Store original dataframe
|
1471
|
+
original_df = self.df
|
1472
|
+
|
1473
|
+
# Process only this campaign's data
|
1474
|
+
self.df = self.df[campaign_mask].copy()
|
1475
|
+
|
1476
|
+
# Rebuild relationships for this campaign
|
1477
|
+
self.generator = SequenceGenerator(self.df, strict_mutation_validation=self.strict_mutation_validation)
|
1478
|
+
|
1479
|
+
# Try to fill sequences again
|
1480
|
+
self.process_remaining()
|
1481
|
+
|
1482
|
+
# Update the original dataframe with results
|
1483
|
+
original_df.loc[campaign_mask, :] = self.df
|
1484
|
+
|
1485
|
+
# Restore original dataframe
|
1486
|
+
self.df = original_df
|
1487
|
+
|
1488
|
+
log.info(f"Completed reprocessing campaign: {campaign_id}")
|
1489
|
+
|
877
1490
|
# Save results
|
878
1491
|
self.save_results()
|
879
1492
|
|
@@ -922,6 +1535,16 @@ def main(argv: Optional[List[str]] = None) -> None:
|
|
922
1535
|
default=0,
|
923
1536
|
help="Increase verbosity (use -vv for debug output)"
|
924
1537
|
)
|
1538
|
+
parser.add_argument(
|
1539
|
+
"--debug-dir",
|
1540
|
+
type=Path,
|
1541
|
+
help="Directory to save debug information (Gemini prompts and responses)"
|
1542
|
+
)
|
1543
|
+
parser.add_argument(
|
1544
|
+
"--allow-mutation-mismatches",
|
1545
|
+
action="store_true",
|
1546
|
+
help="Allow sequence generation even when mutations don't match (default: strict validation)"
|
1547
|
+
)
|
925
1548
|
|
926
1549
|
args = parser.parse_args(argv)
|
927
1550
|
|
@@ -929,7 +1552,13 @@ def main(argv: Optional[List[str]] = None) -> None:
|
|
929
1552
|
setup_logging(args.verbose)
|
930
1553
|
|
931
1554
|
# Process the data (format detection is automatic)
|
932
|
-
|
1555
|
+
strict_validation = not args.allow_mutation_mismatches
|
1556
|
+
processor = SequenceProcessor(
|
1557
|
+
args.input_csv,
|
1558
|
+
args.output_csv,
|
1559
|
+
getattr(args, 'debug_dir', None),
|
1560
|
+
strict_mutation_validation=strict_validation
|
1561
|
+
)
|
933
1562
|
processor.run()
|
934
1563
|
|
935
1564
|
|