debase 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/cleanup_sequence.py +151 -1
- debase/enzyme_lineage_extractor.py +114 -20
- debase/lineage_format.py +335 -56
- debase/reaction_info_extractor.py +60 -32
- debase/substrate_scope_extractor.py +366 -93
- debase/wrapper.py +37 -11
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/METADATA +1 -1
- debase-0.4.3.dist-info/RECORD +16 -0
- debase-0.4.1.dist-info/RECORD +0 -16
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/WHEEL +0 -0
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/entry_points.txt +0 -0
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.1.dist-info → debase-0.4.3.dist-info}/top_level.txt +0 -0
debase/lineage_format.py
CHANGED
@@ -52,6 +52,12 @@ try:
 except ImportError:  # pragma: no cover
     RDKIT_OK = False

+try:
+    import google.generativeai as genai  # type: ignore
+    GEMINI_OK = True
+except ImportError:  # pragma: no cover
+    GEMINI_OK = False
+
 # Input columns that MUST be present ------------------------------------------------
 INPUT_REQUIRED: Tuple[str, ...] = (
     "enzyme_id",
@@ -106,9 +112,13 @@ OUTPUT_COLUMNS: Tuple[str, ...] = (
     "x_coordinate",
     "y_coordinate",
     "fitness_value",
+    "fitness_type",
     "cofactor",
     "reaction_condition",
     "ee",
+    "campaign_id",
+    "generation",
+    "parent_enzyme_id",
     "additional_information",
 )

@@ -130,6 +140,9 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
 # Local PubChem DB (optional) --------------------------------------------------------
 PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"

+# Gemini API configuration -----------------------------------------------------------
+GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
+
 # Miscellaneous ----------------------------------------------------------------------
 WELL_ROWS: str = "ABCDEFGH"  # 8 rows, 12 cols => 96 wells

@@ -231,9 +244,13 @@ class FlatRow:
     x_coordinate: str = ""
     y_coordinate: str = ""
     fitness_value: Optional[float] = None
+    fitness_type: str = ""
     cofactor: str = ""
     reaction_condition: str = ""
     ee: str = ""
+    campaign_id: str = ""
+    generation: str = ""
+    parent_enzyme_id: str = ""
     additional_information: str = ""

     def as_dict(self) -> Dict[str, str]:
@@ -253,9 +270,13 @@ class FlatRow:
             "x_coordinate": self.x_coordinate,
             "y_coordinate": self.y_coordinate,
             "fitness_value": self.fitness_value,
+            "fitness_type": self.fitness_type,
             "cofactor": self.cofactor,
             "reaction_condition": self.reaction_condition,
             "ee": self.ee,
+            "campaign_id": self.campaign_id,
+            "generation": self.generation,
+            "parent_enzyme_id": self.parent_enzyme_id,
             "additional_information": self.additional_information,
         }
         # Convert None to empty string for CSV friendliness
@@ -527,39 +548,224 @@ def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
 # === 7. FLATTENING CORE ============================================================

 def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
-    """Fill missing sequences in substrate scope entries from
-    … (1 line elided by the registry diff viewer)
+    """Fill missing sequences in substrate scope entries from reaction data entries.
+
+    This function:
+    1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
+    2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    """
+    # Step 1: Clean up 3a data format
+    log.info("Cleaning up reaction data (3a) format...")
+
+    # Handle column aliasing for enzyme_id
+    if 'enzyme' in df.columns and 'enzyme_id' not in df.columns:
+        df['enzyme_id'] = df['enzyme']
+        log.info("Renamed 'enzyme' column to 'enzyme_id' in reaction data")
+
+    # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}

-    #
-    … (12 lines elided by the registry diff viewer)
+    # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
+    reaction_entries = df[df.get("data_type") == "lineage"]
+    log.info(f"Found {len(reaction_entries)} reaction data entries to extract sequences from")
+
+    for _, row in reaction_entries.iterrows():
+        eid = str(row["enzyme_id"])
+        campaign_id = str(row.get("campaign_id", "default"))
+
+        # Prioritize protein_sequence (from 3a) over aa_sequence (from lineage file)
+        aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
+        nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", "")) or str(row.get("dna_seq", ""))
+
+        if aa_seq and aa_seq != "nan" and aa_seq != "":
+            # Use campaign_id + enzyme_id as composite key for exact matching
+            composite_key = f"{campaign_id}_{eid}"
+            seq_lookup[composite_key] = {
+                "aa_sequence": aa_seq,
+                "nt_sequence": nt_seq if nt_seq != "nan" else "",
+                "campaign_id": campaign_id,
+                "enzyme_id": eid
+            }
+
+            # Also keep simple enzyme_id lookup as fallback
+            seq_lookup[eid] = {
+                "aa_sequence": aa_seq,
+                "nt_sequence": nt_seq if nt_seq != "nan" else "",
+                "campaign_id": campaign_id,
+                "enzyme_id": eid
+            }
+
+    log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
+
+    # Step 3: Fill missing sequences in substrate scope entries (3b)
+    substrate_entries = df[df.get("data_type") == "substrate_scope"]
+    log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
+
     filled_count = 0
     for idx, row in df.iterrows():
+        if row.get("data_type") != "substrate_scope":
+            continue
+
         eid = str(row["enzyme_id"])
+        campaign_id = str(row.get("campaign_id", "default"))

         # Check if this row needs sequence filling
         aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
-        if
-        … (2 lines elided by the registry diff viewer)
-        if seq_lookup
-            df.at[idx, "
-            df.at[idx, "
-        … (1 line elided by the registry diff viewer)
+        if not aa_seq or aa_seq == "nan" or aa_seq == "":
+            # Try campaign-specific lookup first (most precise match)
+            composite_key = f"{campaign_id}_{eid}"
+            if composite_key in seq_lookup:
+                df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                if seq_lookup[composite_key]["nt_sequence"]:
+                    df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                    df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                filled_count += 1
+                log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
+
+            # Fallback to enzyme_id only lookup
+            elif eid in seq_lookup:
+                df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
+                df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
+                if seq_lookup[eid]["nt_sequence"]:
+                    df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
+                    df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+                filled_count += 1
+                log.debug(f"Filled sequence for {eid} (fallback lookup)")
+
+            else:
+                log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")

     if filled_count > 0:
-        log.info(f"
+        log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
+
+    return df
+
+
+def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
+    """Use Gemini API to identify parent enzymes for entries with missing parent information."""
+    if not GEMINI_OK:
+        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
+        return df
+
+    if not GEMINI_API_KEY:
+        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
+        return df
+
+    try:
+        genai.configure(api_key=GEMINI_API_KEY)
+        model = genai.GenerativeModel('gemini-1.5-flash')
+    except Exception as e:
+        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
+        return df
+
+    # Find entries with empty sequences but missing parent information
+    entries_needing_parents = []
+    for idx, row in df.iterrows():
+        aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
+        nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
+        parent_id = str(row.get("parent_enzyme_id", "")).strip()
+
+        # Only process entries that have empty sequences AND no parent info
+        if (not aa_seq or aa_seq == "nan" or aa_seq == "") and (not nt_seq or nt_seq == "nan" or nt_seq == "") and (not parent_id or parent_id == "nan"):
+            enzyme_id = str(row.get("enzyme_id", ""))
+            campaign_id = str(row.get("campaign_id", ""))
+            generation = str(row.get("generation", ""))
+
+            entries_needing_parents.append({
+                "idx": idx,
+                "enzyme_id": enzyme_id,
+                "campaign_id": campaign_id,
+                "generation": generation
+            })
+
+    if not entries_needing_parents:
+        log.info("No entries need parent identification from Gemini")
+        return df
+
+    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
+
+    # Create a lookup of all available enzyme IDs for context
+    available_enzymes = {}
+    for idx, row in df.iterrows():
+        enzyme_id = str(row.get("enzyme_id", ""))
+        campaign_id = str(row.get("campaign_id", ""))
+        aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
+        generation = str(row.get("generation", ""))
+
+        if enzyme_id and enzyme_id != "nan":
+            available_enzymes[enzyme_id] = {
+                "campaign_id": campaign_id,
+                "has_sequence": bool(aa_seq and aa_seq != "nan" and aa_seq != ""),
+                "generation": generation
+            }
+
+    identified_count = 0
+    for entry in entries_needing_parents:
+        enzyme_id = entry["enzyme_id"]
+        campaign_id = entry["campaign_id"]
+        generation = entry["generation"]
+
+        # Create context for Gemini
+        context_info = []
+        context_info.append(f"Enzyme ID: {enzyme_id}")
+        context_info.append(f"Campaign ID: {campaign_id}")
+        if generation:
+            context_info.append(f"Generation: {generation}")
+
+        # Add available enzymes from the same campaign for context
+        campaign_enzymes = []
+        for enz_id, enz_data in available_enzymes.items():
+            if enz_data["campaign_id"] == campaign_id:
+                status = "with sequence" if enz_data["has_sequence"] else "without sequence"
+                gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
+                campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")
+
+        if campaign_enzymes:
+            context_info.append("Available enzymes in same campaign:")
+            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context
+
+        context_text = "\n".join(context_info)
+
+        prompt = f"""
+Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
+
+{context_text}
+
+This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
+
+Please provide your response in this format:
+Parent: [parent_enzyme_id or "Unknown"]
+
+If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
+"""
+
+        try:
+            response = model.generate_content(prompt)
+            response_text = response.text.strip()
+
+            # Parse the response
+            parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
+
+            if parent_match:
+                parent = parent_match.group(1).strip()
+                if parent and parent != "Unknown" and parent != "No parent identified":
+                    # Verify the parent exists in our available enzymes
+                    if parent in available_enzymes:
+                        df.at[entry["idx"], "parent_enzyme_id"] = parent
+                        identified_count += 1
+                        log.info(f"Identified parent for {enzyme_id}: {parent}")
+                    else:
+                        log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
+
+        except Exception as e:
+            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
+            continue
+
+    if identified_count > 0:
+        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
+    else:
+        log.info("No parent enzymes were identified using Gemini API")

     return df

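Note on the hunk above: sequences are registered under both a campaign-qualified key (f"{campaign_id}_{eid}") and the bare enzyme id, and readers try the qualified key first. A minimal, self-contained sketch of that resolution order, with hypothetical ids and sequences (not code from the package):

    seq_lookup = {
        "campA_P411-A1": {"aa_sequence": "MTIKE"},  # campaign-qualified key (preferred)
        "P411-A1": {"aa_sequence": "MTIKE"},        # bare-id fallback
    }

    def resolve_sequence(campaign_id: str, enzyme_id: str) -> str:
        # Try the exact campaign match first; fall back to the bare enzyme id.
        for key in (f"{campaign_id}_{enzyme_id}", enzyme_id):
            entry = seq_lookup.get(key)
            if entry:
                return entry["aa_sequence"]
        return ""

    assert resolve_sequence("campA", "P411-A1") == "MTIKE"
    assert resolve_sequence("campB", "P411-A1") == "MTIKE"  # resolved via the fallback key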
@@ -574,7 +780,7 @@ def _plate_and_well(index: int) -> Tuple[int, str, str]:
     return plate_number, plate_name, well


-def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
+def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str], campaign_id: str = "default") -> str:
     """Get root enzyme id, falling back to generation 0 ancestor or self."""
     if eid in lineage_roots:
         return lineage_roots[eid]
@@ -582,7 +788,12 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
     seen: set[str] = set()
     while cur and cur not in seen:
         seen.add(cur)
+        # Try campaign-specific lookup first, then fall back to composite key
         row = idmap.get(cur, {})
+        if not row:
+            composite_key = f"{campaign_id}_{cur}"
+            row = idmap.get(composite_key, {})
+
         # Look for generation 0 as the root
         if str(row.get("generation", "")).strip() == "0":
             return cur
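Note: the loop above walks parent pointers until it reaches a generation-0 ancestor, with the `seen` set guarding against cycles; the new `campaign_id` parameter only adds a composite-key fallback for each per-step lookup. A self-contained sketch of the walk itself, over a hypothetical three-member lineage:

    idmap = {
        "v3": {"parent_enzyme_id": "v2", "generation": "2"},
        "v2": {"parent_enzyme_id": "v1", "generation": "1"},
        "v1": {"parent_enzyme_id": "", "generation": "0"},
    }

    def root_of(eid: str) -> str:
        seen: set[str] = set()
        cur = eid
        while cur and cur not in seen:  # the seen-set breaks accidental cycles
            seen.add(cur)
            row = idmap.get(cur, {})
            if str(row.get("generation", "")).strip() == "0":
                return cur
            cur = str(row.get("parent_enzyme_id", ""))
        return eid  # fall back to self when no generation-0 ancestor is found

    assert root_of("v3") == "v1"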
@@ -674,6 +885,12 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)

+    # Use Gemini API to identify parent enzymes for entries with missing sequences
+    df = _identify_parents_with_gemini(df)
+
+    # Fill sequences again after parent identification to propagate sequences from identified parents
+    df = _fill_missing_sequences(df)
+
     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)

@@ -694,24 +911,42 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)

     # 3. Flatten rows ---------------------------------------------------------
-    # Create idmap for parent lookups,
+    # Create idmap for parent lookups, using campaign_id + enzyme_id as composite key
     idmap = {}
+    campaign_idmap = {}  # For within-campaign lookups
+
     for _, r in df.iterrows():
         eid = str(r["enzyme_id"])
-        … (10 lines elided by the registry diff viewer)
+        campaign_id = str(r.get("campaign_id", "default"))
+
+        # Use composite key for global idmap
+        composite_key = f"{campaign_id}_{eid}"
+        idmap[composite_key] = r.to_dict()
+
+        # Also maintain campaign-specific idmap for parent lookups
+        if campaign_id not in campaign_idmap:
+            campaign_idmap[campaign_id] = {}
+        campaign_idmap[campaign_id][eid] = r.to_dict()
+
+    # Check for duplicate enzyme_ids within campaigns
+    from collections import defaultdict, Counter
+    campaign_enzyme_counts = defaultdict(list)
+    for _, r in df.iterrows():
+        eid = str(r["enzyme_id"])
+        campaign_id = str(r.get("campaign_id", "default"))
+        campaign_enzyme_counts[campaign_id].append(eid)
+
+    total_duplicates = 0
+    for campaign_id, enzyme_ids in campaign_enzyme_counts.items():
         id_counts = Counter(enzyme_ids)
         duplicates = {k: v for k, v in id_counts.items() if v > 1}
-        … (2 lines elided by the registry diff viewer)
+        if duplicates:
+            total_duplicates += sum(duplicates.values()) - len(duplicates)
+            log.warning(f"Campaign {campaign_id} has duplicate enzyme_ids: {duplicates}")
+
+    if total_duplicates > 0:
+        log.warning(f"Found {total_duplicates} duplicate enzyme_ids across campaigns")
+        log.info("All entries within each campaign will be preserved")

     output_rows: List[Dict[str, str]] = []
     skipped_count = 0
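Note: in the duplicate check above, `sum(duplicates.values()) - len(duplicates)` counts only the surplus copies, so an id seen twice contributes one duplicate and an id seen three times contributes two. Quick check with hypothetical ids:

    from collections import Counter

    enzyme_ids = ["A", "A", "B", "C", "C", "C"]
    id_counts = Counter(enzyme_ids)
    duplicates = {k: v for k, v in id_counts.items() if v > 1}  # {"A": 2, "C": 3}
    assert sum(duplicates.values()) - len(duplicates) == 3      # 1 surplus "A" + 2 surplus "C"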
@@ -747,23 +982,58 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
         smiles_string = _canonical_smiles(smiles_string)

-        # Mutations
-        … (13 lines elided by the registry diff viewer)
+        # Mutations - calculate based on generation 0 enzyme in same campaign --------
+        campaign_id = str(rec.row.get("campaign_id", "default"))
+        generation = str(rec.row.get("generation", "")).strip()
+        parent_id = rec.parent_id
+
+        # Find generation 0 enzyme in same campaign as reference (only for non-gen-0 enzymes)
+        reference_row = {}
+        if generation != "0":
+            for cid, cmap in campaign_idmap.items():
+                if cid == campaign_id:
+                    for enzyme_id, enzyme_row in cmap.items():
+                        enzyme_gen = str(enzyme_row.get("generation", "")).strip()
+                        if enzyme_gen == "0" or enzyme_gen == "0.0":
+                            reference_row = enzyme_row
+                            log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
+                            break
+                    break
+            if not reference_row:
+                log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")

-        … (2 lines elided by the registry diff viewer)
+        reference_aa = ""
+        reference_nt = ""
+        if reference_row:
+            reference_aa = (
+                str(reference_row.get("protein_sequence", ""))
+                or str(reference_row.get("aa_sequence", ""))
+            )
+            reference_nt = (
+                str(reference_row.get("nucleotide_sequence", ""))
+                or str(reference_row.get("nt_sequence", ""))
+            )
+            # If reference doesn't have NT sequence but has AA sequence, reverse translate
+            if (not reference_nt or reference_nt == "nan") and reference_aa and reference_aa != "nan":
+                reference_nt = _rev_translate(reference_aa)
+
+        # For generation 0 enzymes, don't calculate mutations (they are the reference)
+        if generation == "0":
+            aa_muts = ""
+            nt_muts = ""
+            log.info(f"Generation 0 enzyme {eid} - no mutations calculated (is reference)")
+        else:
+            # Debug sequence availability
+            log.info(f"Mutation calc for {eid}: gen={generation}, has_ref_aa={bool(reference_aa and reference_aa != 'nan')}, has_rec_aa={bool(rec.aa_seq and rec.aa_seq != 'nan')}")
+
+            # Calculate mutations relative to generation 0 reference
+            aa_muts = _aa_mut(reference_aa, rec.aa_seq) if rec.aa_seq and rec.aa_seq != "nan" and reference_aa and reference_aa != "nan" else ""
+            nt_muts = _nt_mut(reference_aa, rec.aa_seq, reference_nt, rec.nt_seq) if (reference_aa and reference_aa != "nan") or (reference_nt and reference_nt != "nan") else ""
+
+            if aa_muts or nt_muts:
+                log.info(f"Calculated mutations for {eid} relative to generation 0: AA={aa_muts}, NT={nt_muts}")
+            else:
+                log.warning(f"No mutations calculated for {eid} - ref_aa_len={len(reference_aa) if reference_aa else 0}, rec_aa_len={len(rec.aa_seq) if rec.aa_seq else 0}")

         # Plate / well --------------------------------------------------------
         barcode_plate, plate_name, well = _plate_and_well(idx)
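Note: `_aa_mut` and `_nt_mut` are not shown in this diff. For equal-length sequences the usual approach is a position-wise comparison of the generation-0 reference against the variant; the sketch below illustrates that idea under that assumption and is not the package's actual implementation:

    def aa_mutations(ref: str, var: str) -> str:
        # Emit mutations in the conventional A123T form, 1-indexed.
        if len(ref) != len(var):
            return ""  # length changes would need an alignment; not handled here
        muts = [f"{r}{i}{v}" for i, (r, v) in enumerate(zip(ref, var), start=1) if r != v]
        return "_".join(muts)

    assert aa_mutations("MTIKE", "MTLKE") == "I3L"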
@@ -785,13 +1055,18 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         cof_list = str(row.get("cofactor_list", "")).strip()
         cofactor = cof_iupac or cof_list

+        # Fitness type -------------------------------------------------------
+        fitness_type = ""
+        if rec.ttn_or_yield() is not None:
+            ttn_val = row.get("ttn")
+            fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
+
         # Additional info -----------------------------------------------------
         extra: Dict[str, str] = {
             k: str(v) for k, v in row.items() if k not in INPUT_REQUIRED + OPTIONAL_INPUT
         }
-        … (2 lines elided by the registry diff viewer)
-        extra["fitness_type"] = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
+        # Don't include fitness_type in additional_information since it's now a separate column
+        extra.pop("fitness_type", None)
         additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""

         flat = FlatRow(
@@ -806,9 +1081,13 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
             nt_sequence=rec.nt_seq,
             aa_sequence=rec.aa_seq,
             fitness_value=rec.ttn_or_yield(),
+            fitness_type=fitness_type,
             cofactor=cofactor,
             reaction_condition=reaction_condition,
             ee=str(row.get("ee", "")),
+            campaign_id=campaign_id,
+            generation=generation,
+            parent_enzyme_id=parent_id,
             additional_information=additional_information,
         )
         output_rows.append(flat.as_dict())
debase/reaction_info_extractor.py
CHANGED
@@ -1332,12 +1332,28 @@ class ReactionExtractor:
             y_offset += pix.height * scale

         # Convert the page to a pixmap
-        … (1 line elided by the registry diff viewer)
+        # Limit zoom factor to avoid creating excessively large images
+        # Gemini has limits on image size (approx 20MB or 20 megapixels)
+        zoom = 5.0
+        estimated_pixels = (max_width * zoom) * (total_height * zoom)
+        max_pixels = 20_000_000  # 20 megapixels
+
+        if estimated_pixels > max_pixels:
+            # Calculate appropriate zoom to stay under limit
+            zoom = min(5.0, (max_pixels / (max_width * total_height)) ** 0.5)
+            LOGGER.warning(f"Reducing zoom from 5.0 to {zoom:.2f} to stay under {max_pixels/1e6:.1f} megapixel limit")
+
+        mat = fitz.Matrix(zoom, zoom)
         combined_pix = page.get_pixmap(matrix=mat)
         combined_pix = self._ensure_rgb_pixmap(combined_pix)

         # Convert to PNG and return
         img_bytes = combined_pix.tobytes("png")
+
+        # Check final size
+        final_size_mb = len(img_bytes) / (1024 * 1024)
+        if final_size_mb > 20:
+            LOGGER.warning(f"Combined image is {final_size_mb:.1f}MB, may be too large for vision API")
         output_doc.close()

         # Save debug file if available
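Note: the square root in the zoom clamp follows from pixel count growing with the square of the zoom: pixels = (w * zoom) * (h * zoom) = w * h * zoom^2, so requiring w * h * zoom^2 <= max_pixels gives zoom <= sqrt(max_pixels / (w * h)). Numeric check with a hypothetical combined-page size:

    max_pixels = 20_000_000
    max_width, total_height = 1700, 4400  # hypothetical page dimensions in points
    zoom = min(5.0, (max_pixels / (max_width * total_height)) ** 0.5)
    # At zoom 5.0 this page would render at 187 megapixels; the clamp brings the
    # zoom down to ~1.64 so the output lands right at the 20-megapixel cap.
    assert (max_width * zoom) * (total_height * zoom) <= max_pixels + 1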
@@ -2317,39 +2333,51 @@ Different campaigns may use different model reactions and substrates.
                 }
             )

-            response = model.generate_content(content_parts)
-
-            # Track token usage if available
             try:
-                … (1 line elided by the registry diff viewer)
-                input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
-                output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
-                if input_tokens or output_tokens:
-                    try:
-                        from .wrapper import add_token_usage
-                        add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
-                    except ImportError:
-                        pass  # wrapper not available
-            except Exception:
-                pass  # token tracking is best-effort
-
-            # Parse JSON from response
-            if response and response.text:
-                # Save debug output
-                if self.debug_dir:
-                    timestamp = int(time.time())
-                    _dump(prompt, self.debug_dir / f"model_reaction_multimodal_prompt_{timestamp}.txt")
-                    _dump(response.text, self.debug_dir / f"model_reaction_multimodal_response_{timestamp}.txt")
+                response = model.generate_content(content_parts)

-                #
-                … (8 lines elided by the registry diff viewer)
+                # Track token usage if available
+                try:
+                    if hasattr(response, 'usage_metadata'):
+                        input_tokens = getattr(response.usage_metadata, 'prompt_token_count', 0)
+                        output_tokens = getattr(response.usage_metadata, 'candidates_token_count', 0)
+                        if input_tokens or output_tokens:
+                            try:
+                                from .wrapper import add_token_usage
+                                add_token_usage('reaction_info_extractor', input_tokens, output_tokens)
+                            except ImportError:
+                                pass  # wrapper not available
+                except Exception:
+                    pass  # token tracking is best-effort
+
+                # Parse JSON from response
+                if response and response.text:
+                    # Save debug output
+                    if self.debug_dir:
+                        timestamp = int(time.time())
+                        _dump(prompt, self.debug_dir / f"model_reaction_multimodal_prompt_{timestamp}.txt")
+                        _dump(response.text, self.debug_dir / f"model_reaction_multimodal_response_{timestamp}.txt")
+
+                    # Extract JSON from response
+                    text = response.text.strip()
+                    if text.startswith("```json"):
+                        text = text[7:]
+                    if text.endswith("```"):
+                        text = text[:-3]
+                    data = json.loads(text.strip())
+                else:
+                    raise ValueError("Empty response from multimodal model")
+            except Exception as vision_error:
+                LOGGER.error("Vision API call failed: %s", vision_error)
+                LOGGER.info("Falling back to text-only extraction")
+                # Fall back to text-only extraction
+                data = generate_json_with_retry(
+                    self.model,
+                    prompt,
+                    temperature=self.cfg.model_reaction_temperature,
+                    debug_dir=self.debug_dir,
+                    tag="model_reaction_fallback"
+                )
         else:
             # Fall back to text-only extraction
             data = generate_json_with_retry(