debase 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- debase/_version.py +1 -1
- debase/enzyme_lineage_extractor.py +14 -8
- debase/lineage_format.py +335 -56
- debase/reaction_info_extractor.py +60 -32
- debase/substrate_scope_extractor.py +366 -93
- debase/wrapper.py +37 -11
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/METADATA +1 -1
- debase-0.4.2.dist-info/RECORD +16 -0
- debase-0.4.1.dist-info/RECORD +0 -16
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/WHEEL +0 -0
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/entry_points.txt +0 -0
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {debase-0.4.1.dist-info → debase-0.4.2.dist-info}/top_level.txt +0 -0
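debase/_version.py
CHANGED
(The +1 -1 diff body for this file is missing from this extract.)
debase/enzyme_lineage_extractor.py
CHANGED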
@@ -645,11 +645,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
|
|
645
645
|
came from which parent and what mutations were introduced).
|
646
646
|
|
647
647
|
Respond with a JSON array of objects, each containing:
|
648
|
-
- "location": the identifier (e.g. "Table S1", "Figure 2B", "
|
648
|
+
- "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
|
649
649
|
- "type": one of "table", "figure", "text", "section"
|
650
650
|
- "confidence": your confidence score (0-100) that this location contains lineage data
|
651
651
|
- "reason": brief explanation of why this location likely contains lineage
|
652
652
|
|
653
|
+
IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
|
654
|
+
|
653
655
|
Order by confidence score (highest first). Tables showing complete variant lineages or
|
654
656
|
mutation lists should be ranked higher than figure showing complete variant lineages.
|
655
657
|
Text sections is used when no suitable tables/figurews exist.
|
@@ -747,7 +749,7 @@ def identify_campaigns(
|
|
747
749
|
debug_dir: str | Path | None = None,
|
748
750
|
) -> List[Campaign]:
|
749
751
|
"""Identify distinct directed evolution campaigns in the manuscript."""
|
750
|
-
prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text
|
752
|
+
prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
|
751
753
|
campaigns_data: List[dict] = []
|
752
754
|
try:
|
753
755
|
campaigns_data = generate_json_with_retry(
|
@@ -825,7 +827,7 @@ def identify_evolution_locations(
|
|
825
827
|
|
826
828
|
# Include TOC before the main text
|
827
829
|
combined_text = toc_text + text if toc_text else text
|
828
|
-
prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
|
830
|
+
prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
|
829
831
|
locs: List[dict] = []
|
830
832
|
try:
|
831
833
|
locs = generate_json_with_retry(
|
@@ -1306,7 +1308,7 @@ def get_lineage(
|
|
1306
1308
|
5. Return both variants and campaigns.
|
1307
1309
|
"""
|
1308
1310
|
# First, identify campaigns in the manuscript
|
1309
|
-
campaigns = identify_campaigns(full_text
|
1311
|
+
campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
|
1310
1312
|
|
1311
1313
|
if campaigns:
|
1312
1314
|
log.info(f"Identified {len(campaigns)} distinct campaigns")
|
@@ -1364,7 +1366,7 @@ def get_lineage(
|
|
1364
1366
|
context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
|
1365
1367
|
locations_with_context.append({
|
1366
1368
|
'location': loc,
|
1367
|
-
'context': context_text
|
1369
|
+
'context': context_text # Full extracted context
|
1368
1370
|
})
|
1369
1371
|
|
1370
1372
|
# For each campaign, ask Gemini to select the best location
|
@@ -1554,13 +1556,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
|
|
1554
1556
|
Look for table of contents entries or section listings that mention sequences.
|
1555
1557
|
Return a JSON array where each element has:
|
1556
1558
|
- "section": the section heading or description
|
1557
|
-
- "page": the page number
|
1559
|
+
- "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
|
1558
1560
|
|
1559
1561
|
Focus on:
|
1560
1562
|
- Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
|
1561
|
-
-
|
1563
|
+
- For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
|
1562
1564
|
- Prioritize sections that mention "protein" or "amino acid" sequences
|
1563
1565
|
|
1566
|
+
CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
|
1567
|
+
- Correct: "53", "S12", "147"
|
1568
|
+
- Wrong: "p. 53", "P. 53", "page 53", "pg 53"
|
1569
|
+
|
1564
1570
|
Return [] if no sequence sections are found.
|
1565
1571
|
Absolutely don't include nucleotides or primer sequences, it is better to return nothing then incomplete sequence, use your best judgement.
|
1566
1572
|
|
@@ -1572,7 +1578,7 @@ TEXT (truncated):
|
|
1572
1578
|
|
1573
1579
|
def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
|
1574
1580
|
"""Ask Gemini for promising places to look for sequences."""
|
1575
|
-
prompt = _SEQ_LOC_PROMPT.format(chunk=text
|
1581
|
+
prompt = _SEQ_LOC_PROMPT.format(chunk=text)
|
1576
1582
|
try:
|
1577
1583
|
locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
|
1578
1584
|
return locs if isinstance(locs, list) else []
|
debase/lineage_format.py
CHANGED
@@ -52,6 +52,12 @@ try:
|
|
52
52
|
except ImportError: # pragma: no cover
|
53
53
|
RDKIT_OK = False
|
54
54
|
|
55
|
+
try:
|
56
|
+
import google.generativeai as genai # type: ignore
|
57
|
+
GEMINI_OK = True
|
58
|
+
except ImportError: # pragma: no cover
|
59
|
+
GEMINI_OK = False
|
60
|
+
|
55
61
|
# Input columns that MUST be present ------------------------------------------------
|
56
62
|
INPUT_REQUIRED: Tuple[str, ...] = (
|
57
63
|
"enzyme_id",
|
@@ -106,9 +112,13 @@ OUTPUT_COLUMNS: Tuple[str, ...] = (
|
|
106
112
|
"x_coordinate",
|
107
113
|
"y_coordinate",
|
108
114
|
"fitness_value",
|
115
|
+
"fitness_type",
|
109
116
|
"cofactor",
|
110
117
|
"reaction_condition",
|
111
118
|
"ee",
|
119
|
+
"campaign_id",
|
120
|
+
"generation",
|
121
|
+
"parent_enzyme_id",
|
112
122
|
"additional_information",
|
113
123
|
)
|
114
124
|
|
@@ -130,6 +140,9 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
|
130
140
|
# Local PubChem DB (optional) --------------------------------------------------------
|
131
141
|
PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
|
132
142
|
|
143
|
+
# Gemini API configuration -----------------------------------------------------------
|
144
|
+
GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
|
145
|
+
|
133
146
|
# Miscellaneous ----------------------------------------------------------------------
|
134
147
|
WELL_ROWS: str = "ABCDEFGH" # 8 rows, 12 cols => 96 wells
|
135
148
|
|
@@ -231,9 +244,13 @@ class FlatRow:
|
|
231
244
|
x_coordinate: str = ""
|
232
245
|
y_coordinate: str = ""
|
233
246
|
fitness_value: Optional[float] = None
|
247
|
+
fitness_type: str = ""
|
234
248
|
cofactor: str = ""
|
235
249
|
reaction_condition: str = ""
|
236
250
|
ee: str = ""
|
251
|
+
campaign_id: str = ""
|
252
|
+
generation: str = ""
|
253
|
+
parent_enzyme_id: str = ""
|
237
254
|
additional_information: str = ""
|
238
255
|
|
239
256
|
def as_dict(self) -> Dict[str, str]:
|
@@ -253,9 +270,13 @@ class FlatRow:
|
|
253
270
|
"x_coordinate": self.x_coordinate,
|
254
271
|
"y_coordinate": self.y_coordinate,
|
255
272
|
"fitness_value": self.fitness_value,
|
273
|
+
"fitness_type": self.fitness_type,
|
256
274
|
"cofactor": self.cofactor,
|
257
275
|
"reaction_condition": self.reaction_condition,
|
258
276
|
"ee": self.ee,
|
277
|
+
"campaign_id": self.campaign_id,
|
278
|
+
"generation": self.generation,
|
279
|
+
"parent_enzyme_id": self.parent_enzyme_id,
|
259
280
|
"additional_information": self.additional_information,
|
260
281
|
}
|
261
282
|
# Convert None to empty string for CSV friendliness
|
@@ -527,39 +548,224 @@ def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
|
|
527
548
|
# === 7. FLATTENING CORE ============================================================
|
528
549
|
|
529
550
|
def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing sequences in substrate scope entries from reaction data entries.

    Rows whose ``data_type`` is ``"lineage"`` (the 3a reaction data) are the
    sequence source. Rows whose ``data_type`` is ``"substrate_scope"`` (3b)
    and that carry no amino-acid sequence receive one. A match on
    ``(campaign_id, enzyme_id)`` is preferred; a match on ``enzyme_id`` alone
    is the fallback.

    Missing cells (``None``, NaN, the literal text ``"nan"``) are treated as
    empty, so an empty ``protein_sequence`` correctly falls through to
    ``aa_sequence`` (and likewise for the nucleotide columns).

    Args:
        df: Combined table. It is modified in place: an ``enzyme`` column is
            copied to ``enzyme_id`` when the latter is absent, and the
            sequence columns of substrate scope rows are filled.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        KeyError: If rows have to be processed but neither ``enzyme_id`` nor
            ``enzyme`` is present.
    """
    # Step 1: Clean up 3a data format
    log.info("Cleaning up reaction data (3a) format...")

    # Handle column aliasing for enzyme_id
    if "enzyme" in df.columns and "enzyme_id" not in df.columns:
        df["enzyme_id"] = df["enzyme"]
        log.info("Renamed 'enzyme' column to 'enzyme_id' in reaction data")

    # Without a data_type column no row can be classified as source or
    # target, so there is nothing to fill.
    if "data_type" not in df.columns:
        log.info("Found 0 reaction data entries to extract sequences from")
        log.info("Found 0 substrate scope entries to populate sequences for")
        return df

    is_lineage = df["data_type"] == "lineage"
    is_scope = df["data_type"] == "substrate_scope"

    # Step 2: Create sequence lookups from the 3a rows.  The exact lookup is
    # keyed by a (campaign_id, enzyme_id) tuple and kept separate from the
    # enzyme_id-only fallback so the two key spaces can never collide.
    log.info(f"Found {int(is_lineage.sum())} reaction data entries to extract sequences from")
    exact_lookup = {}
    fallback_lookup = {}
    for _, row in df[is_lineage].iterrows():
        # Prioritize protein_sequence (from 3a) over aa_sequence (from lineage file)
        aa_seq = _first_sequence_text(row, ("protein_sequence", "aa_sequence"))
        if not aa_seq:
            continue
        eid = str(row["enzyme_id"])
        campaign_id = _sequence_cell_text(row.get("campaign_id")) or "default"
        entry = {
            "aa_sequence": aa_seq,
            "nt_sequence": _first_sequence_text(
                row, ("nucleotide_sequence", "nt_sequence", "dna_seq")
            ),
        }
        exact_lookup[(campaign_id, eid)] = entry
        # Later rows win, matching a plain dict overwrite.
        fallback_lookup[eid] = entry

    log.info(f"Created sequence lookup with {len(exact_lookup)} entries from reaction data")

    # Step 3: Fill missing sequences in substrate scope entries (3b).
    # df[is_scope] is a copy, so writing to df while iterating it is safe.
    log.info(f"Found {int(is_scope.sum())} substrate scope entries to populate sequences for")
    filled_count = 0
    for idx, row in df[is_scope].iterrows():
        if _first_sequence_text(row, ("protein_sequence", "aa_sequence")):
            continue  # already has a sequence

        eid = str(row["enzyme_id"])
        campaign_id = _sequence_cell_text(row.get("campaign_id")) or "default"

        entry = exact_lookup.get((campaign_id, eid))
        if entry is not None:
            match_note = f"Filled sequence for {eid} in campaign {campaign_id} (exact match)"
        else:
            entry = fallback_lookup.get(eid)
            match_note = f"Filled sequence for {eid} (fallback lookup)"

        if entry is None:
            log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")
            continue

        _write_sequence_cell(df, idx, "protein_sequence", entry["aa_sequence"])
        _write_sequence_cell(df, idx, "aa_sequence", entry["aa_sequence"])
        if entry["nt_sequence"]:
            _write_sequence_cell(df, idx, "nucleotide_sequence", entry["nt_sequence"])
            _write_sequence_cell(df, idx, "nt_sequence", entry["nt_sequence"])
        filled_count += 1
        log.debug(match_note)

    if filled_count > 0:
        log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")

    return df


def _sequence_cell_text(value) -> str:
    """Return a cell as stripped text, mapping None/NaN/"nan"/"<NA>" to ""."""
    if value is None:
        return ""
    text = str(value).strip()
    return "" if text.lower() in ("nan", "<na>") else text


def _first_sequence_text(row, columns) -> str:
    """Return the first non-empty cell of ``row`` among ``columns``, else ""."""
    for column in columns:
        text = _sequence_cell_text(row.get(column))
        if text:
            return text
    return ""


def _write_sequence_cell(df: pd.DataFrame, idx, column: str, value: str) -> None:
    """Write a string into ``df.at[idx, column]``, creating the column if needed."""
    # A column that was entirely empty on load is numeric (float NaN); cast it
    # to object first so assigning a string is not an incompatible-dtype write.
    if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].astype(object)
    df.at[idx, column] = value
|
643
|
+
|
644
|
+
|
645
|
+
def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
    """Use Gemini API to identify parent enzymes for entries with missing parent information.

    A row is sent to Gemini only when it has an ``enzyme_id`` but no
    amino-acid sequence, no nucleotide sequence and no ``parent_enzyme_id``.
    The model is shown up to ten other enzymes of the same campaign and asked
    to answer with ``Parent: <id>``.  A suggestion is accepted only if it is
    an ``enzyme_id`` present in ``df`` and differs from the enzyme itself.

    The step is best effort: when the client library or API key is missing,
    configuration fails, or a request fails, a warning is logged and the
    affected rows are left unchanged.

    Args:
        df: Table of enzyme rows. Modified in place (``parent_enzyme_id``).

    Returns:
        The same ``df`` object that was passed in.
    """
    if not GEMINI_OK:
        log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
        return df

    if not GEMINI_API_KEY:
        log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
        return df

    try:
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-1.5-flash')
    except Exception as e:  # third-party client; any failure disables this optional step
        log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
        return df

    # Single pass over the table: collect the rows that need a parent and a
    # per-campaign roster of enzymes to offer the model as candidates.
    entries_needing_parents = []
    known_ids = set()
    rosters = {}  # campaign_id -> {enzyme_id: {"has_sequence", "generation"}}
    for idx, row in df.iterrows():
        enzyme_id = _parent_field_text(row.get("enzyme_id"))
        if not enzyme_id:
            continue  # nothing meaningful to ask about

        campaign_id = _parent_field_text(row.get("campaign_id"))
        generation = _parent_field_text(row.get("generation"))
        has_aa = bool(
            _parent_field_text(row.get("protein_sequence"))
            or _parent_field_text(row.get("aa_sequence"))
        )
        has_nt = bool(
            _parent_field_text(row.get("nucleotide_sequence"))
            or _parent_field_text(row.get("nt_sequence"))
        )

        known_ids.add(enzyme_id)
        info = rosters.setdefault(campaign_id, {}).setdefault(
            enzyme_id, {"has_sequence": False, "generation": ""}
        )
        # The same enzyme may occur on several rows; merge what is known.
        info["has_sequence"] = info["has_sequence"] or has_aa
        info["generation"] = info["generation"] or generation

        # Only process entries that have empty sequences AND no parent info
        if not has_aa and not has_nt and not _parent_field_text(row.get("parent_enzyme_id")):
            entries_needing_parents.append({
                "idx": idx,
                "enzyme_id": enzyme_id,
                "campaign_id": campaign_id,
                "generation": generation,
            })

    if not entries_needing_parents:
        log.info("No entries need parent identification from Gemini")
        return df

    log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")

    # An all-empty parent column is numeric (float NaN); make it able to hold ids.
    if "parent_enzyme_id" in df.columns and pd.api.types.is_numeric_dtype(df["parent_enzyme_id"]):
        df["parent_enzyme_id"] = df["parent_enzyme_id"].astype(object)

    identified_count = 0
    for entry in entries_needing_parents:
        enzyme_id = entry["enzyme_id"]
        campaign_id = entry["campaign_id"]
        generation = entry["generation"]

        # Create context for Gemini
        context_info = [f"Enzyme ID: {enzyme_id}", f"Campaign ID: {campaign_id}"]
        if generation:
            context_info.append(f"Generation: {generation}")

        # Candidates from the same campaign; the enzyme itself is never its own parent.
        campaign_enzymes = []
        for enz_id, enz_data in rosters.get(campaign_id, {}).items():
            if enz_id == enzyme_id:
                continue
            status = "with sequence" if enz_data["has_sequence"] else "without sequence"
            gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
            campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")

        if campaign_enzymes:
            context_info.append("Available enzymes in same campaign:")
            context_info.extend(campaign_enzymes[:10])  # Limit to first 10 for context

        context_text = "\n".join(context_info)

        prompt = (
            "\n"
            "Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?\n"
            "\n"
            f"{context_text}\n"
            "\n"
            "This enzyme currently has no sequence data and no parent information. "
            "Based on the enzyme ID and the available enzymes in the same campaign, "
            "can you identify which enzyme is likely the parent?\n"
            "\n"
            "Please provide your response in this format:\n"
            'Parent: [parent_enzyme_id or "Unknown"]\n'
            "\n"
            'If you cannot identify a parent enzyme, just respond with "Parent: Unknown".\n'
        )

        try:
            response = model.generate_content(prompt)
            response_text = response.text.strip()
        except Exception as e:  # network/API/blocked-response errors must not abort the run
            log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
            continue

        # Parse the response
        parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
        if not parent_match:
            continue

        raw_parent = parent_match.group(1).strip()
        # The requested format shows the id in brackets, so the model may echo
        # brackets, quotes or markdown emphasis around its answer.
        cleaned_parent = raw_parent.strip("[]\"'`* ")
        if cleaned_parent.casefold() in ("", "unknown", "no parent identified"):
            continue

        # Prefer the verbatim answer; fall back to the cleaned form.
        parent = next((c for c in (raw_parent, cleaned_parent) if c in known_ids), "")
        if not parent:
            log.warning(f"Gemini suggested parent {raw_parent} for {enzyme_id}, but it's not in available enzymes")
        elif parent == enzyme_id:
            log.warning(f"Gemini suggested {enzyme_id} as its own parent; ignoring")
        else:
            df.at[entry["idx"], "parent_enzyme_id"] = parent
            identified_count += 1
            log.info(f"Identified parent for {enzyme_id}: {parent}")

    if identified_count > 0:
        log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
    else:
        log.info("No parent enzymes were identified using Gemini API")

    return df


def _parent_field_text(value) -> str:
    """Return a cell as stripped text, mapping None/NaN/"nan"/"<NA>" to ""."""
    if value is None:
        return ""
    text = str(value).strip()
    return "" if text.lower() in ("nan", "<na>") else text
|
565
771
|
|
@@ -574,7 +780,7 @@ def _plate_and_well(index: int) -> Tuple[int, str, str]:
|
|
574
780
|
return plate_number, plate_name, well
|
575
781
|
|
576
782
|
|
577
|
-
def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
|
783
|
+
def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str], campaign_id: str = "default") -> str:
|
578
784
|
"""Get root enzyme id, falling back to generation 0 ancestor or self."""
|
579
785
|
if eid in lineage_roots:
|
580
786
|
return lineage_roots[eid]
|
@@ -582,7 +788,12 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
|
|
582
788
|
seen: set[str] = set()
|
583
789
|
while cur and cur not in seen:
|
584
790
|
seen.add(cur)
|
791
|
+
# Try campaign-specific lookup first, then fall back to composite key
|
585
792
|
row = idmap.get(cur, {})
|
793
|
+
if not row:
|
794
|
+
composite_key = f"{campaign_id}_{cur}"
|
795
|
+
row = idmap.get(composite_key, {})
|
796
|
+
|
586
797
|
# Look for generation 0 as the root
|
587
798
|
if str(row.get("generation", "")).strip() == "0":
|
588
799
|
return cur
|
@@ -674,6 +885,12 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
674
885
|
# Fill missing sequences in substrate scope entries from lineage data
|
675
886
|
df = _fill_missing_sequences(df)
|
676
887
|
|
888
|
+
# Use Gemini API to identify parent enzymes for entries with missing sequences
|
889
|
+
df = _identify_parents_with_gemini(df)
|
890
|
+
|
891
|
+
# Fill sequences again after parent identification to propagate sequences from identified parents
|
892
|
+
df = _fill_missing_sequences(df)
|
893
|
+
|
677
894
|
# 1. Generate lineage roots once -----------------------------------------
|
678
895
|
lineage_roots = _generate_lineage_roots(df)
|
679
896
|
|
@@ -694,24 +911,42 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
694
911
|
# _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
|
695
912
|
|
696
913
|
# 3. Flatten rows ---------------------------------------------------------
|
697
|
-
# Create idmap for parent lookups,
|
914
|
+
# Create idmap for parent lookups, using campaign_id + enzyme_id as composite key
|
698
915
|
idmap = {}
|
916
|
+
campaign_idmap = {} # For within-campaign lookups
|
917
|
+
|
699
918
|
for _, r in df.iterrows():
|
700
919
|
eid = str(r["enzyme_id"])
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
920
|
+
campaign_id = str(r.get("campaign_id", "default"))
|
921
|
+
|
922
|
+
# Use composite key for global idmap
|
923
|
+
composite_key = f"{campaign_id}_{eid}"
|
924
|
+
idmap[composite_key] = r.to_dict()
|
925
|
+
|
926
|
+
# Also maintain campaign-specific idmap for parent lookups
|
927
|
+
if campaign_id not in campaign_idmap:
|
928
|
+
campaign_idmap[campaign_id] = {}
|
929
|
+
campaign_idmap[campaign_id][eid] = r.to_dict()
|
930
|
+
|
931
|
+
# Check for duplicate enzyme_ids within campaigns
|
932
|
+
from collections import defaultdict, Counter
|
933
|
+
campaign_enzyme_counts = defaultdict(list)
|
934
|
+
for _, r in df.iterrows():
|
935
|
+
eid = str(r["enzyme_id"])
|
936
|
+
campaign_id = str(r.get("campaign_id", "default"))
|
937
|
+
campaign_enzyme_counts[campaign_id].append(eid)
|
938
|
+
|
939
|
+
total_duplicates = 0
|
940
|
+
for campaign_id, enzyme_ids in campaign_enzyme_counts.items():
|
711
941
|
id_counts = Counter(enzyme_ids)
|
712
942
|
duplicates = {k: v for k, v in id_counts.items() if v > 1}
|
713
|
-
|
714
|
-
|
943
|
+
if duplicates:
|
944
|
+
total_duplicates += sum(duplicates.values()) - len(duplicates)
|
945
|
+
log.warning(f"Campaign {campaign_id} has duplicate enzyme_ids: {duplicates}")
|
946
|
+
|
947
|
+
if total_duplicates > 0:
|
948
|
+
log.warning(f"Found {total_duplicates} duplicate enzyme_ids across campaigns")
|
949
|
+
log.info("All entries within each campaign will be preserved")
|
715
950
|
|
716
951
|
output_rows: List[Dict[str, str]] = []
|
717
952
|
skipped_count = 0
|
@@ -747,23 +982,58 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
747
982
|
smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
|
748
983
|
smiles_string = _canonical_smiles(smiles_string)
|
749
984
|
|
750
|
-
# Mutations
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
985
|
+
# Mutations - calculate based on generation 0 enzyme in same campaign --------
|
986
|
+
campaign_id = str(rec.row.get("campaign_id", "default"))
|
987
|
+
generation = str(rec.row.get("generation", "")).strip()
|
988
|
+
parent_id = rec.parent_id
|
989
|
+
|
990
|
+
# Find generation 0 enzyme in same campaign as reference (only for non-gen-0 enzymes)
|
991
|
+
reference_row = {}
|
992
|
+
if generation != "0":
|
993
|
+
for cid, cmap in campaign_idmap.items():
|
994
|
+
if cid == campaign_id:
|
995
|
+
for enzyme_id, enzyme_row in cmap.items():
|
996
|
+
enzyme_gen = str(enzyme_row.get("generation", "")).strip()
|
997
|
+
if enzyme_gen == "0" or enzyme_gen == "0.0":
|
998
|
+
reference_row = enzyme_row
|
999
|
+
log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
|
1000
|
+
break
|
1001
|
+
break
|
1002
|
+
if not reference_row:
|
1003
|
+
log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
|
764
1004
|
|
765
|
-
|
766
|
-
|
1005
|
+
reference_aa = ""
|
1006
|
+
reference_nt = ""
|
1007
|
+
if reference_row:
|
1008
|
+
reference_aa = (
|
1009
|
+
str(reference_row.get("protein_sequence", ""))
|
1010
|
+
or str(reference_row.get("aa_sequence", ""))
|
1011
|
+
)
|
1012
|
+
reference_nt = (
|
1013
|
+
str(reference_row.get("nucleotide_sequence", ""))
|
1014
|
+
or str(reference_row.get("nt_sequence", ""))
|
1015
|
+
)
|
1016
|
+
# If reference doesn't have NT sequence but has AA sequence, reverse translate
|
1017
|
+
if (not reference_nt or reference_nt == "nan") and reference_aa and reference_aa != "nan":
|
1018
|
+
reference_nt = _rev_translate(reference_aa)
|
1019
|
+
|
1020
|
+
# For generation 0 enzymes, don't calculate mutations (they are the reference)
|
1021
|
+
if generation == "0":
|
1022
|
+
aa_muts = ""
|
1023
|
+
nt_muts = ""
|
1024
|
+
log.info(f"Generation 0 enzyme {eid} - no mutations calculated (is reference)")
|
1025
|
+
else:
|
1026
|
+
# Debug sequence availability
|
1027
|
+
log.info(f"Mutation calc for {eid}: gen={generation}, has_ref_aa={bool(reference_aa and reference_aa != 'nan')}, has_rec_aa={bool(rec.aa_seq and rec.aa_seq != 'nan')}")
|
1028
|
+
|
1029
|
+
# Calculate mutations relative to generation 0 reference
|
1030
|
+
aa_muts = _aa_mut(reference_aa, rec.aa_seq) if rec.aa_seq and rec.aa_seq != "nan" and reference_aa and reference_aa != "nan" else ""
|
1031
|
+
nt_muts = _nt_mut(reference_aa, rec.aa_seq, reference_nt, rec.nt_seq) if (reference_aa and reference_aa != "nan") or (reference_nt and reference_nt != "nan") else ""
|
1032
|
+
|
1033
|
+
if aa_muts or nt_muts:
|
1034
|
+
log.info(f"Calculated mutations for {eid} relative to generation 0: AA={aa_muts}, NT={nt_muts}")
|
1035
|
+
else:
|
1036
|
+
log.warning(f"No mutations calculated for {eid} - ref_aa_len={len(reference_aa) if reference_aa else 0}, rec_aa_len={len(rec.aa_seq) if rec.aa_seq else 0}")
|
767
1037
|
|
768
1038
|
# Plate / well --------------------------------------------------------
|
769
1039
|
barcode_plate, plate_name, well = _plate_and_well(idx)
|
@@ -785,13 +1055,18 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
785
1055
|
cof_list = str(row.get("cofactor_list", "")).strip()
|
786
1056
|
cofactor = cof_iupac or cof_list
|
787
1057
|
|
1058
|
+
# Fitness type -------------------------------------------------------
|
1059
|
+
fitness_type = ""
|
1060
|
+
if rec.ttn_or_yield() is not None:
|
1061
|
+
ttn_val = row.get("ttn")
|
1062
|
+
fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
|
1063
|
+
|
788
1064
|
# Additional info -----------------------------------------------------
|
789
1065
|
extra: Dict[str, str] = {
|
790
1066
|
k: str(v) for k, v in row.items() if k not in INPUT_REQUIRED + OPTIONAL_INPUT
|
791
1067
|
}
|
792
|
-
|
793
|
-
|
794
|
-
extra["fitness_type"] = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
|
1068
|
+
# Don't include fitness_type in additional_information since it's now a separate column
|
1069
|
+
extra.pop("fitness_type", None)
|
795
1070
|
additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""
|
796
1071
|
|
797
1072
|
flat = FlatRow(
|
@@ -806,9 +1081,13 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
806
1081
|
nt_sequence=rec.nt_seq,
|
807
1082
|
aa_sequence=rec.aa_seq,
|
808
1083
|
fitness_value=rec.ttn_or_yield(),
|
1084
|
+
fitness_type=fitness_type,
|
809
1085
|
cofactor=cofactor,
|
810
1086
|
reaction_condition=reaction_condition,
|
811
1087
|
ee=str(row.get("ee", "")),
|
1088
|
+
campaign_id=campaign_id,
|
1089
|
+
generation=generation,
|
1090
|
+
parent_enzyme_id=parent_id,
|
812
1091
|
additional_information=additional_information,
|
813
1092
|
)
|
814
1093
|
output_rows.append(flat.as_dict())
|