debase 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.4.2"
@@ -645,11 +645,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
645
645
  came from which parent and what mutations were introduced).
646
646
 
647
647
  Respond with a JSON array of objects, each containing:
648
- - "location": the identifier (e.g. "Table S1", "Figure 2B", "p. 6")
648
+ - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
649
649
  - "type": one of "table", "figure", "text", "section"
650
650
  - "confidence": your confidence score (0-100) that this location contains lineage data
651
651
  - "reason": brief explanation of why this location likely contains lineage
652
652
 
653
+ IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
654
+
653
655
  Order by confidence score (highest first). Tables showing complete variant lineages or
654
656
  mutation lists should be ranked higher than figures showing complete variant lineages.
655
657
  Text sections are used when no suitable tables/figures exist.
@@ -747,7 +749,7 @@ def identify_campaigns(
747
749
  debug_dir: str | Path | None = None,
748
750
  ) -> List[Campaign]:
749
751
  """Identify distinct directed evolution campaigns in the manuscript."""
750
- prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text[:30_000])
752
+ prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
751
753
  campaigns_data: List[dict] = []
752
754
  try:
753
755
  campaigns_data = generate_json_with_retry(
@@ -825,7 +827,7 @@ def identify_evolution_locations(
825
827
 
826
828
  # Include TOC before the main text
827
829
  combined_text = toc_text + text if toc_text else text
828
- prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
830
+ prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
829
831
  locs: List[dict] = []
830
832
  try:
831
833
  locs = generate_json_with_retry(
@@ -1306,7 +1308,7 @@ def get_lineage(
1306
1308
  5. Return both variants and campaigns.
1307
1309
  """
1308
1310
  # First, identify campaigns in the manuscript
1309
- campaigns = identify_campaigns(full_text[:50_000], model, debug_dir=debug_dir)
1311
+ campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
1310
1312
 
1311
1313
  if campaigns:
1312
1314
  log.info(f"Identified {len(campaigns)} distinct campaigns")
@@ -1364,7 +1366,7 @@ def get_lineage(
1364
1366
  context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
1365
1367
  locations_with_context.append({
1366
1368
  'location': loc,
1367
- 'context': context_text[:1000] # First 1000 chars of extracted context
1369
+ 'context': context_text # Full extracted context
1368
1370
  })
1369
1371
 
1370
1372
  # For each campaign, ask Gemini to select the best location
@@ -1554,13 +1556,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
1554
1556
  Look for table of contents entries or section listings that mention sequences.
1555
1557
  Return a JSON array where each element has:
1556
1558
  - "section": the section heading or description
1557
- - "page": the page number shown in the table of contents for this section, to your best judgement.
1559
+ - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
1558
1560
 
1559
1561
  Focus on:
1560
1562
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
1561
- - Return the EXACT notation as shown.
1563
+ - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
1562
1564
  - Prioritize sections that mention "protein" or "amino acid" sequences
1563
1565
 
1566
+ CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
1567
+ - Correct: "53", "S12", "147"
1568
+ - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
1569
+
1564
1570
  Return [] if no sequence sections are found.
1565
1571
  Absolutely don't include nucleotides or primer sequences; it is better to return nothing than incomplete sequences — use your best judgement.
1566
1572
 
@@ -1572,7 +1578,7 @@ TEXT (truncated):
1572
1578
 
1573
1579
  def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
1574
1580
  """Ask Gemini for promising places to look for sequences."""
1575
- prompt = _SEQ_LOC_PROMPT.format(chunk=text[:15_000])
1581
+ prompt = _SEQ_LOC_PROMPT.format(chunk=text)
1576
1582
  try:
1577
1583
  locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
1578
1584
  return locs if isinstance(locs, list) else []
debase/lineage_format.py CHANGED
@@ -52,6 +52,12 @@ try:
52
52
  except ImportError: # pragma: no cover
53
53
  RDKIT_OK = False
54
54
 
55
+ try:
56
+ import google.generativeai as genai # type: ignore
57
+ GEMINI_OK = True
58
+ except ImportError: # pragma: no cover
59
+ GEMINI_OK = False
60
+
55
61
  # Input columns that MUST be present ------------------------------------------------
56
62
  INPUT_REQUIRED: Tuple[str, ...] = (
57
63
  "enzyme_id",
@@ -106,9 +112,13 @@ OUTPUT_COLUMNS: Tuple[str, ...] = (
106
112
  "x_coordinate",
107
113
  "y_coordinate",
108
114
  "fitness_value",
115
+ "fitness_type",
109
116
  "cofactor",
110
117
  "reaction_condition",
111
118
  "ee",
119
+ "campaign_id",
120
+ "generation",
121
+ "parent_enzyme_id",
112
122
  "additional_information",
113
123
  )
114
124
 
@@ -130,6 +140,9 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
130
140
  # Local PubChem DB (optional) --------------------------------------------------------
131
141
  PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
132
142
 
143
+ # Gemini API configuration -----------------------------------------------------------
144
+ GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
145
+
133
146
  # Miscellaneous ----------------------------------------------------------------------
134
147
  WELL_ROWS: str = "ABCDEFGH" # 8 rows, 12 cols => 96 wells
135
148
 
@@ -231,9 +244,13 @@ class FlatRow:
231
244
  x_coordinate: str = ""
232
245
  y_coordinate: str = ""
233
246
  fitness_value: Optional[float] = None
247
+ fitness_type: str = ""
234
248
  cofactor: str = ""
235
249
  reaction_condition: str = ""
236
250
  ee: str = ""
251
+ campaign_id: str = ""
252
+ generation: str = ""
253
+ parent_enzyme_id: str = ""
237
254
  additional_information: str = ""
238
255
 
239
256
  def as_dict(self) -> Dict[str, str]:
@@ -253,9 +270,13 @@ class FlatRow:
253
270
  "x_coordinate": self.x_coordinate,
254
271
  "y_coordinate": self.y_coordinate,
255
272
  "fitness_value": self.fitness_value,
273
+ "fitness_type": self.fitness_type,
256
274
  "cofactor": self.cofactor,
257
275
  "reaction_condition": self.reaction_condition,
258
276
  "ee": self.ee,
277
+ "campaign_id": self.campaign_id,
278
+ "generation": self.generation,
279
+ "parent_enzyme_id": self.parent_enzyme_id,
259
280
  "additional_information": self.additional_information,
260
281
  }
261
282
  # Convert None to empty string for CSV friendliness
@@ -527,39 +548,224 @@ def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
527
548
  # === 7. FLATTENING CORE ============================================================
528
549
 
529
550
  def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
530
- """Fill missing sequences in substrate scope entries from lineage entries."""
531
- # Create lookup for sequences by enzyme_id
551
+ """Fill missing sequences in substrate scope entries from reaction data entries.
552
+
553
+ This function:
554
+ 1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
555
+ 2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
556
+ """
557
+ # Step 1: Clean up 3a data format
558
+ log.info("Cleaning up reaction data (3a) format...")
559
+
560
+ # Handle column aliasing for enzyme_id
561
+ if 'enzyme' in df.columns and 'enzyme_id' not in df.columns:
562
+ df['enzyme_id'] = df['enzyme']
563
+ log.info("Renamed 'enzyme' column to 'enzyme_id' in reaction data")
564
+
565
+ # Step 2: Create sequence lookup from cleaned 3a data
532
566
  seq_lookup = {}
533
567
 
534
- # First pass: collect all available sequences from lineage entries
535
- for _, row in df.iterrows():
536
- if row.get("data_type") == "lineage" or pd.notna(row.get("protein_sequence")) or pd.notna(row.get("aa_sequence")):
537
- eid = str(row["enzyme_id"])
538
- aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
539
- nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
540
- if aa_seq and aa_seq != "nan":
541
- seq_lookup[eid] = {
542
- "aa_sequence": aa_seq,
543
- "nt_sequence": nt_seq if nt_seq != "nan" else ""
544
- }
545
-
546
- # Second pass: fill missing sequences in substrate scope entries
568
+ # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
569
+ reaction_entries = df[df.get("data_type") == "lineage"]
570
+ log.info(f"Found {len(reaction_entries)} reaction data entries to extract sequences from")
571
+
572
+ for _, row in reaction_entries.iterrows():
573
+ eid = str(row["enzyme_id"])
574
+ campaign_id = str(row.get("campaign_id", "default"))
575
+
576
+ # Prioritize protein_sequence (from 3a) over aa_sequence (from lineage file)
577
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
578
+ nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", "")) or str(row.get("dna_seq", ""))
579
+
580
+ if aa_seq and aa_seq != "nan" and aa_seq != "":
581
+ # Use campaign_id + enzyme_id as composite key for exact matching
582
+ composite_key = f"{campaign_id}_{eid}"
583
+ seq_lookup[composite_key] = {
584
+ "aa_sequence": aa_seq,
585
+ "nt_sequence": nt_seq if nt_seq != "nan" else "",
586
+ "campaign_id": campaign_id,
587
+ "enzyme_id": eid
588
+ }
589
+
590
+ # Also keep simple enzyme_id lookup as fallback
591
+ seq_lookup[eid] = {
592
+ "aa_sequence": aa_seq,
593
+ "nt_sequence": nt_seq if nt_seq != "nan" else "",
594
+ "campaign_id": campaign_id,
595
+ "enzyme_id": eid
596
+ }
597
+
598
+ log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
599
+
600
+ # Step 3: Fill missing sequences in substrate scope entries (3b)
601
+ substrate_entries = df[df.get("data_type") == "substrate_scope"]
602
+ log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
603
+
547
604
  filled_count = 0
548
605
  for idx, row in df.iterrows():
606
+ if row.get("data_type") != "substrate_scope":
607
+ continue
608
+
549
609
  eid = str(row["enzyme_id"])
610
+ campaign_id = str(row.get("campaign_id", "default"))
550
611
 
551
612
  # Check if this row needs sequence filling
552
613
  aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
553
- if (not aa_seq or aa_seq == "nan") and eid in seq_lookup:
554
- df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
555
- df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
556
- if seq_lookup[eid]["nt_sequence"]:
557
- df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
558
- df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
559
- filled_count += 1
614
+ if not aa_seq or aa_seq == "nan" or aa_seq == "":
615
+ # Try campaign-specific lookup first (most precise match)
616
+ composite_key = f"{campaign_id}_{eid}"
617
+ if composite_key in seq_lookup:
618
+ df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
619
+ df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
620
+ if seq_lookup[composite_key]["nt_sequence"]:
621
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
622
+ df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
623
+ filled_count += 1
624
+ log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
625
+
626
+ # Fallback to enzyme_id only lookup
627
+ elif eid in seq_lookup:
628
+ df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
629
+ df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
630
+ if seq_lookup[eid]["nt_sequence"]:
631
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
632
+ df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
633
+ filled_count += 1
634
+ log.debug(f"Filled sequence for {eid} (fallback lookup)")
635
+
636
+ else:
637
+ log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")
560
638
 
561
639
  if filled_count > 0:
562
- log.info(f"Filled sequences for {filled_count} entries")
640
+ log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
641
+
642
+ return df
643
+
644
+
645
+ def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
646
+ """Use Gemini API to identify parent enzymes for entries with missing parent information."""
647
+ if not GEMINI_OK:
648
+ log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
649
+ return df
650
+
651
+ if not GEMINI_API_KEY:
652
+ log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
653
+ return df
654
+
655
+ try:
656
+ genai.configure(api_key=GEMINI_API_KEY)
657
+ model = genai.GenerativeModel('gemini-1.5-flash')
658
+ except Exception as e:
659
+ log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
660
+ return df
661
+
662
+ # Find entries with empty sequences but missing parent information
663
+ entries_needing_parents = []
664
+ for idx, row in df.iterrows():
665
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
666
+ nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
667
+ parent_id = str(row.get("parent_enzyme_id", "")).strip()
668
+
669
+ # Only process entries that have empty sequences AND no parent info
670
+ if (not aa_seq or aa_seq == "nan" or aa_seq == "") and (not nt_seq or nt_seq == "nan" or nt_seq == "") and (not parent_id or parent_id == "nan"):
671
+ enzyme_id = str(row.get("enzyme_id", ""))
672
+ campaign_id = str(row.get("campaign_id", ""))
673
+ generation = str(row.get("generation", ""))
674
+
675
+ entries_needing_parents.append({
676
+ "idx": idx,
677
+ "enzyme_id": enzyme_id,
678
+ "campaign_id": campaign_id,
679
+ "generation": generation
680
+ })
681
+
682
+ if not entries_needing_parents:
683
+ log.info("No entries need parent identification from Gemini")
684
+ return df
685
+
686
+ log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
687
+
688
+ # Create a lookup of all available enzyme IDs for context
689
+ available_enzymes = {}
690
+ for idx, row in df.iterrows():
691
+ enzyme_id = str(row.get("enzyme_id", ""))
692
+ campaign_id = str(row.get("campaign_id", ""))
693
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
694
+ generation = str(row.get("generation", ""))
695
+
696
+ if enzyme_id and enzyme_id != "nan":
697
+ available_enzymes[enzyme_id] = {
698
+ "campaign_id": campaign_id,
699
+ "has_sequence": bool(aa_seq and aa_seq != "nan" and aa_seq != ""),
700
+ "generation": generation
701
+ }
702
+
703
+ identified_count = 0
704
+ for entry in entries_needing_parents:
705
+ enzyme_id = entry["enzyme_id"]
706
+ campaign_id = entry["campaign_id"]
707
+ generation = entry["generation"]
708
+
709
+ # Create context for Gemini
710
+ context_info = []
711
+ context_info.append(f"Enzyme ID: {enzyme_id}")
712
+ context_info.append(f"Campaign ID: {campaign_id}")
713
+ if generation:
714
+ context_info.append(f"Generation: {generation}")
715
+
716
+ # Add available enzymes from the same campaign for context
717
+ campaign_enzymes = []
718
+ for enz_id, enz_data in available_enzymes.items():
719
+ if enz_data["campaign_id"] == campaign_id:
720
+ status = "with sequence" if enz_data["has_sequence"] else "without sequence"
721
+ gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
722
+ campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")
723
+
724
+ if campaign_enzymes:
725
+ context_info.append("Available enzymes in same campaign:")
726
+ context_info.extend(campaign_enzymes[:10]) # Limit to first 10 for context
727
+
728
+ context_text = "\n".join(context_info)
729
+
730
+ prompt = f"""
731
+ Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
732
+
733
+ {context_text}
734
+
735
+ This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
736
+
737
+ Please provide your response in this format:
738
+ Parent: [parent_enzyme_id or "Unknown"]
739
+
740
+ If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
741
+ """
742
+
743
+ try:
744
+ response = model.generate_content(prompt)
745
+ response_text = response.text.strip()
746
+
747
+ # Parse the response
748
+ parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
749
+
750
+ if parent_match:
751
+ parent = parent_match.group(1).strip()
752
+ if parent and parent != "Unknown" and parent != "No parent identified":
753
+ # Verify the parent exists in our available enzymes
754
+ if parent in available_enzymes:
755
+ df.at[entry["idx"], "parent_enzyme_id"] = parent
756
+ identified_count += 1
757
+ log.info(f"Identified parent for {enzyme_id}: {parent}")
758
+ else:
759
+ log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
760
+
761
+ except Exception as e:
762
+ log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
763
+ continue
764
+
765
+ if identified_count > 0:
766
+ log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
767
+ else:
768
+ log.info("No parent enzymes were identified using Gemini API")
563
769
 
564
770
  return df
565
771
 
@@ -574,7 +780,7 @@ def _plate_and_well(index: int) -> Tuple[int, str, str]:
574
780
  return plate_number, plate_name, well
575
781
 
576
782
 
577
- def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
783
+ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str], campaign_id: str = "default") -> str:
578
784
  """Get root enzyme id, falling back to generation 0 ancestor or self."""
579
785
  if eid in lineage_roots:
580
786
  return lineage_roots[eid]
@@ -582,7 +788,12 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
582
788
  seen: set[str] = set()
583
789
  while cur and cur not in seen:
584
790
  seen.add(cur)
791
+ # Try campaign-specific lookup first, then fall back to composite key
585
792
  row = idmap.get(cur, {})
793
+ if not row:
794
+ composite_key = f"{campaign_id}_{cur}"
795
+ row = idmap.get(composite_key, {})
796
+
586
797
  # Look for generation 0 as the root
587
798
  if str(row.get("generation", "")).strip() == "0":
588
799
  return cur
@@ -674,6 +885,12 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
674
885
  # Fill missing sequences in substrate scope entries from lineage data
675
886
  df = _fill_missing_sequences(df)
676
887
 
888
+ # Use Gemini API to identify parent enzymes for entries with missing sequences
889
+ df = _identify_parents_with_gemini(df)
890
+
891
+ # Fill sequences again after parent identification to propagate sequences from identified parents
892
+ df = _fill_missing_sequences(df)
893
+
677
894
  # 1. Generate lineage roots once -----------------------------------------
678
895
  lineage_roots = _generate_lineage_roots(df)
679
896
 
@@ -694,24 +911,42 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
694
911
  # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
695
912
 
696
913
  # 3. Flatten rows ---------------------------------------------------------
697
- # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
914
+ # Create idmap for parent lookups, using campaign_id + enzyme_id as composite key
698
915
  idmap = {}
916
+ campaign_idmap = {} # For within-campaign lookups
917
+
699
918
  for _, r in df.iterrows():
700
919
  eid = str(r["enzyme_id"])
701
- if eid in idmap:
702
- log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
703
- idmap[eid] = r.to_dict()
704
-
705
- # Check for duplicate enzyme_ids
706
- enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
707
- unique_ids = set(enzyme_ids)
708
- if len(enzyme_ids) != len(unique_ids):
709
- log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
710
- from collections import Counter
920
+ campaign_id = str(r.get("campaign_id", "default"))
921
+
922
+ # Use composite key for global idmap
923
+ composite_key = f"{campaign_id}_{eid}"
924
+ idmap[composite_key] = r.to_dict()
925
+
926
+ # Also maintain campaign-specific idmap for parent lookups
927
+ if campaign_id not in campaign_idmap:
928
+ campaign_idmap[campaign_id] = {}
929
+ campaign_idmap[campaign_id][eid] = r.to_dict()
930
+
931
+ # Check for duplicate enzyme_ids within campaigns
932
+ from collections import defaultdict, Counter
933
+ campaign_enzyme_counts = defaultdict(list)
934
+ for _, r in df.iterrows():
935
+ eid = str(r["enzyme_id"])
936
+ campaign_id = str(r.get("campaign_id", "default"))
937
+ campaign_enzyme_counts[campaign_id].append(eid)
938
+
939
+ total_duplicates = 0
940
+ for campaign_id, enzyme_ids in campaign_enzyme_counts.items():
711
941
  id_counts = Counter(enzyme_ids)
712
942
  duplicates = {k: v for k, v in id_counts.items() if v > 1}
713
- log.warning(f"Duplicate enzyme_ids: {duplicates}")
714
- log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
943
+ if duplicates:
944
+ total_duplicates += sum(duplicates.values()) - len(duplicates)
945
+ log.warning(f"Campaign {campaign_id} has duplicate enzyme_ids: {duplicates}")
946
+
947
+ if total_duplicates > 0:
948
+ log.warning(f"Found {total_duplicates} duplicate enzyme_ids across campaigns")
949
+ log.info("All entries within each campaign will be preserved")
715
950
 
716
951
  output_rows: List[Dict[str, str]] = []
717
952
  skipped_count = 0
@@ -747,23 +982,58 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
747
982
  smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
748
983
  smiles_string = _canonical_smiles(smiles_string)
749
984
 
750
- # Mutations -----------------------------------------------------------
751
- root_id = _root_enzyme_id(eid, idmap, lineage_roots)
752
- root_row = idmap[root_id]
753
- root_aa = (
754
- str(root_row.get("protein_sequence", ""))
755
- or str(root_row.get("aa_sequence", ""))
756
- )
757
- root_nt = (
758
- str(root_row.get("nucleotide_sequence", ""))
759
- or str(root_row.get("nt_sequence", ""))
760
- )
761
- # If root doesn't have NT sequence but has AA sequence, reverse translate
762
- if (not root_nt or root_nt == "nan") and root_aa:
763
- root_nt = _rev_translate(root_aa)
985
+ # Mutations - calculate based on generation 0 enzyme in same campaign --------
986
+ campaign_id = str(rec.row.get("campaign_id", "default"))
987
+ generation = str(rec.row.get("generation", "")).strip()
988
+ parent_id = rec.parent_id
989
+
990
+ # Find generation 0 enzyme in same campaign as reference (only for non-gen-0 enzymes)
991
+ reference_row = {}
992
+ if generation != "0":
993
+ for cid, cmap in campaign_idmap.items():
994
+ if cid == campaign_id:
995
+ for enzyme_id, enzyme_row in cmap.items():
996
+ enzyme_gen = str(enzyme_row.get("generation", "")).strip()
997
+ if enzyme_gen == "0" or enzyme_gen == "0.0":
998
+ reference_row = enzyme_row
999
+ log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
1000
+ break
1001
+ break
1002
+ if not reference_row:
1003
+ log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
764
1004
 
765
- aa_muts = _aa_mut(root_aa, rec.aa_seq) if rec.aa_seq and root_aa else ""
766
- nt_muts = _nt_mut(root_aa, rec.aa_seq, root_nt, rec.nt_seq) if root_aa or root_nt else ""
1005
+ reference_aa = ""
1006
+ reference_nt = ""
1007
+ if reference_row:
1008
+ reference_aa = (
1009
+ str(reference_row.get("protein_sequence", ""))
1010
+ or str(reference_row.get("aa_sequence", ""))
1011
+ )
1012
+ reference_nt = (
1013
+ str(reference_row.get("nucleotide_sequence", ""))
1014
+ or str(reference_row.get("nt_sequence", ""))
1015
+ )
1016
+ # If reference doesn't have NT sequence but has AA sequence, reverse translate
1017
+ if (not reference_nt or reference_nt == "nan") and reference_aa and reference_aa != "nan":
1018
+ reference_nt = _rev_translate(reference_aa)
1019
+
1020
+ # For generation 0 enzymes, don't calculate mutations (they are the reference)
1021
+ if generation == "0":
1022
+ aa_muts = ""
1023
+ nt_muts = ""
1024
+ log.info(f"Generation 0 enzyme {eid} - no mutations calculated (is reference)")
1025
+ else:
1026
+ # Debug sequence availability
1027
+ log.info(f"Mutation calc for {eid}: gen={generation}, has_ref_aa={bool(reference_aa and reference_aa != 'nan')}, has_rec_aa={bool(rec.aa_seq and rec.aa_seq != 'nan')}")
1028
+
1029
+ # Calculate mutations relative to generation 0 reference
1030
+ aa_muts = _aa_mut(reference_aa, rec.aa_seq) if rec.aa_seq and rec.aa_seq != "nan" and reference_aa and reference_aa != "nan" else ""
1031
+ nt_muts = _nt_mut(reference_aa, rec.aa_seq, reference_nt, rec.nt_seq) if (reference_aa and reference_aa != "nan") or (reference_nt and reference_nt != "nan") else ""
1032
+
1033
+ if aa_muts or nt_muts:
1034
+ log.info(f"Calculated mutations for {eid} relative to generation 0: AA={aa_muts}, NT={nt_muts}")
1035
+ else:
1036
+ log.warning(f"No mutations calculated for {eid} - ref_aa_len={len(reference_aa) if reference_aa else 0}, rec_aa_len={len(rec.aa_seq) if rec.aa_seq else 0}")
767
1037
 
768
1038
  # Plate / well --------------------------------------------------------
769
1039
  barcode_plate, plate_name, well = _plate_and_well(idx)
@@ -785,13 +1055,18 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
785
1055
  cof_list = str(row.get("cofactor_list", "")).strip()
786
1056
  cofactor = cof_iupac or cof_list
787
1057
 
1058
+ # Fitness type -------------------------------------------------------
1059
+ fitness_type = ""
1060
+ if rec.ttn_or_yield() is not None:
1061
+ ttn_val = row.get("ttn")
1062
+ fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
1063
+
788
1064
  # Additional info -----------------------------------------------------
789
1065
  extra: Dict[str, str] = {
790
1066
  k: str(v) for k, v in row.items() if k not in INPUT_REQUIRED + OPTIONAL_INPUT
791
1067
  }
792
- if rec.ttn_or_yield() is not None:
793
- ttn_val = row.get("ttn")
794
- extra["fitness_type"] = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
1068
+ # Don't include fitness_type in additional_information since it's now a separate column
1069
+ extra.pop("fitness_type", None)
795
1070
  additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""
796
1071
 
797
1072
  flat = FlatRow(
@@ -806,9 +1081,13 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
806
1081
  nt_sequence=rec.nt_seq,
807
1082
  aa_sequence=rec.aa_seq,
808
1083
  fitness_value=rec.ttn_or_yield(),
1084
+ fitness_type=fitness_type,
809
1085
  cofactor=cofactor,
810
1086
  reaction_condition=reaction_condition,
811
1087
  ee=str(row.get("ee", "")),
1088
+ campaign_id=campaign_id,
1089
+ generation=generation,
1090
+ parent_enzyme_id=parent_id,
812
1091
  additional_information=additional_information,
813
1092
  )
814
1093
  output_rows.append(flat.as_dict())