debase 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
debase/_version.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.4.2"
@@ -645,11 +645,13 @@ find the COMPLETE evolutionary lineage of enzyme variants (i.e. which variant
645
645
  came from which parent and what mutations were introduced).
646
646
 
647
647
  Respond with a JSON array of objects, each containing:
648
- - "location": the identifier (e.g. "Table S1", "Figure 2B", "p. 6")
648
+ - "location": the identifier (e.g. "Table S1", "Figure 2B", "6" for page 6, "S6" for supplementary page 6)
649
649
  - "type": one of "table", "figure", "text", "section"
650
650
  - "confidence": your confidence score (0-100) that this location contains lineage data
651
651
  - "reason": brief explanation of why this location likely contains lineage
652
652
 
653
+ IMPORTANT: For page numbers, use ONLY the number (e.g., "6" not "p. 6" or "page 6")
654
+
653
655
  Order by confidence score (highest first). Tables showing complete variant lineages or
654
656
  mutation lists should be ranked higher than figures showing complete variant lineages.
655
657
  Text sections are used when no suitable tables/figures exist.
@@ -747,7 +749,7 @@ def identify_campaigns(
747
749
  debug_dir: str | Path | None = None,
748
750
  ) -> List[Campaign]:
749
751
  """Identify distinct directed evolution campaigns in the manuscript."""
750
- prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text[:30_000])
752
+ prompt = _CAMPAIGN_IDENTIFICATION_PROMPT.format(text=text)
751
753
  campaigns_data: List[dict] = []
752
754
  try:
753
755
  campaigns_data = generate_json_with_retry(
@@ -825,7 +827,7 @@ def identify_evolution_locations(
825
827
 
826
828
  # Include TOC before the main text
827
829
  combined_text = toc_text + text if toc_text else text
828
- prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text[:15_000]
830
+ prompt = _LINEAGE_LOC_PROMPT.format(max_results=max_results) + "\n\nTEXT:\n" + combined_text
829
831
  locs: List[dict] = []
830
832
  try:
831
833
  locs = generate_json_with_retry(
@@ -1306,7 +1308,7 @@ def get_lineage(
1306
1308
  5. Return both variants and campaigns.
1307
1309
  """
1308
1310
  # First, identify campaigns in the manuscript
1309
- campaigns = identify_campaigns(full_text[:50_000], model, debug_dir=debug_dir)
1311
+ campaigns = identify_campaigns(full_text, model, debug_dir=debug_dir)
1310
1312
 
1311
1313
  if campaigns:
1312
1314
  log.info(f"Identified {len(campaigns)} distinct campaigns")
@@ -1364,7 +1366,7 @@ def get_lineage(
1364
1366
  context_text = _extract_text_at_locations(full_text, [loc], context_chars=500)
1365
1367
  locations_with_context.append({
1366
1368
  'location': loc,
1367
- 'context': context_text[:1000] # First 1000 chars of extracted context
1369
+ 'context': context_text # Full extracted context
1368
1370
  })
1369
1371
 
1370
1372
  # For each campaign, ask Gemini to select the best location
@@ -1554,13 +1556,17 @@ PRIORITY: Protein/amino acid sequences are preferred over DNA sequences.
1554
1556
  Look for table of contents entries or section listings that mention sequences.
1555
1557
  Return a JSON array where each element has:
1556
1558
  - "section": the section heading or description
1557
- - "page": the page number shown in the table of contents for this section, to your best judgement.
1559
+ - "page": the page number (IMPORTANT: Return ONLY the number, e.g., "53" not "p. 53" or "page 53")
1558
1560
 
1559
1561
  Focus on:
1560
1562
  - Table of contents or entries about "Sequence Information" or "Nucleotide and amino acid sequences"
1561
- - Return the EXACT notation as shown.
1563
+ - For supplementary pages, use "S" prefix (e.g., "S53" not "p. S53")
1562
1564
  - Prioritize sections that mention "protein" or "amino acid" sequences
1563
1565
 
1566
+ CRITICAL: Page numbers must be returned as plain numbers or S-prefixed numbers only:
1567
+ - Correct: "53", "S12", "147"
1568
+ - Wrong: "p. 53", "P. 53", "page 53", "pg 53"
1569
+
1564
1570
  Return [] if no sequence sections are found.
1565
1571
  Absolutely don't include nucleotides or primer sequences; it is better to return nothing than incomplete sequences — use your best judgement.
1566
1572
 
@@ -1572,7 +1578,7 @@ TEXT (truncated):
1572
1578
 
1573
1579
  def identify_sequence_locations(text: str, model, *, debug_dir: str | Path | None = None) -> list[dict]:
1574
1580
  """Ask Gemini for promising places to look for sequences."""
1575
- prompt = _SEQ_LOC_PROMPT.format(chunk=text[:15_000])
1581
+ prompt = _SEQ_LOC_PROMPT.format(chunk=text)
1576
1582
  try:
1577
1583
  locs = generate_json_with_retry(model, prompt, debug_dir=debug_dir, tag="seq_locations")
1578
1584
  return locs if isinstance(locs, list) else []
debase/lineage_format.py CHANGED
@@ -52,6 +52,12 @@ try:
52
52
  except ImportError: # pragma: no cover
53
53
  RDKIT_OK = False
54
54
 
55
+ try:
56
+ import google.generativeai as genai # type: ignore
57
+ GEMINI_OK = True
58
+ except ImportError: # pragma: no cover
59
+ GEMINI_OK = False
60
+
55
61
  # Input columns that MUST be present ------------------------------------------------
56
62
  INPUT_REQUIRED: Tuple[str, ...] = (
57
63
  "enzyme_id",
@@ -106,9 +112,13 @@ OUTPUT_COLUMNS: Tuple[str, ...] = (
106
112
  "x_coordinate",
107
113
  "y_coordinate",
108
114
  "fitness_value",
115
+ "fitness_type",
109
116
  "cofactor",
110
117
  "reaction_condition",
111
118
  "ee",
119
+ "campaign_id",
120
+ "generation",
121
+ "parent_enzyme_id",
112
122
  "additional_information",
113
123
  )
114
124
 
@@ -130,6 +140,9 @@ CACHE_DIR.mkdir(parents=True, exist_ok=True)
130
140
  # Local PubChem DB (optional) --------------------------------------------------------
131
141
  PUBCHEM_DB_PATH: Path = Path(__file__).parent.parent.parent / "data" / "iupac2smiles.db"
132
142
 
143
+ # Gemini API configuration -----------------------------------------------------------
144
+ GEMINI_API_KEY: str = os.environ.get("GEMINI_API_KEY", "")
145
+
133
146
  # Miscellaneous ----------------------------------------------------------------------
134
147
  WELL_ROWS: str = "ABCDEFGH" # 8 rows, 12 cols => 96 wells
135
148
 
@@ -231,9 +244,13 @@ class FlatRow:
231
244
  x_coordinate: str = ""
232
245
  y_coordinate: str = ""
233
246
  fitness_value: Optional[float] = None
247
+ fitness_type: str = ""
234
248
  cofactor: str = ""
235
249
  reaction_condition: str = ""
236
250
  ee: str = ""
251
+ campaign_id: str = ""
252
+ generation: str = ""
253
+ parent_enzyme_id: str = ""
237
254
  additional_information: str = ""
238
255
 
239
256
  def as_dict(self) -> Dict[str, str]:
@@ -253,9 +270,13 @@ class FlatRow:
253
270
  "x_coordinate": self.x_coordinate,
254
271
  "y_coordinate": self.y_coordinate,
255
272
  "fitness_value": self.fitness_value,
273
+ "fitness_type": self.fitness_type,
256
274
  "cofactor": self.cofactor,
257
275
  "reaction_condition": self.reaction_condition,
258
276
  "ee": self.ee,
277
+ "campaign_id": self.campaign_id,
278
+ "generation": self.generation,
279
+ "parent_enzyme_id": self.parent_enzyme_id,
259
280
  "additional_information": self.additional_information,
260
281
  }
261
282
  # Convert None to empty string for CSV friendliness
@@ -527,39 +548,224 @@ def _batch_convert(names: Sequence[str], is_substrate: bool) -> Dict[str, str]:
527
548
  # === 7. FLATTENING CORE ============================================================
528
549
 
529
550
  def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
530
- """Fill missing sequences in substrate scope entries from lineage entries."""
531
- # Create lookup for sequences by enzyme_id
551
+ """Fill missing sequences in substrate scope entries from reaction data entries.
552
+
553
+ This function:
554
+ 1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
555
+ 2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
556
+ """
557
+ # Step 1: Clean up 3a data format
558
+ log.info("Cleaning up reaction data (3a) format...")
559
+
560
+ # Handle column aliasing for enzyme_id
561
+ if 'enzyme' in df.columns and 'enzyme_id' not in df.columns:
562
+ df['enzyme_id'] = df['enzyme']
563
+ log.info("Renamed 'enzyme' column to 'enzyme_id' in reaction data")
564
+
565
+ # Step 2: Create sequence lookup from cleaned 3a data
532
566
  seq_lookup = {}
533
567
 
534
- # First pass: collect all available sequences from lineage entries
535
- for _, row in df.iterrows():
536
- if row.get("data_type") == "lineage" or pd.notna(row.get("protein_sequence")) or pd.notna(row.get("aa_sequence")):
537
- eid = str(row["enzyme_id"])
538
- aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
539
- nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
540
- if aa_seq and aa_seq != "nan":
541
- seq_lookup[eid] = {
542
- "aa_sequence": aa_seq,
543
- "nt_sequence": nt_seq if nt_seq != "nan" else ""
544
- }
545
-
546
- # Second pass: fill missing sequences in substrate scope entries
568
+ # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
569
+ reaction_entries = df[df.get("data_type") == "lineage"]
570
+ log.info(f"Found {len(reaction_entries)} reaction data entries to extract sequences from")
571
+
572
+ for _, row in reaction_entries.iterrows():
573
+ eid = str(row["enzyme_id"])
574
+ campaign_id = str(row.get("campaign_id", "default"))
575
+
576
+ # Prioritize protein_sequence (from 3a) over aa_sequence (from lineage file)
577
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
578
+ nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", "")) or str(row.get("dna_seq", ""))
579
+
580
+ if aa_seq and aa_seq != "nan" and aa_seq != "":
581
+ # Use campaign_id + enzyme_id as composite key for exact matching
582
+ composite_key = f"{campaign_id}_{eid}"
583
+ seq_lookup[composite_key] = {
584
+ "aa_sequence": aa_seq,
585
+ "nt_sequence": nt_seq if nt_seq != "nan" else "",
586
+ "campaign_id": campaign_id,
587
+ "enzyme_id": eid
588
+ }
589
+
590
+ # Also keep simple enzyme_id lookup as fallback
591
+ seq_lookup[eid] = {
592
+ "aa_sequence": aa_seq,
593
+ "nt_sequence": nt_seq if nt_seq != "nan" else "",
594
+ "campaign_id": campaign_id,
595
+ "enzyme_id": eid
596
+ }
597
+
598
+ log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
599
+
600
+ # Step 3: Fill missing sequences in substrate scope entries (3b)
601
+ substrate_entries = df[df.get("data_type") == "substrate_scope"]
602
+ log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
603
+
547
604
  filled_count = 0
548
605
  for idx, row in df.iterrows():
606
+ if row.get("data_type") != "substrate_scope":
607
+ continue
608
+
549
609
  eid = str(row["enzyme_id"])
610
+ campaign_id = str(row.get("campaign_id", "default"))
550
611
 
551
612
  # Check if this row needs sequence filling
552
613
  aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
553
- if (not aa_seq or aa_seq == "nan") and eid in seq_lookup:
554
- df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
555
- df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
556
- if seq_lookup[eid]["nt_sequence"]:
557
- df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
558
- df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
559
- filled_count += 1
614
+ if not aa_seq or aa_seq == "nan" or aa_seq == "":
615
+ # Try campaign-specific lookup first (most precise match)
616
+ composite_key = f"{campaign_id}_{eid}"
617
+ if composite_key in seq_lookup:
618
+ df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
619
+ df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
620
+ if seq_lookup[composite_key]["nt_sequence"]:
621
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
622
+ df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
623
+ filled_count += 1
624
+ log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
625
+
626
+ # Fallback to enzyme_id only lookup
627
+ elif eid in seq_lookup:
628
+ df.at[idx, "protein_sequence"] = seq_lookup[eid]["aa_sequence"]
629
+ df.at[idx, "aa_sequence"] = seq_lookup[eid]["aa_sequence"]
630
+ if seq_lookup[eid]["nt_sequence"]:
631
+ df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
632
+ df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
633
+ filled_count += 1
634
+ log.debug(f"Filled sequence for {eid} (fallback lookup)")
635
+
636
+ else:
637
+ log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")
560
638
 
561
639
  if filled_count > 0:
562
- log.info(f"Filled sequences for {filled_count} entries")
640
+ log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
641
+
642
+ return df
643
+
644
+
645
+ def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
646
+ """Use Gemini API to identify parent enzymes for entries with missing parent information."""
647
+ if not GEMINI_OK:
648
+ log.warning("Gemini API not available (missing google.generativeai). Skipping parent identification.")
649
+ return df
650
+
651
+ if not GEMINI_API_KEY:
652
+ log.warning("GEMINI_API_KEY not set. Skipping parent identification.")
653
+ return df
654
+
655
+ try:
656
+ genai.configure(api_key=GEMINI_API_KEY)
657
+ model = genai.GenerativeModel('gemini-1.5-flash')
658
+ except Exception as e:
659
+ log.warning(f"Failed to configure Gemini API: {e}. Skipping parent identification.")
660
+ return df
661
+
662
+ # Find entries with empty sequences but missing parent information
663
+ entries_needing_parents = []
664
+ for idx, row in df.iterrows():
665
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
666
+ nt_seq = str(row.get("nucleotide_sequence", "")) or str(row.get("nt_sequence", ""))
667
+ parent_id = str(row.get("parent_enzyme_id", "")).strip()
668
+
669
+ # Only process entries that have empty sequences AND no parent info
670
+ if (not aa_seq or aa_seq == "nan" or aa_seq == "") and (not nt_seq or nt_seq == "nan" or nt_seq == "") and (not parent_id or parent_id == "nan"):
671
+ enzyme_id = str(row.get("enzyme_id", ""))
672
+ campaign_id = str(row.get("campaign_id", ""))
673
+ generation = str(row.get("generation", ""))
674
+
675
+ entries_needing_parents.append({
676
+ "idx": idx,
677
+ "enzyme_id": enzyme_id,
678
+ "campaign_id": campaign_id,
679
+ "generation": generation
680
+ })
681
+
682
+ if not entries_needing_parents:
683
+ log.info("No entries need parent identification from Gemini")
684
+ return df
685
+
686
+ log.info(f"Found {len(entries_needing_parents)} entries needing parent identification. Querying Gemini...")
687
+
688
+ # Create a lookup of all available enzyme IDs for context
689
+ available_enzymes = {}
690
+ for idx, row in df.iterrows():
691
+ enzyme_id = str(row.get("enzyme_id", ""))
692
+ campaign_id = str(row.get("campaign_id", ""))
693
+ aa_seq = str(row.get("protein_sequence", "")) or str(row.get("aa_sequence", ""))
694
+ generation = str(row.get("generation", ""))
695
+
696
+ if enzyme_id and enzyme_id != "nan":
697
+ available_enzymes[enzyme_id] = {
698
+ "campaign_id": campaign_id,
699
+ "has_sequence": bool(aa_seq and aa_seq != "nan" and aa_seq != ""),
700
+ "generation": generation
701
+ }
702
+
703
+ identified_count = 0
704
+ for entry in entries_needing_parents:
705
+ enzyme_id = entry["enzyme_id"]
706
+ campaign_id = entry["campaign_id"]
707
+ generation = entry["generation"]
708
+
709
+ # Create context for Gemini
710
+ context_info = []
711
+ context_info.append(f"Enzyme ID: {enzyme_id}")
712
+ context_info.append(f"Campaign ID: {campaign_id}")
713
+ if generation:
714
+ context_info.append(f"Generation: {generation}")
715
+
716
+ # Add available enzymes from the same campaign for context
717
+ campaign_enzymes = []
718
+ for enz_id, enz_data in available_enzymes.items():
719
+ if enz_data["campaign_id"] == campaign_id:
720
+ status = "with sequence" if enz_data["has_sequence"] else "without sequence"
721
+ gen_info = f"(gen {enz_data['generation']})" if enz_data["generation"] else ""
722
+ campaign_enzymes.append(f" - {enz_id} {status} {gen_info}")
723
+
724
+ if campaign_enzymes:
725
+ context_info.append("Available enzymes in same campaign:")
726
+ context_info.extend(campaign_enzymes[:10]) # Limit to first 10 for context
727
+
728
+ context_text = "\n".join(context_info)
729
+
730
+ prompt = f"""
731
+ Based on the enzyme information provided, can you identify the parent enzyme for this enzyme?
732
+
733
+ {context_text}
734
+
735
+ This enzyme currently has no sequence data and no parent information. Based on the enzyme ID and the available enzymes in the same campaign, can you identify which enzyme is likely the parent?
736
+
737
+ Please provide your response in this format:
738
+ Parent: [parent_enzyme_id or "Unknown"]
739
+
740
+ If you cannot identify a parent enzyme, just respond with "Parent: Unknown".
741
+ """
742
+
743
+ try:
744
+ response = model.generate_content(prompt)
745
+ response_text = response.text.strip()
746
+
747
+ # Parse the response
748
+ parent_match = re.search(r'Parent:\s*([^\n]+)', response_text)
749
+
750
+ if parent_match:
751
+ parent = parent_match.group(1).strip()
752
+ if parent and parent != "Unknown" and parent != "No parent identified":
753
+ # Verify the parent exists in our available enzymes
754
+ if parent in available_enzymes:
755
+ df.at[entry["idx"], "parent_enzyme_id"] = parent
756
+ identified_count += 1
757
+ log.info(f"Identified parent for {enzyme_id}: {parent}")
758
+ else:
759
+ log.warning(f"Gemini suggested parent {parent} for {enzyme_id}, but it's not in available enzymes")
760
+
761
+ except Exception as e:
762
+ log.warning(f"Failed to identify parent for {enzyme_id} from Gemini: {e}")
763
+ continue
764
+
765
+ if identified_count > 0:
766
+ log.info(f"Successfully identified {identified_count} parent enzymes using Gemini API")
767
+ else:
768
+ log.info("No parent enzymes were identified using Gemini API")
563
769
 
564
770
  return df
565
771
 
@@ -574,7 +780,7 @@ def _plate_and_well(index: int) -> Tuple[int, str, str]:
574
780
  return plate_number, plate_name, well
575
781
 
576
782
 
577
- def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str]) -> str:
783
+ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: Dict[str, str], campaign_id: str = "default") -> str:
578
784
  """Get root enzyme id, falling back to generation 0 ancestor or self."""
579
785
  if eid in lineage_roots:
580
786
  return lineage_roots[eid]
@@ -582,7 +788,12 @@ def _root_enzyme_id(eid: str, idmap: Dict[str, Dict[str, str]], lineage_roots: D
582
788
  seen: set[str] = set()
583
789
  while cur and cur not in seen:
584
790
  seen.add(cur)
791
+ # Try campaign-specific lookup first, then fall back to composite key
585
792
  row = idmap.get(cur, {})
793
+ if not row:
794
+ composite_key = f"{campaign_id}_{cur}"
795
+ row = idmap.get(composite_key, {})
796
+
586
797
  # Look for generation 0 as the root
587
798
  if str(row.get("generation", "")).strip() == "0":
588
799
  return cur
@@ -674,6 +885,12 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
674
885
  # Fill missing sequences in substrate scope entries from lineage data
675
886
  df = _fill_missing_sequences(df)
676
887
 
888
+ # Use Gemini API to identify parent enzymes for entries with missing sequences
889
+ df = _identify_parents_with_gemini(df)
890
+
891
+ # Fill sequences again after parent identification to propagate sequences from identified parents
892
+ df = _fill_missing_sequences(df)
893
+
677
894
  # 1. Generate lineage roots once -----------------------------------------
678
895
  lineage_roots = _generate_lineage_roots(df)
679
896
 
@@ -694,24 +911,42 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
694
911
  # _save_pickle(SUBSTRATE_CACHE, SUBSTRATE_CACHE_FILE)
695
912
 
696
913
  # 3. Flatten rows ---------------------------------------------------------
697
- # Create idmap for parent lookups, but note this will only keep last occurrence of duplicates
914
+ # Create idmap for parent lookups, using campaign_id + enzyme_id as composite key
698
915
  idmap = {}
916
+ campaign_idmap = {} # For within-campaign lookups
917
+
699
918
  for _, r in df.iterrows():
700
919
  eid = str(r["enzyme_id"])
701
- if eid in idmap:
702
- log.debug(f"Overwriting duplicate enzyme_id in idmap: {eid}")
703
- idmap[eid] = r.to_dict()
704
-
705
- # Check for duplicate enzyme_ids
706
- enzyme_ids = [str(r["enzyme_id"]) for _, r in df.iterrows()]
707
- unique_ids = set(enzyme_ids)
708
- if len(enzyme_ids) != len(unique_ids):
709
- log.warning(f"Found duplicate enzyme_ids! Total: {len(enzyme_ids)}, Unique: {len(unique_ids)}")
710
- from collections import Counter
920
+ campaign_id = str(r.get("campaign_id", "default"))
921
+
922
+ # Use composite key for global idmap
923
+ composite_key = f"{campaign_id}_{eid}"
924
+ idmap[composite_key] = r.to_dict()
925
+
926
+ # Also maintain campaign-specific idmap for parent lookups
927
+ if campaign_id not in campaign_idmap:
928
+ campaign_idmap[campaign_id] = {}
929
+ campaign_idmap[campaign_id][eid] = r.to_dict()
930
+
931
+ # Check for duplicate enzyme_ids within campaigns
932
+ from collections import defaultdict, Counter
933
+ campaign_enzyme_counts = defaultdict(list)
934
+ for _, r in df.iterrows():
935
+ eid = str(r["enzyme_id"])
936
+ campaign_id = str(r.get("campaign_id", "default"))
937
+ campaign_enzyme_counts[campaign_id].append(eid)
938
+
939
+ total_duplicates = 0
940
+ for campaign_id, enzyme_ids in campaign_enzyme_counts.items():
711
941
  id_counts = Counter(enzyme_ids)
712
942
  duplicates = {k: v for k, v in id_counts.items() if v > 1}
713
- log.warning(f"Duplicate enzyme_ids: {duplicates}")
714
- log.info("Note: All rows will still be processed, but parent lookups may use the last occurrence of duplicate IDs")
943
+ if duplicates:
944
+ total_duplicates += sum(duplicates.values()) - len(duplicates)
945
+ log.warning(f"Campaign {campaign_id} has duplicate enzyme_ids: {duplicates}")
946
+
947
+ if total_duplicates > 0:
948
+ log.warning(f"Found {total_duplicates} duplicate enzyme_ids across campaigns")
949
+ log.info("All entries within each campaign will be preserved")
715
950
 
716
951
  output_rows: List[Dict[str, str]] = []
717
952
  skipped_count = 0
@@ -747,23 +982,58 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
747
982
  smiles_reaction = ".".join(sub_smiles) + " >> " + ".".join(prod_smiles)
748
983
  smiles_string = _canonical_smiles(smiles_string)
749
984
 
750
- # Mutations -----------------------------------------------------------
751
- root_id = _root_enzyme_id(eid, idmap, lineage_roots)
752
- root_row = idmap[root_id]
753
- root_aa = (
754
- str(root_row.get("protein_sequence", ""))
755
- or str(root_row.get("aa_sequence", ""))
756
- )
757
- root_nt = (
758
- str(root_row.get("nucleotide_sequence", ""))
759
- or str(root_row.get("nt_sequence", ""))
760
- )
761
- # If root doesn't have NT sequence but has AA sequence, reverse translate
762
- if (not root_nt or root_nt == "nan") and root_aa:
763
- root_nt = _rev_translate(root_aa)
985
+ # Mutations - calculate based on generation 0 enzyme in same campaign --------
986
+ campaign_id = str(rec.row.get("campaign_id", "default"))
987
+ generation = str(rec.row.get("generation", "")).strip()
988
+ parent_id = rec.parent_id
989
+
990
+ # Find generation 0 enzyme in same campaign as reference (only for non-gen-0 enzymes)
991
+ reference_row = {}
992
+ if generation != "0":
993
+ for cid, cmap in campaign_idmap.items():
994
+ if cid == campaign_id:
995
+ for enzyme_id, enzyme_row in cmap.items():
996
+ enzyme_gen = str(enzyme_row.get("generation", "")).strip()
997
+ if enzyme_gen == "0" or enzyme_gen == "0.0":
998
+ reference_row = enzyme_row
999
+ log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
1000
+ break
1001
+ break
1002
+ if not reference_row:
1003
+ log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
764
1004
 
765
- aa_muts = _aa_mut(root_aa, rec.aa_seq) if rec.aa_seq and root_aa else ""
766
- nt_muts = _nt_mut(root_aa, rec.aa_seq, root_nt, rec.nt_seq) if root_aa or root_nt else ""
1005
+ reference_aa = ""
1006
+ reference_nt = ""
1007
+ if reference_row:
1008
+ reference_aa = (
1009
+ str(reference_row.get("protein_sequence", ""))
1010
+ or str(reference_row.get("aa_sequence", ""))
1011
+ )
1012
+ reference_nt = (
1013
+ str(reference_row.get("nucleotide_sequence", ""))
1014
+ or str(reference_row.get("nt_sequence", ""))
1015
+ )
1016
+ # If reference doesn't have NT sequence but has AA sequence, reverse translate
1017
+ if (not reference_nt or reference_nt == "nan") and reference_aa and reference_aa != "nan":
1018
+ reference_nt = _rev_translate(reference_aa)
1019
+
1020
+ # For generation 0 enzymes, don't calculate mutations (they are the reference)
1021
+ if generation == "0":
1022
+ aa_muts = ""
1023
+ nt_muts = ""
1024
+ log.info(f"Generation 0 enzyme {eid} - no mutations calculated (is reference)")
1025
+ else:
1026
+ # Debug sequence availability
1027
+ log.info(f"Mutation calc for {eid}: gen={generation}, has_ref_aa={bool(reference_aa and reference_aa != 'nan')}, has_rec_aa={bool(rec.aa_seq and rec.aa_seq != 'nan')}")
1028
+
1029
+ # Calculate mutations relative to generation 0 reference
1030
+ aa_muts = _aa_mut(reference_aa, rec.aa_seq) if rec.aa_seq and rec.aa_seq != "nan" and reference_aa and reference_aa != "nan" else ""
1031
+ nt_muts = _nt_mut(reference_aa, rec.aa_seq, reference_nt, rec.nt_seq) if (reference_aa and reference_aa != "nan") or (reference_nt and reference_nt != "nan") else ""
1032
+
1033
+ if aa_muts or nt_muts:
1034
+ log.info(f"Calculated mutations for {eid} relative to generation 0: AA={aa_muts}, NT={nt_muts}")
1035
+ else:
1036
+ log.warning(f"No mutations calculated for {eid} - ref_aa_len={len(reference_aa) if reference_aa else 0}, rec_aa_len={len(rec.aa_seq) if rec.aa_seq else 0}")
767
1037
 
768
1038
  # Plate / well --------------------------------------------------------
769
1039
  barcode_plate, plate_name, well = _plate_and_well(idx)
@@ -785,13 +1055,18 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
785
1055
  cof_list = str(row.get("cofactor_list", "")).strip()
786
1056
  cofactor = cof_iupac or cof_list
787
1057
 
1058
+ # Fitness type -------------------------------------------------------
1059
+ fitness_type = ""
1060
+ if rec.ttn_or_yield() is not None:
1061
+ ttn_val = row.get("ttn")
1062
+ fitness_type = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
1063
+
788
1064
  # Additional info -----------------------------------------------------
789
1065
  extra: Dict[str, str] = {
790
1066
  k: str(v) for k, v in row.items() if k not in INPUT_REQUIRED + OPTIONAL_INPUT
791
1067
  }
792
- if rec.ttn_or_yield() is not None:
793
- ttn_val = row.get("ttn")
794
- extra["fitness_type"] = "ttn" if (ttn_val is not None and pd.notna(ttn_val)) else "yield"
1068
+ # Don't include fitness_type in additional_information since it's now a separate column
1069
+ extra.pop("fitness_type", None)
795
1070
  additional_information = json.dumps(extra, separators=(",", ":")) if extra else ""
796
1071
 
797
1072
  flat = FlatRow(
@@ -806,9 +1081,13 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
806
1081
  nt_sequence=rec.nt_seq,
807
1082
  aa_sequence=rec.aa_seq,
808
1083
  fitness_value=rec.ttn_or_yield(),
1084
+ fitness_type=fitness_type,
809
1085
  cofactor=cofactor,
810
1086
  reaction_condition=reaction_condition,
811
1087
  ee=str(row.get("ee", "")),
1088
+ campaign_id=campaign_id,
1089
+ generation=generation,
1090
+ parent_enzyme_id=parent_id,
812
1091
  additional_information=additional_information,
813
1092
  )
814
1093
  output_rows.append(flat.as_dict())