PyPI - debase - Versions diffs - 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl - Mend

debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

debase/_version.py +1 -1
debase/cleanup_sequence.py +512 -33
debase/enzyme_lineage_extractor.py +985 -100
debase/lineage_format.py +226 -13
debase/reaction_info_extractor.py +178 -34
debase/substrate_scope_extractor.py +52 -4
debase/wrapper.py +155 -151
debase-0.4.5.dist-info/METADATA +121 -0
debase-0.4.5.dist-info/RECORD +16 -0
debase-0.4.3.dist-info/METADATA +0 -296
debase-0.4.3.dist-info/RECORD +0 -16
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/WHEEL +0 -0
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/entry_points.txt +0 -0
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/licenses/LICENSE +0 -0
{debase-0.4.3.dist-info → debase-0.4.5.dist-info}/top_level.txt +0 -0

debase/lineage_format.py CHANGED Viewed

@@ -553,6 +553,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
     This function:
     1. First cleans up 3a data (lineage entries) to standardize enzyme_id column
     2. Then populates sequences in 3b data (substrate scope) based on campaign_id + enzyme_id matching
+    3. Uses Gemini API for intelligent matching when exact matches fail
     """
     # Step 1: Clean up 3a data format
     log.info("Cleaning up reaction data (3a) format...")
@@ -564,6 +565,7 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
     # Step 2: Create sequence lookup from cleaned 3a data
     seq_lookup = {}
+    campaign_enzymes = {}  # Track enzymes by campaign for Gemini matching
     # Collect sequences from reaction data entries (3a) - these have data_type='lineage'
     reaction_entries = df[df.get("data_type") == "lineage"]
@@ -584,7 +586,9 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 "aa_sequence": aa_seq,
                 "nt_sequence": nt_seq if nt_seq != "nan" else "",
                 "campaign_id": campaign_id,
-                "enzyme_id": eid
+                "enzyme_id": eid,
+                "generation": str(row.get("generation", "")),
+                "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
             }
             # Also keep simple enzyme_id lookup as fallback
@@ -592,16 +596,41 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 "aa_sequence": aa_seq,
                 "nt_sequence": nt_seq if nt_seq != "nan" else "",
                 "campaign_id": campaign_id,
-                "enzyme_id": eid
+                "enzyme_id": eid,
+                "generation": str(row.get("generation", "")),
+                "parent_enzyme_id": str(row.get("parent_enzyme_id", ""))
             }
+            # Track enzymes by campaign for Gemini matching
+            if campaign_id not in campaign_enzymes:
+                campaign_enzymes[campaign_id] = []
+            campaign_enzymes[campaign_id].append({
+                "enzyme_id": eid,
+                "has_sequence": True,
+                "generation": str(row.get("generation", "")),
+                "parent_id": str(row.get("parent_enzyme_id", ""))
+            })
     log.info(f"Created sequence lookup with {len(seq_lookup)} entries from reaction data")
+    # Setup Gemini if available
+    gemini_model = None
+    if GEMINI_OK and GEMINI_API_KEY:
+        try:
+            genai.configure(api_key=GEMINI_API_KEY)
+            gemini_model = genai.GenerativeModel('gemini-1.5-flash')
+            log.info("Gemini API configured for intelligent enzyme matching")
+        except Exception as e:
+            log.warning(f"Failed to configure Gemini API: {e}")
     # Step 3: Fill missing sequences in substrate scope entries (3b)
     substrate_entries = df[df.get("data_type") == "substrate_scope"]
     log.info(f"Found {len(substrate_entries)} substrate scope entries to populate sequences for")
     filled_count = 0
+    gemini_matched_count = 0
+    unmatched_enzymes = []  # Track enzymes that need Gemini matching
     for idx, row in df.iterrows():
         if row.get("data_type") != "substrate_scope":
             continue
@@ -620,6 +649,8 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 if seq_lookup[composite_key]["nt_sequence"]:
                     df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
                     df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
                 filled_count += 1
                 log.debug(f"Filled sequence for {eid} in campaign {campaign_id} (exact match)")
@@ -630,18 +661,182 @@ def _fill_missing_sequences(df: pd.DataFrame) -> pd.DataFrame:
                 if seq_lookup[eid]["nt_sequence"]:
                     df.at[idx, "nucleotide_sequence"] = seq_lookup[eid]["nt_sequence"]
                     df.at[idx, "nt_sequence"] = seq_lookup[eid]["nt_sequence"]
+                df.at[idx, "generation"] = seq_lookup[eid]["generation"]
+                df.at[idx, "parent_enzyme_id"] = seq_lookup[eid]["parent_enzyme_id"]
                 filled_count += 1
                 log.debug(f"Filled sequence for {eid} (fallback lookup)")
             else:
-                log.warning(f"No sequence found for enzyme_id={eid} in campaign {campaign_id}")
+                # Collect for Gemini matching
+                unmatched_enzymes.append({
+                    "idx": idx,
+                    "enzyme_id": eid,
+                    "campaign_id": campaign_id
+                })
+    # Step 4: Use Gemini for intelligent matching of unmatched enzymes
+    if unmatched_enzymes and gemini_model:
+        log.info(f"Using Gemini to intelligently match {len(unmatched_enzymes)} unmatched enzymes")
+        # Group unmatched enzymes by campaign
+        unmatched_by_campaign = {}
+        for entry in unmatched_enzymes:
+            cid = entry["campaign_id"]
+            if cid not in unmatched_by_campaign:
+                unmatched_by_campaign[cid] = []
+            unmatched_by_campaign[cid].append(entry)
+        # Process each campaign
+        for campaign_id, entries in unmatched_by_campaign.items():
+            if campaign_id not in campaign_enzymes or not campaign_enzymes[campaign_id]:
+                log.warning(f"No enzymes with sequences found in campaign {campaign_id}")
+                continue
+            # Get enzyme IDs that need matching
+            unmatched_ids = [e["enzyme_id"] for e in entries]
+            # Get available enzymes in this campaign
+            available_ids = [e["enzyme_id"] for e in campaign_enzymes[campaign_id] if e["has_sequence"]]
+            if not available_ids:
+                log.warning(f"No enzymes with sequences available in campaign {campaign_id}")
+                continue
+            # Create prompt for Gemini
+            prompt = f"""Match enzyme variant IDs from substrate scope data to their corresponding sequences in reaction data.
+These are from the same campaign ({campaign_id}) but may use slightly different naming conventions.
+Enzymes needing sequences (from substrate scope):
+{json.dumps(unmatched_ids, indent=2)}
+Enzymes with sequences available (from reaction data):
+{json.dumps(available_ids, indent=2)}
+Match each enzyme from the first list to its corresponding enzyme in the second list.
+Consider variations like:
+- Case differences (p411-hf vs P411-HF)
+- Underscore vs hyphen (p411_hf vs p411-hf)
+- Additional prefixes/suffixes
+- Similar naming patterns within the campaign
+Return ONLY a JSON object mapping substrate scope IDs to reaction data IDs:
+{{"substrate_scope_id": "reaction_data_id", ...}}
+Only include matches you are confident about. If no match exists, omit that enzyme.
+"""
+            try:
+                response = gemini_model.generate_content(prompt)
+                mapping_text = response.text.strip()
+                # Extract JSON from response
+                if '```json' in mapping_text:
+                    mapping_text = mapping_text.split('```json')[1].split('```')[0].strip()
+                elif '```' in mapping_text:
+                    mapping_text = mapping_text.split('```')[1].split('```')[0].strip()
+                mapping = json.loads(mapping_text)
+                # Apply the matches
+                for entry in entries:
+                    substrate_id = entry["enzyme_id"]
+                    if substrate_id in mapping:
+                        matched_id = mapping[substrate_id]
+                        composite_key = f"{campaign_id}_{matched_id}"
+                        if composite_key in seq_lookup:
+                            idx = entry["idx"]
+                            df.at[idx, "protein_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            df.at[idx, "aa_sequence"] = seq_lookup[composite_key]["aa_sequence"]
+                            if seq_lookup[composite_key]["nt_sequence"]:
+                                df.at[idx, "nucleotide_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                                df.at[idx, "nt_sequence"] = seq_lookup[composite_key]["nt_sequence"]
+                            # Also copy generation and parent_enzyme_id
+                            df.at[idx, "generation"] = seq_lookup[composite_key]["generation"]
+                            df.at[idx, "parent_enzyme_id"] = seq_lookup[composite_key]["parent_enzyme_id"]
+                            # Store the match for later mutation copying
+                            df.at[idx, "_matched_enzyme_id"] = matched_id
+                            df.at[idx, "_matched_campaign_id"] = campaign_id
+                            gemini_matched_count += 1
+                            log.info(f"Gemini matched '{substrate_id}' -> '{matched_id}' in campaign {campaign_id}")
+                        else:
+                            log.warning(f"Gemini suggested match '{matched_id}' not found in sequence lookup")
+            except Exception as e:
+                log.warning(f"Failed to get Gemini matches for campaign {campaign_id}: {e}")
+    # Final logging
+    total_filled = filled_count + gemini_matched_count
+    if total_filled > 0:
+        log.info(f"Successfully filled sequences for {total_filled} substrate scope entries "
+                f"({filled_count} exact matches, {gemini_matched_count} Gemini matches)")
-    if filled_count > 0:
-        log.info(f"Successfully filled sequences for {filled_count} substrate scope entries")
+    # Log any remaining unmatched
+    for entry in unmatched_enzymes:
+        if not any(df.at[entry["idx"], col] for col in ["protein_sequence", "aa_sequence"]
+                  if col in df.columns and df.at[entry["idx"], col]):
+            log.warning(f"No sequence found for enzyme_id={entry['enzyme_id']} in campaign {entry['campaign_id']}")
     return df
+def _copy_mutations_from_matched_enzymes(out_df: pd.DataFrame, orig_df: pd.DataFrame) -> pd.DataFrame:
+    """Copy nucleotide_mutation and amino_acid_substitutions from matched enzymes.
+    This function looks for entries that were matched by Gemini and copies their
+    mutation information from the corresponding matched enzyme.
+    """
+    # Look for entries with _matched_enzyme_id (these were matched by Gemini)
+    if "_matched_enzyme_id" not in orig_df.columns:
+        return out_df
+    matched_entries = orig_df[orig_df["_matched_enzyme_id"].notna()]
+    if len(matched_entries) == 0:
+        return out_df
+    log.info(f"Copying mutations for {len(matched_entries)} Gemini-matched entries")
+    # Create a lookup of mutations from the output dataframe
+    mutation_lookup = {}
+    for idx, row in out_df.iterrows():
+        key = f"{row['campaign_id']}_{row['id']}"  # 'id' is the enzyme_id in output
+        mutation_lookup[key] = {
+            "nucleotide_mutation": row.get("nucleotide_mutation", ""),
+            "amino_acid_substitutions": row.get("amino_acid_substitutions", "")
+        }
+    # Copy mutations for matched entries
+    mutations_copied = 0
+    for idx, row in out_df.iterrows():
+        # Check if this row needs mutation copying
+        # Find the original row in orig_df with the same enzyme_id and campaign_id
+        orig_mask = (orig_df["enzyme_id"] == row["id"]) & (orig_df["campaign_id"] == row["campaign_id"])
+        orig_rows = orig_df[orig_mask]
+        if len(orig_rows) > 0 and "_matched_enzyme_id" in orig_rows.columns:
+            orig_row = orig_rows.iloc[0]
+            if pd.notna(orig_row.get("_matched_enzyme_id")):
+                # This was a Gemini-matched entry
+                matched_id = orig_row["_matched_enzyme_id"]
+                matched_campaign = orig_row["_matched_campaign_id"]
+                lookup_key = f"{matched_campaign}_{matched_id}"
+                if lookup_key in mutation_lookup:
+                    out_df.at[idx, "nucleotide_mutation"] = mutation_lookup[lookup_key]["nucleotide_mutation"]
+                    out_df.at[idx, "amino_acid_substitutions"] = mutation_lookup[lookup_key]["amino_acid_substitutions"]
+                    mutations_copied += 1
+                    log.debug(f"Copied mutations for {row['id']} from {matched_id}")
+    if mutations_copied > 0:
+        log.info(f"Successfully copied mutations for {mutations_copied} entries")
+    return out_df
 def _identify_parents_with_gemini(df: pd.DataFrame) -> pd.DataFrame:
     """Use Gemini API to identify parent enzymes for entries with missing parent information."""
     if not GEMINI_OK:
@@ -885,11 +1080,7 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     # Fill missing sequences in substrate scope entries from lineage data
     df = _fill_missing_sequences(df)
-    # Use Gemini API to identify parent enzymes for entries with missing sequences
-    df = _identify_parents_with_gemini(df)
-    # Fill sequences again after parent identification to propagate sequences from identified parents
-    df = _fill_missing_sequences(df)
+    # Note: Removed parent identification - we only want exact variant matching
     # 1. Generate lineage roots once -----------------------------------------
     lineage_roots = _generate_lineage_roots(df)
@@ -992,15 +1183,33 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
         if generation != "0":
             for cid, cmap in campaign_idmap.items():
                 if cid == campaign_id:
+                    # First try to find generation 0
                     for enzyme_id, enzyme_row in cmap.items():
                         enzyme_gen = str(enzyme_row.get("generation", "")).strip()
                         if enzyme_gen == "0" or enzyme_gen == "0.0":
                             reference_row = enzyme_row
                             log.debug(f"Found generation 0 enzyme {enzyme_id} as reference for {eid}")
                             break
+                    # If no generation 0 found, find the earliest generation
+                    if not reference_row:
+                        earliest_gen = float('inf')
+                        earliest_enzyme = None
+                        for enzyme_id, enzyme_row in cmap.items():
+                            try:
+                                enzyme_gen = float(str(enzyme_row.get("generation", "")).strip())
+                                if enzyme_gen < earliest_gen and enzyme_gen < float(generation):
+                                    earliest_gen = enzyme_gen
+                                    earliest_enzyme = enzyme_id
+                                    reference_row = enzyme_row
+                            except (ValueError, AttributeError):
+                                continue
+                        if reference_row:
+                            log.info(f"No generation 0 found in campaign {campaign_id}, using generation {earliest_gen} enzyme {earliest_enzyme} as reference for {eid}")
+                        else:
+                            log.warning(f"No suitable reference enzyme found in campaign {campaign_id} for {eid}")
                     break
-            if not reference_row:
-                log.warning(f"No generation 0 enzyme found in campaign {campaign_id} for {eid}")
         reference_aa = ""
         reference_nt = ""
@@ -1095,6 +1304,10 @@ def flatten_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     log.info(f"Flattening complete: {processed_count} rows processed, {skipped_count} rows skipped")
     out_df = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
+    # Post-process: Copy mutations from matched enzymes for Gemini-matched substrate scope entries
+    out_df = _copy_mutations_from_matched_enzymes(out_df, df)
     return out_df
@@ -1137,7 +1350,7 @@ def run_pipeline(reaction_csv: str | Path | None = None,
     if not dfs:
         raise ValueError("At least one input CSV must be provided")
-    # Combine dataframes
+    # Combine dataframes without deduplication
     if len(dfs) > 1:
         df_in = pd.concat(dfs, ignore_index=True)
         log.info("Combined data: %d total entries", len(df_in))

debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

debase 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl