PyPI - masster - Versions diffs - 0.5.13__py3-none-any.whl → 0.5.14__py3-none-any.whl - Mend

masster 0.5.13__py3-none-any.whl → 0.5.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (15) hide show

masster/_version.py +1 -1
masster/lib/lib.py +371 -57
masster/study/helpers.py +1 -0
masster/study/id.py +233 -36
masster/study/importers.py +161 -52
masster/study/merge.py +1 -1
masster/study/plot.py +10 -18
masster/study/study5_schema.json +9 -0
masster/wizard/__init__.py +4 -4
masster/wizard/wizard.py +437 -19
{masster-0.5.13.dist-info → masster-0.5.14.dist-info}/METADATA +1 -1
{masster-0.5.13.dist-info → masster-0.5.14.dist-info}/RECORD +15 -15
{masster-0.5.13.dist-info → masster-0.5.14.dist-info}/WHEEL +0 -0
{masster-0.5.13.dist-info → masster-0.5.14.dist-info}/entry_points.txt +0 -0
{masster-0.5.13.dist-info → masster-0.5.14.dist-info}/licenses/LICENSE +0 -0

masster/study/importers.py CHANGED Viewed

@@ -21,20 +21,20 @@ def import_oracle(
     """
     Import oracle identification data and map it to consensus features.
-    This method reads oracle identification results from folder/diag/summary_by_feature.csv
-    and maps them to consensus features using the 'uit' (feature_uid) column. The oracle
-    data is used to populate identification columns in consensus_df.
+    This method reads oracle identification results from folder/diag/annotation_full.csv
+    and creates lib_df and id_df DataFrames with detailed library and identification information.
+    It also updates consensus_df with top identification results.
     Parameters:
-        folder (str): Path to oracle folder containing diag/summary_by_feature.csv
+        folder (str): Path to oracle folder containing diag/annotation_full.csv
         min_id_level (int, optional): Minimum identification level to include
         max_id_level (int, optional): Maximum identification level to include
     Returns:
-        None: Updates consensus_df in-place with oracle identification data
+        None: Updates consensus_df, creates lib_df and id_df in-place with oracle identification data
     Raises:
-        FileNotFoundError: If the oracle summary file doesn't exist
+        FileNotFoundError: If the oracle annotation file doesn't exist
         ValueError: If consensus_df is empty or doesn't have required columns
     Example:
@@ -55,9 +55,9 @@ def import_oracle(
         raise ValueError("consensus_df must contain 'consensus_uid' column")
     # Check if oracle file exists
-    oracle_file_path = os.path.join(folder, "diag", "summary_by_feature.csv")
+    oracle_file_path = os.path.join(folder, "diag", "annotation_full.csv")
     if not os.path.exists(oracle_file_path):
-        raise FileNotFoundError(f"Oracle summary file not found: {oracle_file_path}")
+        raise FileNotFoundError(f"Oracle annotation file not found: {oracle_file_path}")
     self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
@@ -69,64 +69,171 @@ def import_oracle(
         self.logger.error(f"Could not read {oracle_file_path}: {e}")
         raise
-    # Select relevant columns from oracle data
-    required_oracle_cols = ["title", "id_level", "id_label", "id_ion", "id_class", "score"]
-    missing_cols = [col for col in required_oracle_cols if col not in oracle_data.columns]
-    if missing_cols:
-        raise ValueError(f"Oracle data missing required columns: {missing_cols}")
-    oracle_subset = oracle_data[required_oracle_cols].copy()
-    # Extract consensus_uid from title column (format: "uid:XYZ, ...")
-    self.logger.debug("Extracting consensus UIDs from oracle titles using pattern 'uid:(\\d+)'")
-    oracle_subset["consensus_uid"] = oracle_subset["title"].str.extract(r"uid:(\d+)")
+    # Extract consensus_uid from scan_title column (format: "uid:XYZ, ...")
+    self.logger.debug("Extracting consensus UIDs from oracle scan_title using pattern 'uid:(\\d+)'")
+    oracle_data["consensus_uid"] = oracle_data["scan_title"].str.extract(r"uid:(\d+)", expand=False)
     # Remove rows where consensus_uid extraction failed
-    oracle_subset = oracle_subset.dropna(subset=["consensus_uid"])
-    oracle_subset["consensus_uid"] = oracle_subset["consensus_uid"].astype(int)
+    initial_count = len(oracle_data)
+    oracle_data = oracle_data.dropna(subset=["consensus_uid"])
+    oracle_data["consensus_uid"] = oracle_data["consensus_uid"].astype(int)
-    self.logger.debug(f"Extracted consensus UIDs for {len(oracle_subset)} oracle entries")
+    self.logger.debug(f"Extracted consensus UIDs for {len(oracle_data)}/{initial_count} oracle entries")
     # Apply id_level filters if specified
-    initial_count = len(oracle_subset)
     if min_id_level is not None:
-        oracle_subset = oracle_subset[oracle_subset["id_level"] >= min_id_level]
-        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_subset)} entries")
+        oracle_data = oracle_data[oracle_data["level"] >= min_id_level]
+        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_data)} entries")
     if max_id_level is not None:
-        oracle_subset = oracle_subset[oracle_subset["id_level"] <= max_id_level]
-        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_subset)} entries")
+        oracle_data = oracle_data[oracle_data["level"] <= max_id_level]
+        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_data)} entries")
-    if len(oracle_subset) == 0:
+    if len(oracle_data) == 0:
         self.logger.warning("No oracle entries remain after filtering")
         return
-    # Sort by id_level (descending) to prioritize higher confidence identifications
-    # and remove duplicates by consensus_uid, keeping the first (highest id_level)
-    oracle_subset = oracle_subset.sort_values(by=["id_level"], ascending=False)
-    oracle_subset = oracle_subset.drop_duplicates(subset=["consensus_uid"], keep="first")
+    # === CREATE LIB_DF ===
+    self.logger.debug("Creating lib_df from Oracle annotation data")
+    self.logger.debug(f"Oracle data shape before lib_df creation: {oracle_data.shape}")
-    self.logger.debug(f"After deduplication by consensus_uid: {len(oracle_subset)} unique identifications")
+    # Create unique lib_uid for each library entry
+    oracle_data["lib_uid"] = range(len(oracle_data))
-    # Convert to polars for efficient joining
-    oracle_pl = pl.DataFrame(oracle_subset)
+    # Map Oracle columns to lib_df schema
+    lib_data = []
+    for _, row in oracle_data.iterrows():
+        # Convert cmpd_uid to integer, using lib_uid as fallback
+        cmpd_uid = row["lib_uid"]  # Use lib_uid as integer compound identifier
+        try:
+            if row.get("lib_id") is not None:
+                cmpd_uid = int(float(str(row["lib_id"])))  # Convert to int, handling potential float strings
+        except (ValueError, TypeError):
+            pass  # Keep lib_uid as fallback
+        lib_entry = {
+            "lib_uid": row["lib_uid"],
+            "cmpd_uid": cmpd_uid,  # Integer compound identifier
+            "source_id": "LipidOracle",  # Fixed source identifier
+            "name": row.get("name", None),
+            "shortname": row.get("species", None),
+            "class": row.get("hg", None),
+            "smiles": None,  # Not available in Oracle data
+            "inchi": None,   # Not available in Oracle data
+            "inchikey": None, # Not available in Oracle data
+            "formula": row.get("formula", None),
+            "iso": 0,        # Fixed isotope value
+            "adduct": row.get("ion", None),
+            "probability": row.get("score", None),
+            "m": None,       # Would need to calculate from formula
+            "z": 1 if row.get("ion", "").find("+") != -1 else (-1 if row.get("ion", "").find("-") != -1 else None),
+            "mz": row.get("mz", None),  # Use mz column from annotation_full.csv
+            "rt": None,      # Set to null as requested
+            "quant_group": None,  # Set to null as requested
+            "db_id": row.get("lib_id", None),
+            "db": row.get("lib", None)
+        }
+        lib_data.append(lib_entry)
-    self.logger.debug(f"Oracle data ready for consensus mapping: {len(oracle_pl)} entries")
+    self.logger.debug(f"Created {len(lib_data)} lib_data entries")
-    if oracle_pl.is_empty():
-        self.logger.warning("No oracle entries could be processed")
-        return
+    # Create lib_df as Polars DataFrame with error handling for mixed types
+    try:
+        lib_df_temp = pl.DataFrame(lib_data)
+    except Exception as e:
+        self.logger.warning(f"Error creating lib_df with polars: {e}")
+        # Fallback: convert to pandas first, then to polars
+        lib_df_pandas = pd.DataFrame(lib_data)
+        lib_df_temp = pl.from_pandas(lib_df_pandas)
+    # Ensure uniqueness by name and adduct combination
+    # Sort by lib_uid and keep first occurrence (earliest in processing order)
+    self.lib_df = (
+        lib_df_temp
+        .sort("lib_uid")
+        .unique(subset=["name", "adduct"], keep="first")
+    )
+    self.logger.info(f"Created lib_df with {len(self.lib_df)} library entries ({len(lib_data) - len(self.lib_df)} duplicates removed)")
+    # === CREATE ID_DF ===
+    self.logger.debug("Creating id_df from Oracle identification matches")
+    # Create identification matches
+    id_data = []
+    for _, row in oracle_data.iterrows():
+        # Use dmz from annotation_full.csv directly for mz_delta
+        mz_delta = None
+        if row.get("dmz") is not None:
+            try:
+                mz_delta = float(row["dmz"])
+            except (ValueError, TypeError):
+                pass
+        # Use rt_err from annotation_full.csv for rt_delta, None if NaN
+        rt_delta = None
+        rt_err_value = row.get("rt_err")
+        if rt_err_value is not None and not (isinstance(rt_err_value, float) and pd.isna(rt_err_value)):
+            try:
+                rt_delta = float(rt_err_value)
+            except (ValueError, TypeError):
+                pass
+        # Create matcher as "lipidoracle-" + score_metric from annotation_full.csv
+        matcher = "lipidoracle"  # default fallback
+        if row.get("score_metric") is not None:
+            try:
+                score_metric = str(row["score_metric"])
+                matcher = f"lipidoracle-{score_metric}"
+            except (ValueError, TypeError):
+                pass
+        id_entry = {
+            "consensus_uid": row["consensus_uid"],
+            "lib_uid": row["lib_uid"],
+            "mz_delta": mz_delta,
+            "rt_delta": rt_delta,
+            "matcher": matcher,
+            "score": row.get("score", None)
+        }
+        id_data.append(id_entry)
+    # Create id_df as Polars DataFrame with error handling
+    try:
+        id_df_temp = pl.DataFrame(id_data)
+    except Exception as e:
+        self.logger.warning(f"Error creating id_df with polars: {e}")
+        # Fallback: convert to pandas first, then to polars
+        id_df_pandas = pd.DataFrame(id_data)
+        id_df_temp = pl.from_pandas(id_df_pandas)
+    # Filter id_df to only include lib_uids that exist in the final unique lib_df
+    unique_lib_uids = self.lib_df.select("lib_uid").to_series()
+    self.id_df = id_df_temp.filter(pl.col("lib_uid").is_in(unique_lib_uids))
+    self.logger.info(f"Created id_df with {len(self.id_df)} identification matches")
+    # === UPDATE CONSENSUS_DF (existing functionality) ===
+    self.logger.debug("Updating consensus_df with top identification results")
+    # Convert to polars for efficient joining with error handling
+    try:
+        oracle_pl = pl.DataFrame(oracle_data)
+    except Exception as e:
+        self.logger.warning(f"Error converting oracle_data to polars: {e}")
+        # Convert using from_pandas properly
+        oracle_pl = pl.from_pandas(oracle_data.reset_index(drop=True))
-    # Group by consensus_uid and select the best identification (highest id_level)
+    # Group by consensus_uid and select the best identification (highest level)
     # In case of ties, take the first one
     best_ids = (
         oracle_pl
         .group_by("consensus_uid")
         .agg([
-            pl.col("id_level").max().alias("max_id_level")
+            pl.col("level").max().alias("max_level")
         ])
         .join(oracle_pl, on="consensus_uid")
-        .filter(pl.col("id_level") == pl.col("max_id_level"))
+        .filter(pl.col("level") == pl.col("max_level"))
         .group_by("consensus_uid")
         .first()  # In case of ties, take the first
     )
@@ -135,13 +242,13 @@ def import_oracle(
     # Prepare the identification columns
     id_columns = {
-        "id_top_name": best_ids.select("consensus_uid", "id_label"),
-        "id_top_adduct": best_ids.select("consensus_uid", "id_ion"),
-        "id_top_class": best_ids.select("consensus_uid", "id_class"),
+        "id_top_name": best_ids.select("consensus_uid", "name"),
+        "id_top_adduct": best_ids.select("consensus_uid", "ion"),
+        "id_top_class": best_ids.select("consensus_uid", "hg"),
         "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
         "id_source": best_ids.select(
             "consensus_uid",
-            pl.when(pl.col("id_level") == 1)
+            pl.when(pl.col("level") == 1)
             .then(pl.lit("lipidoracle ms1"))
             .otherwise(pl.lit("lipidoracle ms2"))
             .alias("id_source")
@@ -161,11 +268,11 @@ def import_oracle(
                 )
     # Update consensus_df with oracle identifications
-    for col_name, id_data in id_columns.items():
-        oracle_column = id_data.columns[1]  # second column (after consensus_uid)
+    for col_name, id_data_col in id_columns.items():
+        oracle_column = id_data_col.columns[1]  # second column (after consensus_uid)
         # Create update dataframe
-        update_data = id_data.rename({oracle_column: col_name})
+        update_data = id_data_col.rename({oracle_column: col_name})
         # Join and update
         self.consensus_df = (
@@ -207,8 +314,8 @@ def import_oracle(
     updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
     total_consensus = len(self.consensus_df)
-    self.logger.info(
-        f"Oracle import complete: {updated_count}/{total_consensus} "
+    self.logger.success(
+        f"LipidOracle import completed. {updated_count}/{total_consensus} "
         f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
     )
@@ -218,5 +325,7 @@ def import_oracle(
         "min_id_level": min_id_level,
         "max_id_level": max_id_level,
         "updated_features": updated_count,
-        "total_features": total_consensus
+        "total_features": total_consensus,
+        "lib_entries": len(self.lib_df),
+        "id_matches": len(self.id_df)
     })

masster/study/merge.py CHANGED Viewed

@@ -3023,7 +3023,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
             pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
             pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
         ])
-        study.logger.success(f"Adduct information updated for {updated_count} consensus features.")
+        study.logger.info(f"Adduct information updated for {updated_count} consensus features.")
     else:
         study.logger.debug("No consensus features updated based on mass shift analysis")

masster/study/plot.py CHANGED Viewed

@@ -631,6 +631,7 @@ def plot_consensus_2d(
     mz_range=None,
     rt_range=None,
     legend="bottom_right",
+    show_none=True,
 ):
     """
     Plot consensus features in a 2D scatter plot with retention time vs m/z.
@@ -656,6 +657,7 @@ def plot_consensus_2d(
         legend (str, optional): Legend position for categorical data. Options: 'top_right', 'top_left',
                                'bottom_right', 'bottom_left', 'right', 'left', 'top', 'bottom'.
                                If None, legend is hidden. Only applies to categorical coloring (default: "bottom_right")
+        show_none (bool): Whether to display points with None values for colorby column (default: True)
     """
     if self.consensus_df is None:
         self.logger.error("No consensus map found.")
@@ -734,6 +736,10 @@ def plot_consensus_2d(
         from bokeh.models.annotations import ColorBar
     from bokeh.palettes import viridis, Category20
+    # Filter out None values for colorby column if show_none=False
+    if not show_none and colorby in data.columns:
+        data = data.filter(pl.col(colorby).is_not_null())
     # Convert Polars DataFrame to pandas for Bokeh compatibility
     data_pd = data.to_pandas()
     source = ColumnDataSource(data_pd)
@@ -837,7 +843,7 @@ def plot_consensus_2d(
             categorical_palette = viridis(min(256, len(unique_values)))
         # Handle None values with black color FIRST so they appear in the background
-        if has_none_values:
+        if has_none_values and show_none:
             # Filter data for None values
             none_data = data.filter(pl.col(colorby).is_null())
             none_data_pd = none_data.to_pandas()
@@ -947,33 +953,19 @@ def plot_consensus_2d(
         ("number_samples", "@number_samples"),
         ("number_ms2", "@number_ms2"),
         ("inty_mean", "@inty_mean"),
-        ("coherence_mean", "@chrom_coherence_mean"),
-        ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
     ]
-    # Add adduct_top if it exists in data
-    if "adduct_top" in data.columns:
-        tooltips.append(("adduct_top", "@adduct_top"))
-    # Add id_top_name if it exists in data
-    if "id_top_name" in data.columns:
-        tooltips.append(("id_top_name", "@id_top_name"))
-    # Add id_top_adduct if it exists in data
-    if "id_top_adduct" in data.columns:
-        tooltips.append(("id_top_adduct", "@id_top_adduct"))
     # Add id_top_* columns if they exist and have non-null values
-    id_top_columns = ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]
+    id_top_columns = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score"]
     for col in id_top_columns:
         if col in data.columns:
             # Check if the column has any non-null values
             if data.filter(pl.col(col).is_not_null()).height > 0:
                 # Format score column with decimal places, others as strings
                 if col == "id_top_score":
-                    tooltips.append((col.replace("id_top_", "id_"), f"@{col}{{0.0000}}"))
+                    tooltips.append((col, f"@{col}{{0.0}}"))
                 else:
-                    tooltips.append((col.replace("id_top_", "id_"), f"@{col}"))
+                    tooltips.append((col, f"@{col}"))
     hover = HoverTool(
         tooltips=tooltips,

masster/study/study5_schema.json CHANGED Viewed

@@ -321,6 +321,12 @@
       "name": {
         "dtype": "pl.String"
       },
+      "shortname": {
+        "dtype": "pl.String"
+      },
+      "class": {
+        "dtype": "pl.String"
+      },
       "smiles": {
         "dtype": "pl.String"
       },
@@ -339,6 +345,9 @@
       "adduct": {
         "dtype": "pl.String"
       },
+      "probability": {
+        "dtype": "pl.Float64"
+      },
       "m": {
         "dtype": "pl.Float64"
       },

masster/wizard/__init__.py CHANGED Viewed

@@ -5,13 +5,13 @@ This module provides the Wizard class for fully automated processing of MS data
 from raw files to final study results, including batch conversion, assembly,
 alignment, merging, plotting, and export.
-The create_script() function allows immediate generation of standalone analysis
+The create_analysis() function allows immediate generation of standalone analysis
 scripts without creating a Wizard instance first.
-The execute() function combines create_script() with immediate execution of the
+The analyze() function combines create_analysis() with immediate execution of the
 generated script for fully automated processing.
 """
-from .wizard import Wizard, wizard_def, create_script, execute
+from .wizard import Wizard, wizard_def, create_analysis, create_notebook, analyze
-__all__ = ["Wizard", "wizard_def", "create_script", "execute"]
+__all__ = ["Wizard", "wizard_def", "create_analysis", "create_notebook", "analyze"]

masster 0.5.13__py3-none-any.whl → 0.5.14__py3-none-any.whl

Potentially problematic release.

masster 0.5.13__py3-none-any.whl → 0.5.14__py3-none-any.whl