masster 0.5.11.tar.gz → 0.5.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster has been flagged as possibly problematic.

Files changed (96)
  1. {masster-0.5.11 → masster-0.5.13}/PKG-INFO +1 -1
  2. {masster-0.5.11 → masster-0.5.13}/pyproject.toml +1 -1
  3. {masster-0.5.11 → masster-0.5.13}/src/masster/_version.py +1 -1
  4. {masster-0.5.11 → masster-0.5.13}/src/masster/study/h5.py +41 -2
  5. {masster-0.5.11 → masster-0.5.13}/src/masster/study/id.py +4 -3
  6. masster-0.5.13/src/masster/study/importers.py +222 -0
  7. {masster-0.5.11 → masster-0.5.13}/src/masster/study/merge.py +6 -9
  8. {masster-0.5.11 → masster-0.5.13}/src/masster/study/plot.py +84 -12
  9. {masster-0.5.11 → masster-0.5.13}/src/masster/study/study.py +4 -0
  10. {masster-0.5.11 → masster-0.5.13}/src/masster/study/study5_schema.json +3 -0
  11. {masster-0.5.11 → masster-0.5.13}/uv.lock +1 -1
  12. {masster-0.5.11 → masster-0.5.13}/.github/workflows/publish.yml +0 -0
  13. {masster-0.5.11 → masster-0.5.13}/.github/workflows/security.yml +0 -0
  14. {masster-0.5.11 → masster-0.5.13}/.github/workflows/test.yml +0 -0
  15. {masster-0.5.11 → masster-0.5.13}/.gitignore +0 -0
  16. {masster-0.5.11 → masster-0.5.13}/.pre-commit-config.yaml +0 -0
  17. {masster-0.5.11 → masster-0.5.13}/LICENSE +0 -0
  18. {masster-0.5.11 → masster-0.5.13}/Makefile +0 -0
  19. {masster-0.5.11 → masster-0.5.13}/README.md +0 -0
  20. {masster-0.5.11 → masster-0.5.13}/TESTING.md +0 -0
  21. {masster-0.5.11 → masster-0.5.13}/demo/example_batch_process.py +0 -0
  22. {masster-0.5.11 → masster-0.5.13}/demo/example_sample_process.py +0 -0
  23. {masster-0.5.11 → masster-0.5.13}/src/masster/__init__.py +0 -0
  24. {masster-0.5.11 → masster-0.5.13}/src/masster/chromatogram.py +0 -0
  25. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  26. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  27. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  28. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  29. {masster-0.5.11 → masster-0.5.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  30. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/aa.csv +0 -0
  31. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/ccm.csv +0 -0
  32. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/hilic.csv +0 -0
  33. {masster-0.5.11 → masster-0.5.13}/src/masster/data/libs/urine.csv +0 -0
  34. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  35. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  36. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  37. {masster-0.5.11 → masster-0.5.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  38. {masster-0.5.11 → masster-0.5.13}/src/masster/lib/__init__.py +0 -0
  39. {masster-0.5.11 → masster-0.5.13}/src/masster/lib/lib.py +0 -0
  40. {masster-0.5.11 → masster-0.5.13}/src/masster/logger.py +0 -0
  41. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/__init__.py +0 -0
  42. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/adducts.py +0 -0
  43. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/__init__.py +0 -0
  44. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  45. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/find_features_def.py +0 -0
  46. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  47. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  48. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/defaults/sample_def.py +0 -0
  49. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/h5.py +0 -0
  50. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/helpers.py +0 -0
  51. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/lib.py +0 -0
  52. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/load.py +0 -0
  53. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/parameters.py +0 -0
  54. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/plot.py +0 -0
  55. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/processing.py +0 -0
  56. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/quant.py +0 -0
  57. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/sample.py +0 -0
  58. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/sample5_schema.json +0 -0
  59. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/save.py +0 -0
  60. {masster-0.5.11 → masster-0.5.13}/src/masster/sample/sciex.py +0 -0
  61. {masster-0.5.11 → masster-0.5.13}/src/masster/spectrum.py +0 -0
  62. {masster-0.5.11 → masster-0.5.13}/src/masster/study/__init__.py +0 -0
  63. {masster-0.5.11 → masster-0.5.13}/src/masster/study/analysis.py +0 -0
  64. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/__init__.py +0 -0
  65. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/align_def.py +0 -0
  66. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/export_def.py +0 -0
  67. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/fill_def.py +0 -0
  68. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/find_consensus_def.py +0 -0
  69. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/find_ms2_def.py +0 -0
  70. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/identify_def.py +0 -0
  71. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  72. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/integrate_def.py +0 -0
  73. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/merge_def.py +0 -0
  74. {masster-0.5.11 → masster-0.5.13}/src/masster/study/defaults/study_def.py +0 -0
  75. {masster-0.5.11 → masster-0.5.13}/src/masster/study/export.py +0 -0
  76. {masster-0.5.11 → masster-0.5.13}/src/masster/study/helpers.py +0 -0
  77. {masster-0.5.11 → masster-0.5.13}/src/masster/study/load.py +0 -0
  78. {masster-0.5.11 → masster-0.5.13}/src/masster/study/parameters.py +0 -0
  79. {masster-0.5.11 → masster-0.5.13}/src/masster/study/processing.py +0 -0
  80. {masster-0.5.11 → masster-0.5.13}/src/masster/study/save.py +0 -0
  81. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/README.md +0 -0
  82. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/__init__.py +0 -0
  83. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/example.py +0 -0
  84. {masster-0.5.11 → masster-0.5.13}/src/masster/wizard/wizard.py +0 -0
  85. {masster-0.5.11 → masster-0.5.13}/tests/conftest.py +0 -0
  86. {masster-0.5.11 → masster-0.5.13}/tests/test_chromatogram.py +0 -0
  87. {masster-0.5.11 → masster-0.5.13}/tests/test_defaults.py +0 -0
  88. {masster-0.5.11 → masster-0.5.13}/tests/test_imports.py +0 -0
  89. {masster-0.5.11 → masster-0.5.13}/tests/test_integration.py +0 -0
  90. {masster-0.5.11 → masster-0.5.13}/tests/test_logger.py +0 -0
  91. {masster-0.5.11 → masster-0.5.13}/tests/test_parameters.py +0 -0
  92. {masster-0.5.11 → masster-0.5.13}/tests/test_sample.py +0 -0
  93. {masster-0.5.11 → masster-0.5.13}/tests/test_spectrum.py +0 -0
  94. {masster-0.5.11 → masster-0.5.13}/tests/test_study.py +0 -0
  95. {masster-0.5.11 → masster-0.5.13}/tests/test_version.py +0 -0
  96. {masster-0.5.11 → masster-0.5.13}/tox.ini +0 -0
{masster-0.5.11 → masster-0.5.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.11
+Version: 0.5.13
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
{masster-0.5.11 → masster-0.5.13}/pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.5.11"
+version = "0.5.13"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
{masster-0.5.11 → masster-0.5.13}/src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.5.11"
+__version__ = "0.5.13"
 
 
 def get_version():
{masster-0.5.11 → masster-0.5.13}/src/masster/study/h5.py
@@ -874,8 +874,47 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
 
     # Create DataFrame with regular columns first
     if regular_data:
-        df = pl.DataFrame(regular_data)
-        # print(f"DEBUG: Created DataFrame with regular columns, shape: {df.shape}")
+        # Final safety check: convert any remaining numpy object arrays to Python lists
+        # and handle numpy scalars within lists
+        safe_regular_data = {}
+        import numpy as np
+
+        def convert_numpy_scalars(value):
+            """Convert numpy scalars to Python native types recursively."""
+            if isinstance(value, np.generic):
+                return value.item()  # Convert numpy scalar to Python scalar
+            elif isinstance(value, list):
+                return [convert_numpy_scalars(item) for item in value]
+            else:
+                return value
+
+        for k, v in regular_data.items():
+            if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+                # Convert numpy object array to Python list
+                safe_regular_data[k] = [convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, 'tolist') else list(v))]
+            elif isinstance(v, list):
+                # Handle lists that might contain numpy scalars
+                safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
+            else:
+                safe_regular_data[k] = convert_numpy_scalars(v)
+
+        # Create DataFrame with proper error handling
+        try:
+            df = pl.DataFrame(safe_regular_data)
+        except Exception as e:
+            # If direct creation fails, try creating column by column to identify and handle problematic columns
+            df = pl.DataFrame()
+            for k, v in safe_regular_data.items():
+                try:
+                    df = df.with_columns([pl.Series(k, v)])
+                except Exception:
+                    # Skip problematic columns or convert them to string as a fallback
+                    try:
+                        df = df.with_columns([pl.Series(k, [str(item) for item in v])])
+                    except Exception:
+                        # Last resort: skip the column entirely
+                        continue
+
     # Add Object columns one by one
     for col, values in object_data.items():
         # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
{masster-0.5.11 → masster-0.5.13}/src/masster/study/id.py
@@ -661,7 +661,8 @@ def _update_consensus_id_columns(study, logger=None):
         ("id_top_name", pl.String),
         ("id_top_class", pl.String),
         ("id_top_adduct", pl.String),
-        ("id_top_score", pl.Float64)
+        ("id_top_score", pl.Float64),
+        ("id_source", pl.String)
     ]:
         if col_name not in study.consensus_df.columns:
             study.consensus_df = study.consensus_df.with_columns(
@@ -1076,7 +1077,7 @@ def id_reset(study):
 
     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
        if col in study.consensus_df.columns:
            if col == "id_top_score":
                id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
@@ -1170,7 +1171,7 @@ def lib_reset(study):
 
     # Check which columns exist before trying to update them
     id_columns_to_reset = []
-    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]:
+    for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
        if col in study.consensus_df.columns:
            if col == "id_top_score":
                id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
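Illustrative sketch (not part of the diff): with this change, id_reset() and lib_reset() also blank the new id_source column alongside the other id_top_* columns, assuming the column is present in consensus_df.

study.id_reset()
assert study.consensus_df["id_source"].is_null().all()  # oracle/library annotations cleared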
masster-0.5.13/src/masster/study/importers.py
@@ -0,0 +1,222 @@
+"""
+import.py
+
+Module providing import functionality for Study class, specifically for importing
+oracle identification data into consensus features.
+"""
+
+from __future__ import annotations
+
+import os
+import pandas as pd
+import polars as pl
+
+
+def import_oracle(
+    self,
+    folder,
+    min_id_level=None,
+    max_id_level=None,
+):
+    """
+    Import oracle identification data and map it to consensus features.
+
+    This method reads oracle identification results from folder/diag/summary_by_feature.csv
+    and maps them to consensus features using the 'uit' (feature_uid) column. The oracle
+    data is used to populate identification columns in consensus_df.
+
+    Parameters:
+        folder (str): Path to oracle folder containing diag/summary_by_feature.csv
+        min_id_level (int, optional): Minimum identification level to include
+        max_id_level (int, optional): Maximum identification level to include
+
+    Returns:
+        None: Updates consensus_df in-place with oracle identification data
+
+    Raises:
+        FileNotFoundError: If the oracle summary file doesn't exist
+        ValueError: If consensus_df is empty or doesn't have required columns
+
+    Example:
+        >>> study.import_oracle(
+        ...     folder="path/to/oracle_results",
+        ...     min_id_level=2,
+        ...     max_id_level=4
+        ... )
+    """
+
+    self.logger.info(f"Starting oracle import from folder: {folder}")
+
+    # Validate inputs
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        raise ValueError("consensus_df is empty or not available. Run merge() first.")
+
+    if "consensus_uid" not in self.consensus_df.columns:
+        raise ValueError("consensus_df must contain 'consensus_uid' column")
+
+    # Check if oracle file exists
+    oracle_file_path = os.path.join(folder, "diag", "summary_by_feature.csv")
+    if not os.path.exists(oracle_file_path):
+        raise FileNotFoundError(f"Oracle summary file not found: {oracle_file_path}")
+
+    self.logger.debug(f"Loading oracle data from: {oracle_file_path}")
+
+    try:
+        # Read oracle data using pandas first for easier processing
+        oracle_data = pd.read_csv(oracle_file_path)
+        self.logger.info(f"Oracle data loaded successfully with {len(oracle_data)} rows")
+    except Exception as e:
+        self.logger.error(f"Could not read {oracle_file_path}: {e}")
+        raise
+
+    # Select relevant columns from oracle data
+    required_oracle_cols = ["title", "id_level", "id_label", "id_ion", "id_class", "score"]
+    missing_cols = [col for col in required_oracle_cols if col not in oracle_data.columns]
+    if missing_cols:
+        raise ValueError(f"Oracle data missing required columns: {missing_cols}")
+
+    oracle_subset = oracle_data[required_oracle_cols].copy()
+
+    # Extract consensus_uid from title column (format: "uid:XYZ, ...")
+    self.logger.debug("Extracting consensus UIDs from oracle titles using pattern 'uid:(\\d+)'")
+    oracle_subset["consensus_uid"] = oracle_subset["title"].str.extract(r"uid:(\d+)")
+
+    # Remove rows where consensus_uid extraction failed
+    oracle_subset = oracle_subset.dropna(subset=["consensus_uid"])
+    oracle_subset["consensus_uid"] = oracle_subset["consensus_uid"].astype(int)
+
+    self.logger.debug(f"Extracted consensus UIDs for {len(oracle_subset)} oracle entries")
+
+    # Apply id_level filters if specified
+    initial_count = len(oracle_subset)
+    if min_id_level is not None:
+        oracle_subset = oracle_subset[oracle_subset["id_level"] >= min_id_level]
+        self.logger.debug(f"After min_id_level filter ({min_id_level}): {len(oracle_subset)} entries")
+
+    if max_id_level is not None:
+        oracle_subset = oracle_subset[oracle_subset["id_level"] <= max_id_level]
+        self.logger.debug(f"After max_id_level filter ({max_id_level}): {len(oracle_subset)} entries")
+
+    if len(oracle_subset) == 0:
+        self.logger.warning("No oracle entries remain after filtering")
+        return
+
+    # Sort by id_level (descending) to prioritize higher confidence identifications
+    # and remove duplicates by consensus_uid, keeping the first (highest id_level)
+    oracle_subset = oracle_subset.sort_values(by=["id_level"], ascending=False)
+    oracle_subset = oracle_subset.drop_duplicates(subset=["consensus_uid"], keep="first")
+
+    self.logger.debug(f"After deduplication by consensus_uid: {len(oracle_subset)} unique identifications")
+
+    # Convert to polars for efficient joining
+    oracle_pl = pl.DataFrame(oracle_subset)
+
+    self.logger.debug(f"Oracle data ready for consensus mapping: {len(oracle_pl)} entries")
+
+    if oracle_pl.is_empty():
+        self.logger.warning("No oracle entries could be processed")
+        return
+
+    # Group by consensus_uid and select the best identification (highest id_level)
+    # In case of ties, take the first one
+    best_ids = (
+        oracle_pl
+        .group_by("consensus_uid")
+        .agg([
+            pl.col("id_level").max().alias("max_id_level")
+        ])
+        .join(oracle_pl, on="consensus_uid")
+        .filter(pl.col("id_level") == pl.col("max_id_level"))
+        .group_by("consensus_uid")
+        .first()  # In case of ties, take the first
+    )
+
+    self.logger.debug(f"Selected best identifications for {len(best_ids)} consensus features")
+
+    # Prepare the identification columns
+    id_columns = {
+        "id_top_name": best_ids.select("consensus_uid", "id_label"),
+        "id_top_adduct": best_ids.select("consensus_uid", "id_ion"),
+        "id_top_class": best_ids.select("consensus_uid", "id_class"),
+        "id_top_score": best_ids.select("consensus_uid", pl.col("score").round(3).alias("score")),
+        "id_source": best_ids.select(
+            "consensus_uid",
+            pl.when(pl.col("id_level") == 1)
+            .then(pl.lit("lipidoracle ms1"))
+            .otherwise(pl.lit("lipidoracle ms2"))
+            .alias("id_source")
+        )
+    }
+
+    # Initialize identification columns in consensus_df if they don't exist
+    for col_name in id_columns.keys():
+        if col_name not in self.consensus_df.columns:
+            if col_name == "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.lit(None, dtype=pl.Float64).alias(col_name)
+                )
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.lit(None, dtype=pl.String).alias(col_name)
+                )
+
+    # Update consensus_df with oracle identifications
+    for col_name, id_data in id_columns.items():
+        oracle_column = id_data.columns[1]  # second column (after consensus_uid)
+
+        # Create update dataframe
+        update_data = id_data.rename({oracle_column: col_name})
+
+        # Join and update
+        self.consensus_df = (
+            self.consensus_df
+            .join(update_data, on="consensus_uid", how="left", suffix="_oracle")
+            .with_columns(
+                pl.coalesce([f"{col_name}_oracle", col_name]).alias(col_name)
+            )
+            .drop(f"{col_name}_oracle")
+        )
+
+    # Replace NaN values with None in identification columns
+    id_col_names = ["id_top_name", "id_top_adduct", "id_top_class", "id_top_score", "id_source"]
+    for col_name in id_col_names:
+        if col_name in self.consensus_df.columns:
+            # For string columns, replace empty strings and "nan" with None
+            if col_name != "id_top_score":
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(
+                        pl.col(col_name).is_null() |
+                        (pl.col(col_name) == "") |
+                        (pl.col(col_name) == "nan") |
+                        (pl.col(col_name) == "NaN")
+                    )
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+            # For numeric columns, replace NaN with None
+            else:
+                self.consensus_df = self.consensus_df.with_columns(
+                    pl.when(pl.col(col_name).is_null() | pl.col(col_name).is_nan())
+                    .then(None)
+                    .otherwise(pl.col(col_name))
+                    .alias(col_name)
+                )
+
+    # Count how many consensus features were updated
+    updated_count = self.consensus_df.filter(pl.col("id_top_name").is_not_null()).height
+    total_consensus = len(self.consensus_df)
+
+    self.logger.info(
+        f"Oracle import complete: {updated_count}/{total_consensus} "
+        f"consensus features now have identifications ({updated_count/total_consensus*100:.1f}%)"
+    )
+
+    # Update history
+    self.update_history(["import_oracle"], {
+        "folder": folder,
+        "min_id_level": min_id_level,
+        "max_id_level": max_id_level,
+        "updated_features": updated_count,
+        "total_features": total_consensus
+    })
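Illustrative usage sketch for the new importer (not part of the diff; the import path, study constructor argument, folder name, and filter value are assumptions — import_oracle only requires that <folder>/diag/summary_by_feature.csv exists and that merge() has been run):

import polars as pl
from masster import Study  # assumed top-level import; adjust to your installation

study = Study(...)                        # an existing Study with consensus features
study.import_oracle(
    folder="results/oracle_run_01",       # hypothetical oracle output folder
    min_id_level=2,                       # optional: keep only identifications at level >= 2
)

# Features annotated by the oracle carry "lipidoracle ms1" or "lipidoracle ms2" in id_source.
print(
    study.consensus_df
    .filter(pl.col("id_source").is_not_null())
    .select("consensus_uid", "id_top_name", "id_top_score", "id_source")
    .head()
)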
{masster-0.5.11 → masster-0.5.13}/src/masster/study/merge.py
@@ -1792,6 +1792,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
         "id_top_class": None,
         "id_top_adduct": None,
         "id_top_score": None,
+        "id_source": None,
     }
 
 
@@ -2194,6 +2195,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
                 "id_top_class": None,
                 "id_top_adduct": None,
                 "id_top_score": None,
+                "id_source": None,
             },
         )
 
@@ -2255,15 +2257,13 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
             {
                 "consensus_uid": row["consensus_uid"],
                 "rt": row["rt"],
-                "mz": row["mz"],  # Add missing mz field
+                "mz": row["mz"],
                 "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                 "adduct_top": row.get("adduct_top"),
                 "inty_mean": row.get("inty_mean", 0),
             },
         )
 
-    # Use optimized adduct grouping
-    #study.logger.info(f"About to call adduct grouping for {len(consensus_data)} consensus features")
     adduct_group_list, adduct_of_list = __merge_adduct_grouping(
         study, consensus_data, rt_tol/3, mz_tol
     )
@@ -2718,8 +2718,6 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
         study.logger.debug("No consensus features for adduct identification by mass shift")
         return
 
-    study.logger.info(f"Identifying coeluting adducts by mass shifts in {len(study.consensus_df)} consensus features...")
-
     # Get adducts DataFrame if not provided
     if cached_adducts_df is None or cached_adducts_df.is_empty():
         try:
@@ -3025,8 +3023,7 @@
             pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
             pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
         ])
-
-        study.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
+        study.logger.success(f"Adduct information updated for {updated_count} consensus features.")
     else:
         study.logger.debug("No consensus features updated based on mass shift analysis")
 
@@ -3395,7 +3392,7 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
             adduct_of_list = [0] * len(consensus_data)
             return adduct_group_list, adduct_of_list
 
-        study.logger.info(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
+        study.logger.debug(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
 
     except Exception as e:
         study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
@@ -3405,7 +3402,7 @@
 
     # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
     adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
-    study.logger.info(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
+    study.logger.debug(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
 
     # Build hash map for O(1) mass shift lookup
     mass_shift_map = {}  # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
{masster-0.5.11 → masster-0.5.13}/src/masster/study/plot.py
@@ -630,6 +630,7 @@ def plot_consensus_2d(
     height=450,
     mz_range=None,
     rt_range=None,
+    legend="bottom_right",
 ):
     """
     Plot consensus features in a 2D scatter plot with retention time vs m/z.
@@ -652,6 +653,9 @@
         height (int): Plot height in pixels (default: 900)
         mz_range (tuple, optional): m/z range for filtering consensus features (min_mz, max_mz)
         rt_range (tuple, optional): Retention time range for filtering consensus features (min_rt, max_rt)
+        legend (str, optional): Legend position for categorical data. Options: 'top_right', 'top_left',
+            'bottom_right', 'bottom_left', 'right', 'left', 'top', 'bottom'.
+            If None, legend is hidden. Only applies to categorical coloring (default: "bottom_right")
     """
     if self.consensus_df is None:
         self.logger.error("No consensus map found.")
@@ -783,13 +787,20 @@
             # Sorting would break the correspondence between legend labels and point colors
             unique_values = [v for v in data_pd[colorby].unique() if v is not None]
 
-            if len(unique_values) <= 20:
-                palette = Category20[min(20, max(3, len(unique_values)))]
+            # Use the custom palette from cmap if available, otherwise fall back to defaults
+            if len(palette) >= len(unique_values):
+                # Use custom colormap palette - sample evenly across the palette
+                import numpy as np
+                indices = np.linspace(0, len(palette) - 1, len(unique_values)).astype(int)
+                categorical_palette = [palette[i] for i in indices]
+            elif len(unique_values) <= 20:
+                # Fall back to Category20 if custom palette is too small
+                categorical_palette = Category20[min(20, max(3, len(unique_values)))]
             else:
                 # For many categories, use a subset of the viridis palette
-                palette = viridis(min(256, len(unique_values)))
+                categorical_palette = viridis(min(256, len(unique_values)))
 
-            color_mapper = factor_cmap(colorby, palette, unique_values)
+            color_mapper = factor_cmap(colorby, categorical_palette, unique_values)
         else:
             # Handle numeric coloring with LinearColorMapper
             color_mapper = LinearColorMapper(
@@ -809,21 +820,65 @@
     if is_categorical:
         # For categorical data, create separate renderers for each category
         # This enables proper legend interactivity where each category can be toggled independently
-        unique_values = [v for v in data_pd[colorby].unique() if v is not None]
+        all_unique_values = list(data_pd[colorby].unique())
+        unique_values = [v for v in all_unique_values if v is not None]
+        has_none_values = None in all_unique_values
 
-        if len(unique_values) <= 20:
-            palette = Category20[min(20, max(3, len(unique_values)))]
+        # Use the custom palette from cmap if available, otherwise fall back to defaults
+        if len(palette) >= len(unique_values):
+            # Use custom colormap palette - sample evenly across the palette
+            import numpy as np
+            indices = np.linspace(0, len(palette) - 1, len(unique_values)).astype(int)
+            categorical_palette = [palette[i] for i in indices]
+        elif len(unique_values) <= 20:
+            # Fall back to Category20 if custom palette is too small
+            categorical_palette = Category20[min(20, max(3, len(unique_values)))]
         else:
-            palette = viridis(min(256, len(unique_values)))
+            categorical_palette = viridis(min(256, len(unique_values)))
 
-        # Create a separate renderer for each category
+        # Handle None values with black color FIRST so they appear in the background
+        if has_none_values:
+            # Filter data for None values
+            none_data = data.filter(pl.col(colorby).is_null())
+            none_data_pd = none_data.to_pandas()
+            none_source = bp.ColumnDataSource(none_data_pd)
+
+            if scaling.lower() in ["dyn", "dynamic"]:
+                # Calculate appropriate radius for dynamic scaling
+                rt_range = data["rt"].max() - data["rt"].min()
+                mz_range = data["mz"].max() - data["mz"].min()
+                dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+                renderer = p.circle(
+                    x="rt",
+                    y="mz",
+                    radius=dynamic_radius,
+                    fill_color="lightgray",
+                    line_color=None,
+                    alpha=alpha,
+                    source=none_source,
+                    legend_label="None",
+                )
+            else:
+                renderer = p.scatter(
+                    x="rt",
+                    y="mz",
+                    size="markersize",
+                    fill_color="lightgray",
+                    line_color=None,
+                    alpha=alpha,
+                    source=none_source,
+                    legend_label="None",
+                )
+
+        # Create a separate renderer for each non-None category (plotted on top of None values)
         for i, category in enumerate(unique_values):
            # Filter data for this category
            category_data = data.filter(pl.col(colorby) == category)
            category_data_pd = category_data.to_pandas()
            category_source = bp.ColumnDataSource(category_data_pd)
 
-            color = palette[i % len(palette)]
+            color = categorical_palette[i % len(categorical_palette)]
 
            if scaling.lower() in ["dyn", "dynamic"]:
                # Calculate appropriate radius for dynamic scaling
@@ -942,8 +997,25 @@
         p.add_layout(color_bar, "right")
     else:
         # For categorical data, configure the legend that was automatically created
-        p.legend.location = "top_right"
-        p.legend.click_policy = "hide"
+        if legend is not None:
+            # Map legend position parameter to Bokeh legend position
+            legend_position_map = {
+                "top_right": "top_right",
+                "top_left": "top_left",
+                "bottom_right": "bottom_right",
+                "bottom_left": "bottom_left",
+                "right": "right",
+                "left": "left",
+                "top": "top",
+                "bottom": "bottom"
+            }
+
+            bokeh_legend_pos = legend_position_map.get(legend, "bottom_right")
+            p.legend.location = bokeh_legend_pos
+            p.legend.click_policy = "hide"
+        else:
+            # Hide legend when legend=None
+            p.legend.visible = False
 
     if filename is not None:
         # Convert relative paths to absolute paths using study folder as base
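Illustrative sketch of the new legend parameter (not part of the diff; the colorby column name is an assumption, chosen to trigger the categorical-coloring path that the legend option applies to):

study.plot_consensus_2d(colorby="id_top_class", legend="top_left")   # place the legend top-left
study.plot_consensus_2d(colorby="id_top_class", legend=None)         # hide the legend entirely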
{masster-0.5.11 → masster-0.5.13}/src/masster/study/study.py
@@ -109,6 +109,7 @@ from masster.study.parameters import set_parameters_property
 from masster.study.save import save, save_consensus, save_samples
 from masster.study.export import export_mgf, export_mztab, export_xlsx, export_parquet
 from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset, _get_adducts
+from masster.study.importers import import_oracle
 
 from masster.logger import MassterLogger
 from masster.study.defaults.study_def import study_defaults
@@ -454,6 +455,9 @@ class Study:
     reset_id = id_reset
     lib_reset = lib_reset
     reset_lib = lib_reset
+
+    # === Oracle Import Operations ===
+    import_oracle = import_oracle
 
     # === Parameter Management ===
     update_history = update_history
{masster-0.5.11 → masster-0.5.13}/src/masster/study/study5_schema.json
@@ -114,6 +114,9 @@
             },
             "id_top_score": {
                 "dtype": "pl.Float64"
+            },
+            "id_source": {
+                "dtype": "pl.String"
             }
         }
     },
{masster-0.5.11 → masster-0.5.13}/uv.lock
@@ -1420,7 +1420,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.5.11"
+version = "0.5.13"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },