masster 0.4.20__py3-none-any.whl → 0.4.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +137 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +238 -152
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +39 -2
- masster/study/processing.py +257 -1
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +430 -1866
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/RECORD +26 -28
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.21.dist-info}/licenses/LICENSE +0 -0
masster/__init__.py
CHANGED
@@ -8,6 +8,12 @@ mass spectrometry workflows.
 
 from __future__ import annotations
 
+import warnings
+
+# Suppress pyOpenMS environment variable warnings globally
+warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
+warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
+
 from masster._version import __version__
 
 # from masster._version import get_version
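The same filter pattern reappears in masster/sample/load.py below. A minimal standalone sketch (standard library only, not masster code): the message argument is a regular expression matched against the start of the warning text, and the catch_warnings() form used around the pyopenms import keeps extra filters scoped to that block.

```python
import warnings

# Module-level filters, as in masster/__init__.py: matched against the start of
# the warning message, so the OPENMS_DATA_PATH notices are silenced process-wide.
warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)

# Scoped variant, as used around the pyopenms import in load.py: filters added
# inside the context manager are discarded when the block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
    warnings.warn("OPENMS_DATA_PATH environment variable already exists", UserWarning)  # suppressed

warnings.warn("some other warning", UserWarning)  # still shown, does not match the pattern
```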
masster/_version.py
CHANGED
masster/sample/h5.py
CHANGED
@@ -235,6 +235,22 @@ def _save_sample5(
                         data=serialized_data,
                         compression="gzip",
                     )
+                elif col == "ms1_spec":
+                    # this column contains either None or numpy arrays with isotope pattern data
+                    # serialize numpy arrays to JSON strings for storage
+                    data = features[col]
+                    data_as_json_strings = []
+                    for i in range(len(data)):
+                        if data[i] is not None:
+                            # Convert numpy array to list and then to JSON
+                            data_as_json_strings.append(json.dumps(data[i].tolist()))
+                        else:
+                            data_as_json_strings.append("None")
+                    features_group.create_dataset(
+                        col,
+                        data=data_as_json_strings,
+                        compression="gzip",
+                    )
 
                 else:
                     self.logger.warning(
@@ -630,6 +646,25 @@ def _load_sample5(self, filename: str, map: bool = False):
                                 )
                                 reconstructed_data.append(spectrum_list)
 
+                        data[col] = reconstructed_data
+                    case "ms1_spec":
+                        data_col = features_group[col][:]
+                        # Convert JSON strings back to numpy arrays
+                        reconstructed_data = []
+                        for item in data_col:
+                            if isinstance(item, bytes):
+                                item = item.decode("utf-8")
+
+                            if item == "None" or item == "":
+                                reconstructed_data.append(None)
+                            else:
+                                try:
+                                    # Parse JSON string to get list and convert to numpy array
+                                    array_data = json.loads(item)
+                                    reconstructed_data.append(np.array(array_data, dtype=np.float64))
+                                except (json.JSONDecodeError, ValueError, TypeError):
+                                    reconstructed_data.append(None)
+
                         data[col] = reconstructed_data
                     case _:
                         self.logger.debug(f"Unexpected Object column '{col}'")
@@ -1371,6 +1406,25 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                             ):
                                 reconstructed_data.append(None)
 
+                        data[col] = reconstructed_data
+                    case "ms1_spec":
+                        data_col = features_group[col][:]
+                        # Convert JSON strings back to numpy arrays
+                        reconstructed_data = []
+                        for item in data_col:
+                            if isinstance(item, bytes):
+                                item = item.decode("utf-8")
+
+                            if item == "None" or item == "":
+                                reconstructed_data.append(None)
+                            else:
+                                try:
+                                    # Parse JSON string to get list and convert to numpy array
+                                    array_data = json.loads(item)
+                                    reconstructed_data.append(np.array(array_data, dtype=np.float64))
+                                except (json.JSONDecodeError, ValueError, TypeError):
+                                    reconstructed_data.append(None)
+
                         data[col] = reconstructed_data
                     case _:
                         # Handle other Object columns as raw data
@@ -1407,6 +1461,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
         # Add Object columns one by one
         for col, values in object_columns.items():
             if not self.features_df.is_empty():
+                # Fix for missing columns: if values is None, create list of None with correct length
+                if values is None:
+                    values = [None] * len(self.features_df)
                 self.features_df = self.features_df.with_columns(
                     pl.Series(col, values, dtype=pl.Object).alias(col),
                 )
@@ -2027,7 +2084,7 @@ def load_dataframe_from_h5_group(
     for col in schema_columns:
         if col not in group:
             if logger:
-                logger.
+                logger.info(f"Column '{col}' not found in {df_name}.")
             data[col] = None
             missing_columns.append(col)
             continue
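Taken together, the save and load hunks above implement a straightforward round trip for the new ms1_spec column: optional numpy arrays are stored as JSON strings (with the literal string "None" standing in for missing values) and parsed back into float64 arrays on load. A self-contained sketch of that round trip; the helper names here are illustrative and not part of masster.

```python
import json
import numpy as np

def serialize_ms1_spec(column):
    """Turn a list of Optional[np.ndarray] into JSON strings for HDF5 storage."""
    return ["None" if arr is None else json.dumps(arr.tolist()) for arr in column]

def deserialize_ms1_spec(column):
    """Invert serialize_ms1_spec, tolerating bytes and malformed entries."""
    out = []
    for item in column:
        if isinstance(item, bytes):
            item = item.decode("utf-8")
        if item in ("None", ""):
            out.append(None)
            continue
        try:
            out.append(np.array(json.loads(item), dtype=np.float64))
        except (json.JSONDecodeError, ValueError, TypeError):
            out.append(None)
    return out

specs = [np.array([[100.05, 1.0e6], [101.05, 2.0e5]]), None]
restored = deserialize_ms1_spec(serialize_ms1_spec(specs))
assert restored[1] is None and np.allclose(restored[0], specs[0])
```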
masster/sample/load.py
CHANGED
@@ -48,9 +48,14 @@ from tqdm import tqdm
 from masster.chromatogram import Chromatogram
 from masster.spectrum import Spectrum
 
+# Suppress pyOpenMS warnings globally
+warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH.*", category=UserWarning)
+warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
+
 # Import pyopenms with suppressed warnings
 with warnings.catch_warnings():
-    warnings.filterwarnings("ignore", message="
+    warnings.filterwarnings("ignore", message=".*OPENMS_DATA_PATH environment variable already exists.*", category=UserWarning)
+    warnings.filterwarnings("ignore", message="Warning: OPENMS_DATA_PATH.*", category=UserWarning)
     import pyopenms as oms
 
 
@@ -633,6 +638,7 @@ def _load_wiff(
                     mz=peaks.mz.values,
                     inty=peaks.intensity.values,
                     ms_level=ms_level,
+                    centroided=False,  # WIFF files always contain profile data
                 )
                 bl = spect.baseline()
                 spect = spect.denoise(threshold=bl)
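The _load_wiff change simply flags WIFF scans as profile data when each Spectrum is built, before baseline removal and denoising. A hedged sketch of that call sequence, using only the keyword names and method calls visible in the hunk above; any additional required Spectrum arguments are omitted, and the values are made up.

```python
import numpy as np
from masster.spectrum import Spectrum

# Profile-mode points around a single peak (synthetic values).
mz = np.array([100.00, 100.01, 100.02, 100.03])
inty = np.array([5.0, 80.0, 120.0, 7.0])

spect = Spectrum(
    mz=mz,
    inty=inty,
    ms_level=1,
    centroided=False,  # WIFF files always contain profile data
)
bl = spect.baseline()                # baseline estimate, as in _load_wiff
spect = spect.denoise(threshold=bl)  # denoise relative to that baseline
```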
masster/sample/plot.py
CHANGED
@@ -387,18 +387,19 @@ def plot_2d(
     show_only_features_with_ms2=False,
     show_isotopes=False,
     show_ms2=False,
+    show_in_browser=False,
     title=None,
     cmap=None,
     marker="circle",
     markersize=10,
-    size="
+    size="static",
     raster_dynamic=True,
     raster_max_px=8,
     raster_threshold=0.8,
     height=600,
     width=800,
     mz_range=None,
-    rt_range=None
+    rt_range=None
 ):
     """
     Plot a two-dimensional visualization of MS1 survey scan data with optional overlays
@@ -634,8 +635,10 @@ def plot_2d(
             ("m/z", "@mz{0.0000}"),
             ("feature_uid", "@feature_uid"),
             ("inty", "@inty"),
-            ("
-            ("
+            ("iso", "@iso"),
+            ("adduct", "@adduct"),
+            ("chrom_coherence", "@chrom_coherence"),
+            ("chrom_prominence_scaled", "@chrom_prominence_scaled"),
         ],
     )
     feature_points_1 = hv.Points(
@@ -644,8 +647,8 @@ def plot_2d(
         vdims=[
             "feature_uid",
             "inty",
-            "
-            "
+            "iso",
+            "adduct",
             "ms2_scans",
             "chrom_coherence",
             "chrom_prominence_scaled",
@@ -666,8 +669,10 @@ def plot_2d(
             ("m/z", "@mz{0.0000}"),
             ("feature_uid", "@feature_uid"),
             ("inty", "@inty"),
-            ("
-            ("
+            ("iso", "@iso"),
+            ("adduct", "@adduct"),
+            ("chrom_coherence", "@chrom_coherence"),
+            ("chrom_prominence_scaled", "@chrom_prominence_scaled"),
         ],
     )
     feature_points_2 = hv.Points(
@@ -676,8 +681,8 @@ def plot_2d(
         vdims=[
             "feature_uid",
             "inty",
-            "
-            "
+            "iso",
+            "adduct",
             "chrom_coherence",
             "chrom_prominence_scaled",
         ],
@@ -702,10 +707,11 @@ def plot_2d(
             ("m/z", "@mz{0.0000}"),
             ("feature_uid", "@feature_uid"),
             ("inty", "@inty"),
-            ("quality", "@quality"),
-            ("rt_delta", "@rt_delta"),
             ("iso", "@iso"),
             ("iso_of", "@iso_of"),
+            ("adduct", "@adduct"),
+            ("chrom_coherence", "@chrom_coherence"),
+            ("chrom_prominence_scaled", "@chrom_prominence_scaled"),
         ],
     )
     feature_points_iso = hv.Points(
@@ -714,10 +720,9 @@ def plot_2d(
         vdims=[
             "feature_uid",
             "inty",
-            "quality",
-            "rt_delta",
             "iso",
             "iso_of",
+            "adduct",
             "chrom_coherence",
             "chrom_prominence_scaled",
         ],
@@ -918,21 +923,24 @@ def plot_2d(
             else:
                 # For slider plots, save the current state
                 hv.save(create_feature_overlay(markersize), filename, fmt="png")
-                return None
         else:
-            #
-
+            # Use show() for display in notebook
+            layout.show()
     else:
         # Create a panel layout without slider
         layout = panel.Column(overlay)
 
+        # Handle display logic based on show_in_browser and raster_dynamic
         if filename is not None:
             # Use consistent save/display behavior
             self._handle_sample_plot_output(layout, filename, "panel")
-            return None
         else:
-            #
-
+            # Show in browser if both show_in_browser and raster_dynamic are True
+            if show_in_browser and raster_dynamic:
+                layout.show()
+            else:
+                # Return to notebook for inline display
+                return layout
 
 
 def plot_2d_oracle(
@@ -1952,11 +1960,10 @@ def plot_feature_stats(
     filename=None,
 ):
     """
-    Generates
+    Generates vertically stacked density plots for selected feature metrics.
     The distributions are created separately for features with and without MS2 data.
-    Metrics include
-
-    differences between features that are linked to MS2 spectra and those that are not.
+    Metrics include mz, rt, log10(inty), chrom_coherence, chrom_prominence, and chrom_prominence_scaled.
+    The plots help to visualize the distribution differences between features that are linked to MS2 spectra and those that are not.
 
     Parameters:
         filename (str, optional): The output filename. If the filename ends with ".html",
@@ -1972,54 +1979,28 @@ def plot_feature_stats(
     # Convert to pandas for operations that require pandas functionality
     if hasattr(feats, "to_pandas"):
         feats = feats.to_pandas()
-    # Compute m/z delta for each feature
-    feats["mz_delta"] = feats["mz_end"] - feats["mz_start"]
-    # Add a column with the number of peaks in the MS2 spectrum
-    feats["MS2peaks"] = feats["ms2_specs"].apply(
-        lambda x: len(x[0]) if x is not None else 0,
-    )
-    # Add a column with the sum of intensities in the MS2 spectrum
-    feats["MS2int"] = feats["ms2_specs"].apply(
-        lambda x: sum(x[0].inty) if x is not None else 0,
-    )
 
-    #
-    feats["MS2toMS1"] = feats["MS2int"] / feats["inty"]
-    # Apply log10 transformation to intensity, quality, and MS2int columns (handling non-positive values)
+    # Apply log10 transformation to intensity (handling non-positive values)
     feats["inty"] = np.where(feats["inty"] <= 0, np.nan, np.log10(feats["inty"]))
-
-    #
-
-    # )
-
-    feats["quality"] = np.where(
-        feats["quality"] <= 0,
-        np.nan,
-        np.log10(feats["quality"]),
-    )
-    feats["MS2int"] = np.where(feats["MS2int"] <= 0, np.nan, np.log10(feats["MS2int"]))
+
+    # Apply log10 transformation to quality (handling non-positive values)
+    feats["quality"] = np.where(feats["quality"] <= 0, np.nan, np.log10(feats["quality"]))
 
     # Separate features based on presence of MS2 data
     feats_with_MS2 = feats[feats["ms2_scans"].notnull()]
     feats_without_MS2 = feats[feats["ms2_scans"].isnull()]
 
-    # Define the metrics to plot
+    # Define the specific metrics to plot
     cols_to_plot = [
         "mz",
-        "
-        "inty",
-        "quality",
-        "rt",
+        "rt",
+        "inty",  # Already log10 transformed above
         "rt_delta",
+        "quality",  # Already log10 transformed above
         "chrom_coherence",
         "chrom_prominence",
         "chrom_prominence_scaled",
-
-        # "chrom_heights",
-        # "chrom_heights_scaled",
-        "MS2peaks",
-        "MS2int",
-        "MS2toMS1",
+        "chrom_height_scaled",
     ]
 
     # Ensure an index column is available for plotting
@@ -2032,29 +2013,39 @@ def plot_feature_stats(
         data_with = feats_with_MS2[col].dropna().values
         data_without = feats_without_MS2[col].dropna().values
 
-        # Create distribution elements for
+        # Create distribution elements - Green for WITH MS2, Red for WITHOUT MS2
        dist_with = hv.Distribution(data_with, label="With MS2").opts(
-            color="
+            color="green",
             alpha=0.6,
         )
         dist_without = hv.Distribution(data_without, label="Without MS2").opts(
-            color="
+            color="red",
             alpha=0.6,
         )
 
         # Overlay the distributions with a legend and hover tool enabled
+        title = col
+        if col == "inty":
+            title = "log10(inty)"
+        elif col == "quality":
+            title = "log10(quality)"
+
         overlay = (dist_with * dist_without).opts(
-            title=
+            title=title,
             show_legend=True,
             tools=["hover"],
         )
         density_plots.append(overlay)
 
-    # Arrange the plots in a layout
+    # Arrange the plots in a grid layout (3 columns for 7 plots)
     layout = hv.Layout(density_plots).cols(3).opts(shared_axes=False)
 
     # Use consistent save/display behavior
-
+    if filename is not None:
+        self._handle_sample_plot_output(layout, filename, "holoviews")
+    else:
+        # Return the layout directly for notebook display
+        return layout
 
 
 def plot_tic(
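The reworked plot_feature_stats reduces to one pattern per metric: overlay two hv.Distribution elements (green for features with MS2, red for those without), give the overlay a per-metric title, and tile the overlays in a three-column layout. A standalone sketch of that pattern with synthetic data; the values and filename here are placeholders.

```python
import numpy as np
import holoviews as hv

hv.extension("bokeh")

# Synthetic intensity distributions for the two feature groups.
with_ms2 = np.log10(np.random.lognormal(mean=6.0, sigma=0.5, size=500))
without_ms2 = np.log10(np.random.lognormal(mean=5.0, sigma=0.7, size=500))

dist_with = hv.Distribution(with_ms2, label="With MS2").opts(color="green", alpha=0.6)
dist_without = hv.Distribution(without_ms2, label="Without MS2").opts(color="red", alpha=0.6)

overlay = (dist_with * dist_without).opts(title="log10(inty)", show_legend=True, tools=["hover"])
layout = hv.Layout([overlay]).cols(3).opts(shared_axes=False)

# hv.save(layout, "feature_stats.html")  # or return/display the layout in a notebook
```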
masster/sample/processing.py
CHANGED
@@ -1273,3 +1273,161 @@ def find_ms2(self, **kwargs):
     self.logger.debug(
         "Parameters stored to find_ms2",
     )
+
+
+def find_iso(self, rt_tolerance: float = 0.1, **kwargs):
+    """Extract isotopic distributions from MS1 data and add to features_df.
+
+    This method processes each feature to find isotopic distributions from MS1 data,
+    similar to the study.find_iso() method but for individual samples. The method
+    adds a new 'ms1_spec' column to features_df containing numpy arrays with
+    isotopic distribution data.
+
+    Args:
+        rt_tolerance (float): RT tolerance in minutes for matching MS1 scans. Default 0.1.
+        **kwargs: Additional parameters
+
+    Notes:
+        - Adds a new 'ms1_spec' column to features_df containing numpy arrays
+        - Each array contains [mz, intensity] pairs for the isotopic distribution
+        - Uses the same isotope shift pattern as study.find_iso()
+        - Only processes features that don't already have ms1_spec data
+    """
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features found. Run find_features() first.")
+        return
+
+    if self.ms1_df is None or self.ms1_df.is_empty():
+        self.logger.warning("No MS1 data found.")
+        return
+
+    # Check if ms1_spec column already exists
+    if "ms1_spec" in self.features_df.columns:
+        features_without_spec = self.features_df.filter(pl.col("ms1_spec").is_null())
+        if features_without_spec.is_empty():
+            self.logger.info("All features already have isotopic distributions.")
+            return
+        self.logger.info(f"Processing {len(features_without_spec)} features without isotopic distributions.")
+    else:
+        # Add the ms1_spec column with None values
+        self.features_df = self.features_df.with_columns(
+            pl.lit(None, dtype=pl.Object).alias("ms1_spec")
+        )
+        features_without_spec = self.features_df
+        self.logger.info(f"Processing {len(features_without_spec)} features for isotopic distributions.")
+
+    # Define isotope shifts (same as study.find_iso)
+    isotope_shifts = np.array([
+        0.33,
+        0.50,
+        0.66,
+        1.00335,
+        1.50502,
+        2.00670,
+        3.01005,
+        4.01340,
+        5.01675,
+        6.02010,
+        7.02345,
+    ])
+
+    # Convert rt_tolerance from minutes to seconds
+    rt_tolerance_s = rt_tolerance * 60
+
+    # Process each feature
+    ms1_specs = []
+    feature_indices = []
+
+    for i, row in enumerate(tqdm(
+        features_without_spec.rows(named=True),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extracting isotope patterns"
+    )):
+        feature_rt = row["rt"]
+        feature_mz = row["mz"]
+
+        # Find MS1 scans within RT tolerance
+        rt_mask = (
+            (self.ms1_df["rt"] >= (feature_rt - rt_tolerance_s)) &
+            (self.ms1_df["rt"] <= (feature_rt + rt_tolerance_s))
+        )
+        ms1_in_range = self.ms1_df.filter(rt_mask)
+
+        if ms1_in_range.is_empty():
+            ms1_specs.append(None)
+            feature_indices.append(row["feature_uid"])
+            continue
+
+        # Extract isotopic pattern
+        isotope_pattern = []
+
+        # Start with the monoisotopic peak (M+0)
+        base_intensity = 0
+        mz_tolerance = 0.01  # 10 ppm at 1000 Da
+
+        # Find the base peak intensity
+        base_mask = (
+            (ms1_in_range["mz"] >= (feature_mz - mz_tolerance)) &
+            (ms1_in_range["mz"] <= (feature_mz + mz_tolerance))
+        )
+        base_peaks = ms1_in_range.filter(base_mask)
+
+        if not base_peaks.is_empty():
+            base_intensity = base_peaks["inty"].max()
+            isotope_pattern.append([feature_mz, base_intensity])
+
+        # Look for isotope peaks
+        for shift in isotope_shifts:
+            isotope_mz = feature_mz + shift
+            isotope_mask = (
+                (ms1_in_range["mz"] >= (isotope_mz - mz_tolerance)) &
+                (ms1_in_range["mz"] <= (isotope_mz + mz_tolerance))
+            )
+            isotope_peaks = ms1_in_range.filter(isotope_mask)
+
+            if not isotope_peaks.is_empty():
+                max_intensity = isotope_peaks["inty"].max()
+                # Only keep isotope peaks that are at least 1% of base peak
+                if base_intensity > 0 and max_intensity >= 0.01 * base_intensity:
+                    # Get the mz of the most intense peak
+                    max_peak = isotope_peaks.filter(pl.col("inty") == max_intensity).row(0, named=True)
+                    isotope_pattern.append([max_peak["mz"], max_intensity])
+
+        # Convert to numpy array or None if empty
+        if len(isotope_pattern) > 1:  # Need at least 2 points (monoisotopic + 1 isotope)
+            ms1_spec = np.array(isotope_pattern, dtype=np.float64)
+        else:
+            ms1_spec = None
+
+        ms1_specs.append(ms1_spec)
+        feature_indices.append(row["feature_uid"])
+
+    # Update the features_df with the isotopic spectra
+    update_df = pl.DataFrame({
+        "feature_uid": feature_indices,
+        "ms1_spec_new": pl.Series("ms1_spec_new", ms1_specs, dtype=pl.Object)
+    })
+
+    # Join and update
+    self.features_df = (
+        self.features_df.join(
+            update_df,
+            on="feature_uid",
+            how="left"
+        )
+        .with_columns([
+            pl.when(pl.col("ms1_spec_new").is_not_null())
+            .then(pl.col("ms1_spec_new"))
+            .otherwise(pl.col("ms1_spec"))
+            .alias("ms1_spec")
+        ])
+        .drop("ms1_spec_new")
+    )
+
+    # Log results
+    non_null_count = len([spec for spec in ms1_specs if spec is not None])
+    self.logger.info(f"Extracted isotopic distributions for {non_null_count}/{len(ms1_specs)} features.")
+
+    # Store parameters in history
+    params_dict = {"rt_tolerance": rt_tolerance}
+    params_dict.update(kwargs)
+    self.store_history(["find_iso"], params_dict)
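The core of the new find_iso is a windowed lookup per feature: restrict MS1 peaks to the feature's RT window, take the most intense peak near the feature m/z as the monoisotopic reference, then test each isotope shift within a fixed m/z tolerance and keep hits that reach at least 1% of the base intensity. A reduced, single-feature sketch of that logic on toy data; the column names follow the hunk above, the DataFrame itself is fabricated.

```python
import numpy as np
import polars as pl

# Toy MS1 peak table with the columns used by find_iso.
ms1_df = pl.DataFrame({
    "rt":   [120.0, 120.1, 120.2, 120.1, 120.1],
    "mz":   [300.100, 301.103, 302.107, 450.200, 300.101],
    "inty": [1.0e6, 2.2e5, 3.0e4, 5.0e5, 9.5e5],
})

feature_rt, feature_mz = 120.1, 300.100
rt_tol_s, mz_tol = 0.1 * 60, 0.01

in_range = ms1_df.filter(
    (pl.col("rt") >= feature_rt - rt_tol_s) & (pl.col("rt") <= feature_rt + rt_tol_s)
)

pattern = []
base = in_range.filter((pl.col("mz") - feature_mz).abs() <= mz_tol)
if not base.is_empty():
    base_inty = base["inty"].max()
    pattern.append([feature_mz, base_inty])
    for shift in (1.00335, 2.00670, 3.01005):  # subset of the shifts listed above
        hits = in_range.filter((pl.col("mz") - (feature_mz + shift)).abs() <= mz_tol)
        if not hits.is_empty() and hits["inty"].max() >= 0.01 * base_inty:
            best = hits.filter(pl.col("inty") == hits["inty"].max())
            pattern.append([best["mz"][0], best["inty"][0]])

ms1_spec = np.array(pattern, dtype=np.float64) if len(pattern) > 1 else None
print(ms1_spec)
```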
masster/sample/sample.py
CHANGED
@@ -97,6 +97,7 @@ from masster.sample.processing import _get_ztscan_stats
 from masster.sample.processing import _spec_to_mat
 from masster.sample.processing import analyze_dda
 from masster.sample.processing import find_features
+from masster.sample.processing import find_iso
 from masster.sample.processing import find_ms2
 from masster.sample.processing import get_spectrum
 from masster.sample.parameters import store_history
@@ -218,6 +219,7 @@ class Sample:
     save = save
     find_features = find_features
     find_adducts = find_adducts
+    find_iso = find_iso
     find_ms2 = find_ms2
     get_spectrum = get_spectrum
     filter = features_filter