PyPI - masster - Versions diffs - 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl - Mend

masster 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (21)

masster/_version.py +1 -1
masster/sample/h5.py +577 -0
masster/sample/helpers.py +9 -2
masster/sample/load.py +68 -7
masster/sample/plot.py +43 -34
masster/sample/sample.py +4 -0
masster/spectrum.py +3 -0
masster/study/defaults/fill_def.py +3 -3
masster/study/defaults/study_def.py +20 -0
masster/study/export.py +3 -0
masster/study/h5.py +120 -23
masster/study/helpers.py +482 -11
masster/study/load.py +566 -205
masster/study/plot.py +9 -2
masster/study/study.py +32 -13
masster/study/study5_schema.json +17 -5
{masster-0.3.14.dist-info → masster-0.3.16.dist-info}/METADATA +1 -1
{masster-0.3.14.dist-info → masster-0.3.16.dist-info}/RECORD +21 -21
{masster-0.3.14.dist-info → masster-0.3.16.dist-info}/WHEEL +0 -0
{masster-0.3.14.dist-info → masster-0.3.16.dist-info}/entry_points.txt +0 -0
{masster-0.3.14.dist-info → masster-0.3.16.dist-info}/licenses/LICENSE +0 -0

masster/sample/load.py CHANGED Viewed

@@ -110,6 +110,63 @@ def load(
         self.label = label
+def load_study(
+    self,
+    filename=None,
+    ondisk=False,
+    type=None,
+    label=None,
+):
+    """
+    Optimized load method for study use that skips loading ms1_df for better performance.
+    This method is identical to load() but uses _load_sample5_study() for .sample5 files,
+    which skips reading the potentially large ms1_df dataset to improve throughput when
+    adding samples to studies.
+    Args:
+        filename (str, optional): The path to the file to load. If None, uses self.file_path.
+        ondisk (bool, optional): Whether to load on-disk or in-memory. Defaults to False.
+        type (str, optional): Override file type detection. Can be "ztscan". Defaults to None.
+        label (str, optional): Override sample label. Defaults to None.
+    Raises:
+        FileNotFoundError: If the specified file doesn't exist.
+        ValueError: If the file format is not supported.
+    Notes:
+        - Only affects .sample5 files (uses _load_sample5_study instead of _load_sample5)
+        - Other file formats (.mzML, .wiff, .raw) are loaded normally
+        - Sets ms1_df = None for .sample5 files to save memory and loading time
+        - Recommended for study workflows where MS1 spectral data is not needed
+    """
+    if filename is None:
+        filename = self.file_path
+    filename = os.path.abspath(filename)
+    if not os.path.exists(filename):
+        raise FileNotFoundError("Filename not valid. Provide a valid file path.")
+    self.ondisk = ondisk
+    # check if file is mzML
+    if filename.lower().endswith(".mzml"):
+        self._load_mzML(filename)
+    elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
+        self._load_wiff(filename)
+    elif filename.lower().endswith(".raw"):
+        self._load_raw(filename)
+    elif filename.lower().endswith(".sample5"):
+        self._load_sample5_study(filename)  # Use optimized version for study loading
+    else:
+        raise ValueError("File must be .mzML, .wiff, *.raw, or .sample5")
+    self.file_type = "dda"
+    if type is not None and type.lower() in ["ztscan"]:
+        self.file_type = "ztscan"
+    if label is not None:
+        self.label = label
 def _load_mzML(
     self,
     filename=None,
@@ -379,18 +436,23 @@ def _load_raw(
             mz=peaks.mz.values,
             inty=peaks.intensity.values,
             ms_level=s["ms_level"],
-            centroided=False,
         )
         # remove peaks with intensity <= 0
         bl = spect.baseline()
         spect = spect.denoise(threshold=bl)
         if spect.ms_level == 1:
-            spect = spect.centroid(
-                tolerance=self.parameters.mz_tol_ms1_da,
-                ppm=self.parameters.mz_tol_ms1_ppm,
-                min_points=self.parameters.centroid_min_points_ms1,
-            )
+            # Use the same logic as mzML loading
+            mz = np.array(spect.mz)
+            median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None
+            if median_diff is not None and median_diff < 0.01:
+                spect = spect.centroid(
+                    tolerance=self.parameters.mz_tol_ms1_da,
+                    ppm=self.parameters.mz_tol_ms1_ppm,
+                    min_points=self.parameters.centroid_min_points_ms1,
+                )
         newscan = {
             "scan_uid": i,
             "cycle": cycle,
@@ -544,7 +606,6 @@ def _load_wiff(
             mz=peaks.mz.values,
             inty=peaks.intensity.values,
             ms_level=ms_level,
-            centroided=False,
         )
         bl = spect.baseline()
         spect = spect.denoise(threshold=bl)

masster/sample/plot.py CHANGED Viewed

@@ -56,7 +56,6 @@ from bokeh.models import HoverTool
 from holoviews import dim
 from holoviews.plotting.util import process_cmap
 from matplotlib.colors import rgb2hex
-from masster.chromatogram import Chromatogram
 # Parameters removed - using hardcoded defaults
@@ -75,23 +74,36 @@ def _is_notebook_environment():
         # Check for Jupyter/JupyterLab
         from IPython import get_ipython
-        if get_ipython() is not None:
+        ipython = get_ipython()
+        if ipython is not None:
             # Check if we're in a notebook context
-            shell = get_ipython().__class__.__name__
+            shell = ipython.__class__.__name__
             if shell in ["ZMQInteractiveShell", "Shell"]:  # Jupyter notebook/lab
                 return True
-        # Check for Marimo
+        # Check for Marimo - multiple ways to detect it
         import sys
+        # Check if marimo is in modules
         if "marimo" in sys.modules:
             return True
-        # Additional check for notebook environments
+        # Check for marimo in the call stack or environment
+        import inspect
+        frame = inspect.currentframe()
+        try:
+            while frame:
+                if frame.f_globals.get("__name__", "").startswith("marimo"):
+                    return True
+                frame = frame.f_back
+        finally:
+            del frame
+        # Additional check for notebook environments via builtins
         if hasattr(__builtins__, "__IPYTHON__") or hasattr(__builtins__, "_ih"):
             return True
-    except ImportError:
+    except (ImportError, AttributeError):
         pass
     return False
@@ -106,22 +118,17 @@ def _display_plot(plot_object, layout=None):
         layout: Optional panel layout object
     Returns:
-        The layout object if in notebook environment, None otherwise
+        The plot object for inline display in notebooks, None for browser display
     """
     if _is_notebook_environment():
-        # Display inline in notebook
-        try:
-            # For Jupyter notebooks, just return the plot object -
-            # holoviews will handle the display automatically
+        # In notebook environments, return the plot object for inline display
+        # For Jupyter notebooks, holoviews/panel objects display automatically when returned
+        if layout is not None:
+            # Return the layout object which will display inline in notebooks
+            return layout
+        else:
+            # Return the plot object directly for holoviews automatic display
             return plot_object
-        except Exception:
-            # Fallback to panel display for other notebook environments
-            if layout is not None:
-                return layout
-            else:
-                # Create a simple layout if none provided
-                simple_layout = panel.Column(plot_object)
-                return simple_layout
     else:
         # Display in browser (original behavior)
         if layout is not None:
@@ -512,7 +519,7 @@ def plot_2d(
             feats = feats.to_pandas()
         # if ms2_scans is not null, keep only the first element of the list
         feats["ms2_scans"] = feats["ms2_scans"].apply(
-            lambda x: x[0] if type(x) == list else x,
+            lambda x: x[0] if isinstance(x, list) else x,
         )
         if mz_range is not None:
             feats = feats[(feats["mz"] >= mz_range[0]) & (feats["mz"] <= mz_range[1])]
@@ -707,8 +714,6 @@ def plot_2d(
         class MarkerSizeController(param.Parameterized):
             size_slider = param.Number(default=markersize, bounds=(1, 20), step=0.5)
-        controller = MarkerSizeController()
         # Create a function that generates just the feature overlays with different sizes
         def create_feature_overlay(size_val):
             feature_overlay = None
@@ -808,7 +813,17 @@ def plot_2d(
         # Create layout
         layout = on.Column(slider_widget, reactive_plot, sizing_mode="stretch_width")
-        return layout
+        # Handle filename saving for slider mode
+        if filename is not None:
+            if filename.endswith(".html"):
+                layout.save(filename, embed=True)
+            else:
+                # For slider plots, save the current state
+                hv.save(create_feature_overlay(markersize), filename, fmt="png")
+            return None
+        else:
+            # For notebook display, return the interactive layout
+            return _display_plot(layout, layout)
     else:
         # Create a panel layout without slider
         layout = panel.Column(overlay)
@@ -819,17 +834,11 @@ def plot_2d(
             layout.save(filename, embed=True)
         else:
             # save the panel layout as a png
-            if use_slider_sizing:
-                # For slider plots, save the current state of the param_plot
-                hv.save(create_feature_overlay(markersize), filename, fmt="png")
-            else:
-                hv.save(overlay, filename, fmt="png")
+            hv.save(overlay, filename, fmt="png")
+        return None
     else:
         # Check if we're in a notebook environment and display appropriately
-        if use_slider_sizing:
-            return _display_plot(layout, layout)
-        else:
-            return _display_plot(overlay, layout)
+        return _display_plot(overlay, layout)
 def plot_2d_oracle(
@@ -982,7 +991,7 @@ def plot_2d_oracle(
         oracle_data = pd.read_csv(
             os.path.join(oracle_folder, "diag", "summary_by_feature.csv"),
         )
-    except:
+    except Exception:
         print(f"Could not read {oracle_folder}/diag/summary_by_feature.csv")
         return

masster/sample/sample.py CHANGED Viewed

@@ -49,6 +49,7 @@ from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 # Sample-specific imports
 from masster.sample.h5 import _load_sample5
+from masster.sample.h5 import _load_sample5_study
 from masster.sample.h5 import _save_sample5
 from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
@@ -72,6 +73,7 @@ from masster.sample.load import _load_wiff
 from masster.sample.load import chrom_extract
 from masster.sample.load import index_file
 from masster.sample.load import load
+from masster.sample.load import load_study
 from masster.sample.load import sanitize
 from masster.sample.plot import plot_2d
 from masster.sample.plot import plot_2d_oracle
@@ -203,6 +205,7 @@ class Sample:
     # Attach module functions as class methods
     load = load
+    load_study = load_study
     save = save
     find_features = find_features
     find_adducts = find_adducts
@@ -243,6 +246,7 @@ class Sample:
     # Additional method assignments for all imported functions
     _load_sample5 = _load_sample5
+    _load_sample5_study = _load_sample5_study
     _save_sample5 = _save_sample5
     _delete_ms2 = _delete_ms2
     _estimate_memory_usage = _estimate_memory_usage

masster/spectrum.py CHANGED Viewed

@@ -229,6 +229,9 @@ class Spectrum:
             elif isinstance(value, (list, dict)):
                 # Create copies of mutable objects
                 result[key] = copy.deepcopy(value)
+            elif isinstance(value, np.number):
+                # Handle numpy scalar types (float32, int32, etc.)
+                result[key] = value.item()
             else:
                 # Immutable objects can be copied directly
                 result[key] = value

masster/study/defaults/fill_def.py CHANGED Viewed

@@ -23,7 +23,7 @@ class fill_defaults:
     uids: Optional[list] = None
     mz_tol: float = 0.010
     rt_tol: float = 10.0
-    min_samples_rel: float = 0.05
+    min_samples_rel: float = 0.00
     min_samples_abs: int = 5
     _param_metadata: dict[str, dict[str, Any]] = field(
@@ -37,7 +37,7 @@ class fill_defaults:
                 "dtype": float,
                 "description": "m/z tolerance for chromatogram extraction (Da)",
                 "default": 0.010,
-                "min_value": 0.001,
+                "min_value": 0.0002,
                 "max_value": 0.1,
             },
             "rt_tol": {
@@ -51,7 +51,7 @@ class fill_defaults:
                 "dtype": float,
                 "description": "Minimum relative samples threshold (fraction)",
                 "default": 0.05,
-                "min_value": 0.01,
+                "min_value": 0.0,
                 "max_value": 1.0,
             },
             "min_samples_abs": {

masster/study/defaults/study_def.py CHANGED Viewed

@@ -18,6 +18,9 @@ class study_defaults:
         log_level (str): Logging level to be set for the logger. Default is "INFO".
         log_label (Optional[str]): Optional label for the logger. Default is None.
         log_sink (str): Output sink for logging. Default is "sys.stdout".
+        polarity (str): Polarity of the study (positive/negative). Default is "positive".
+        eic_mz_tol (float): Default m/z tolerance for EIC extraction and consensus selection. Default is 0.01.
+        eic_rt_tol (float): Default RT tolerance for EIC extraction and consensus selection. Default is 10.0.
     """
     folder: Optional[str] = None
@@ -27,6 +30,9 @@ class study_defaults:
     log_sink: str = "sys.stdout"
     polarity: str = "positive"
+    eic_mz_tol: float = 0.01
+    eic_rt_tol: float = 10.0
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "folder": {
@@ -61,6 +67,20 @@ class study_defaults:
                 "default": "positive",
                 "allowed_values": ["positive", "negative", "pos", "neg"],
             },
+            "eic_mz_tol": {
+                "dtype": float,
+                "description": "Default m/z tolerance for EIC extraction and consensus selection (Da)",
+                "default": 0.01,
+                "min_value": 0.001,
+                "max_value": 1.0,
+            },
+            "eic_rt_tol": {
+                "dtype": float,
+                "description": "Default RT tolerance for EIC extraction and consensus selection (seconds)",
+                "default": 10.0,
+                "min_value": 0.2,
+                "max_value": 60.0,
+            },
         },
         repr=False,
     )

masster/study/export.py CHANGED Viewed

@@ -180,6 +180,9 @@ def _get_mgf_df(self, **kwargs):
             for e in energies:
                 cons_ms2_e = cons_ms2[cons_ms2["energy"] == e]
                 if selection == "best":
+                    # Check if the filtered DataFrame is empty
+                    if len(cons_ms2_e) == 0:
+                        continue
                     idx = cons_ms2_e["prec_inty"].idxmax()
                     cons_ms2_e_row = cons_ms2_e.loc[idx]
                     spect = cons_ms2_e_row["spec"]

masster/study/h5.py CHANGED Viewed

@@ -695,19 +695,59 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     if schema_columns is None:
         schema_columns = []
-    # First pass: load all existing columns
+    # Get available columns from HDF5 file
+    hdf5_columns = list(group.keys())
+    logger.debug(f"HDF5 columns available: {hdf5_columns}")
+    # Handle column name migrations for backward compatibility first
+    if df_name == "samples_df":
+        # Migrate old column names to new names
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+        # Create a mapping of what's actually available after migrations
+        effective_columns = hdf5_columns.copy()
+        for old_name, new_name in column_migrations.items():
+            if old_name in effective_columns:
+                logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
+                # Add the new name to effective columns and optionally remove old name
+                effective_columns.append(new_name)
+    # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
-        if col not in group:
+        source_col = col
+        # Check if we need to load from a migrated column name
+        if df_name == "samples_df":
+            column_migrations = {
+                "size": "num_features",
+                "file_source": "sample_source",
+                "ms1": "num_ms1",
+                "ms2": "num_ms2"
+            }
+            # Reverse lookup - find old name for new name
+            reverse_migrations = {v: k for k, v in column_migrations.items()}
+            if col in reverse_migrations:
+                old_name = reverse_migrations[col]
+                if old_name in group:
+                    source_col = old_name
+                    logger.info(f"Loading '{col}' from old column name '{old_name}'")
+        if source_col not in group:
             missing_columns.append(col)
             continue
         dtype = schema[df_name]["columns"][col].get("dtype", "native")
         if dtype == "pl.Object" or col in object_columns:
             # Handle object columns specially
-            data[col] = _reconstruct_object_column(group[col][:], col)
+            data[col] = _reconstruct_object_column(group[source_col][:], col)
         else:
             # Regular columns
-            column_data = group[col][:]
+            column_data = group[source_col][:]
             # Convert -123 sentinel values back to None for numeric columns
             if len(column_data) > 0:
@@ -759,17 +799,43 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Second pass: handle missing columns
     for col in missing_columns:
         logger.warning(f"Column '{col}' not found in {df_name}.")
-        # For missing columns, create appropriately sized array of None values
+        # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
             logger.debug(f"Created missing object column '{col}' with length {expected_length}")
         else:
-            data[col] = [None] * expected_length
-            logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            # Provide specific default values for new columns for backward compatibility
+            if df_name == "samples_df":
+                if col == "sample_group":
+                    data[col] = [""] * expected_length  # Empty string default
+                    logger.debug(f"Created missing column '{col}' with empty string defaults")
+                elif col == "sample_batch":
+                    data[col] = [1] * expected_length  # Batch 1 default
+                    logger.debug(f"Created missing column '{col}' with batch 1 defaults")
+                elif col == "sample_sequence":
+                    # Create increasing sequence numbers
+                    data[col] = list(range(1, expected_length + 1))
+                    logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
+                else:
+                    data[col] = [None] * expected_length
+                    logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            else:
+                data[col] = [None] * expected_length
+                logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
-    hdf5_columns = list(group.keys())
-    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
+    # But skip the old column names we already migrated
+    migrated_old_names = set()
+    if df_name == "samples_df":
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+        migrated_old_names = set(column_migrations.keys())
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]
     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1320,9 +1386,12 @@ def _load_study5(self, filename=None):
                         "sample_type": [],
                         "size": [],
                         "map_id": [],
-                        "file_source": [],
-                        "ms1": [],
-                        "ms2": [],
+                        "sample_source": [],
+                        "num_ms1": [],
+                        "num_ms2": [],
+                        "sample_group": [],
+                        "sample_batch": [],
+                        "sample_sequence": [],
                     },
                     schema={
                         "sample_uid": pl.Int64,
@@ -1330,10 +1399,13 @@ def _load_study5(self, filename=None):
                         "sample_path": pl.Utf8,
                         "sample_type": pl.Utf8,
                         "size": pl.Int64,
-                        "map_id": pl.Utf8,
-                        "file_source": pl.Utf8,
-                        "ms1": pl.Int64,
-                        "ms2": pl.Int64,
+                        "map_id": pl.Int64,
+                        "sample_source": pl.Utf8,
+                        "num_ms1": pl.Int64,
+                        "num_ms2": pl.Int64,
+                        "sample_group": pl.Utf8,
+                        "sample_batch": pl.Int64,
+                        "sample_sequence": pl.Int64,
                     },
                 )
             pbar.update(1)
@@ -1354,9 +1426,12 @@ def _load_study5(self, filename=None):
                         "sample_type": [],
                         "size": [],
                         "map_id": [],
-                        "file_source": [],
-                        "ms1": [],
-                        "ms2": [],
+                        "sample_source": [],
+                        "num_ms1": [],
+                        "num_ms2": [],
+                        "sample_group": [],
+                        "sample_batch": [],
+                        "sample_sequence": [],
                     },
                     schema={
                         "sample_uid": pl.Int64,
@@ -1364,10 +1439,13 @@ def _load_study5(self, filename=None):
                         "sample_path": pl.Utf8,
                         "sample_type": pl.Utf8,
                         "size": pl.Int64,
-                        "map_id": pl.Utf8,
-                        "file_source": pl.Utf8,
-                        "ms1": pl.Int64,
-                        "ms2": pl.Int64,
+                        "map_id": pl.Int64,
+                        "sample_source": pl.Utf8,
+                        "num_ms1": pl.Int64,
+                        "num_ms2": pl.Int64,
+                        "sample_group": pl.Utf8,
+                        "sample_batch": pl.Int64,
+                        "sample_sequence": pl.Int64,
                     },
                 )
             pbar.update(1)
@@ -1463,4 +1541,23 @@ def _load_study5(self, filename=None):
                 self.consensus_ms2 = None
             pbar.update(1)
+    # Check and migrate old string-based map_id to integer indices
+    if (self.samples_df is not None and
+        not self.samples_df.is_empty() and
+        self.samples_df['map_id'].dtype == pl.Utf8):
+        self.logger.info("Detected old string-based map_id format, migrating to integer indices")
+        # Convert string-based map_id to integer indices
+        sample_count = len(self.samples_df)
+        new_map_ids = list(range(sample_count))
+        self.samples_df = self.samples_df.with_columns(
+            pl.lit(new_map_ids).alias("map_id")
+        )
+        # Ensure the column is Int64 type
+        self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
+        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
     self.logger.debug("Study loaded")

masster 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl

Potentially problematic release.

masster 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl