masster 0.3.13__py3-none-any.whl → 0.3.15__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

masster/sample/helpers.py CHANGED
@@ -281,7 +281,7 @@ def select_closest_scan(
     return scan


-def get_eic(self, mz, mz_tol=0.01):
+def get_eic(self, mz, mz_tol=None):
     """
     Extract an extracted ion chromatogram (EIC) from `ms1_df` for a target m/z ± mz_tol.

@@ -291,11 +291,18 @@ def get_eic(self, mz, mz_tol=0.01):

     Parameters:
         mz (float): target m/z value
-        mz_tol (float): tolerance around mz (default 0.01)
+        mz_tol (float): tolerance around mz. If None, uses self.parameters.eic_mz_tol or defaults to 0.01

     Returns:
         polars.DataFrame or None: chromatogram with columns ['rt', 'inty'] or None if not available
     """
+    # Use default mz_tol from sample parameters if not provided
+    if mz_tol is None:
+        if hasattr(self, 'parameters') and hasattr(self.parameters, 'eic_mz_tol'):
+            mz_tol = self.parameters.eic_mz_tol
+        else:
+            mz_tol = 0.01  # fallback default
+
     # Validate ms1_df
     if not hasattr(self, "ms1_df") or self.ms1_df is None:
         if hasattr(self, "logger"):
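
Behavior note: `get_eic` now resolves its tolerance at call time, with an explicit argument taking priority over the sample-level `parameters.eic_mz_tol` and a hard-coded 0.01 Da as the last resort. A minimal sketch of that resolution order (the `SampleParams` class and `resolve_mz_tol` helper below are illustrative stand-ins, not masster API):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SampleParams:
    """Illustrative stand-in for the object behind self.parameters."""
    eic_mz_tol: float = 0.01


def resolve_mz_tol(mz_tol: Optional[float], parameters: Optional[SampleParams]) -> float:
    """Mirror the fallback order in get_eic: explicit argument > parameters.eic_mz_tol > 0.01."""
    if mz_tol is not None:
        return mz_tol
    if parameters is not None and hasattr(parameters, "eic_mz_tol"):
        return parameters.eic_mz_tol
    return 0.01  # hard-coded fallback, as in the diff


print(resolve_mz_tol(None, SampleParams(eic_mz_tol=0.005)))  # 0.005, taken from the parameters object
print(resolve_mz_tol(0.02, SampleParams()))                  # 0.02, explicit argument wins
print(resolve_mz_tol(None, None))                            # 0.01, last-resort default
```
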
masster/sample/load.py CHANGED
@@ -379,18 +379,23 @@ def _load_raw(
             mz=peaks.mz.values,
             inty=peaks.intensity.values,
             ms_level=s["ms_level"],
-            centroided=False,
         )
         # remove peaks with intensity <= 0

         bl = spect.baseline()
         spect = spect.denoise(threshold=bl)
+
         if spect.ms_level == 1:
-            spect = spect.centroid(
-                tolerance=self.parameters.mz_tol_ms1_da,
-                ppm=self.parameters.mz_tol_ms1_ppm,
-                min_points=self.parameters.centroid_min_points_ms1,
-            )
+            # Use the same logic as mzML loading
+            mz = np.array(spect.mz)
+            median_diff = np.median(np.diff(np.sort(mz))) if mz.size > 1 else None
+
+            if median_diff is not None and median_diff < 0.01:
+                spect = spect.centroid(
+                    tolerance=self.parameters.mz_tol_ms1_da,
+                    ppm=self.parameters.mz_tol_ms1_ppm,
+                    min_points=self.parameters.centroid_min_points_ms1,
+                )
         newscan = {
             "scan_uid": i,
             "cycle": cycle,
@@ -544,7 +549,6 @@ def _load_wiff(
             mz=peaks.mz.values,
             inty=peaks.intensity.values,
             ms_level=ms_level,
-            centroided=False,
         )
         bl = spect.baseline()
         spect = spect.denoise(threshold=bl)
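
Behavior note: in `_load_raw`, MS1 centroiding is now gated on a profile-mode heuristic described in the diff as matching the mzML loading path: if the median spacing between adjacent m/z values is below 0.01 Da the scan is treated as profile data and centroided, otherwise it is left as-is. A self-contained sketch of that heuristic (toy arrays, not masster code):

```python
import numpy as np


def looks_like_profile(mz: np.ndarray, spacing_threshold: float = 0.01) -> bool:
    """Dense, finely spaced m/z values suggest profile data that still needs centroiding."""
    if mz.size < 2:
        return False
    median_diff = np.median(np.diff(np.sort(mz)))
    return bool(median_diff < spacing_threshold)


profile_like = np.arange(100.0, 101.0, 0.002)       # ~0.002 Da spacing, typical of profile scans
centroid_like = np.array([100.05, 100.55, 101.10])  # sparse discrete peaks

print(looks_like_profile(profile_like))   # True  -> would be centroided
print(looks_like_profile(centroid_like))  # False -> passed through unchanged
```
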
masster/sample/plot.py CHANGED
@@ -56,7 +56,6 @@ from bokeh.models import HoverTool
 from holoviews import dim
 from holoviews.plotting.util import process_cmap
 from matplotlib.colors import rgb2hex
-from masster.chromatogram import Chromatogram

 # Parameters removed - using hardcoded defaults

@@ -75,23 +74,36 @@ def _is_notebook_environment():
         # Check for Jupyter/JupyterLab
         from IPython import get_ipython

-        if get_ipython() is not None:
+        ipython = get_ipython()
+        if ipython is not None:
             # Check if we're in a notebook context
-            shell = get_ipython().__class__.__name__
+            shell = ipython.__class__.__name__
             if shell in ["ZMQInteractiveShell", "Shell"]:  # Jupyter notebook/lab
                 return True

-        # Check for Marimo
+        # Check for Marimo - multiple ways to detect it
         import sys

+        # Check if marimo is in modules
         if "marimo" in sys.modules:
             return True
-
-        # Additional check for notebook environments
+
+        # Check for marimo in the call stack or environment
+        import inspect
+        frame = inspect.currentframe()
+        try:
+            while frame:
+                if frame.f_globals.get("__name__", "").startswith("marimo"):
+                    return True
+                frame = frame.f_back
+        finally:
+            del frame
+
+        # Additional check for notebook environments via builtins
         if hasattr(__builtins__, "__IPYTHON__") or hasattr(__builtins__, "_ih"):
             return True

-    except ImportError:
+    except (ImportError, AttributeError):
         pass

     return False
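
Behavior note: `_is_notebook_environment` now combines three signals: the IPython shell class name, the presence of `marimo` in `sys.modules`, and a walk up the call stack looking for frames whose module name starts with "marimo". A condensed, standalone sketch of the same idea (not the exact masster helper):

```python
import inspect
import sys


def in_notebook() -> bool:
    """Best-effort notebook detection: IPython shell name, marimo import, marimo stack frames."""
    try:
        from IPython import get_ipython
        ipython = get_ipython()
        if ipython is not None and ipython.__class__.__name__ in ("ZMQInteractiveShell", "Shell"):
            return True
    except ImportError:
        pass  # IPython not installed; plain interpreter

    if "marimo" in sys.modules:
        return True

    frame = inspect.currentframe()
    try:
        while frame:
            if frame.f_globals.get("__name__", "").startswith("marimo"):
                return True
            frame = frame.f_back
    finally:
        del frame  # avoid reference cycles from holding a frame object
    return False


print(in_notebook())  # False when run as a plain script
```
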
@@ -106,22 +118,17 @@ def _display_plot(plot_object, layout=None):
         layout: Optional panel layout object

     Returns:
-        The layout object if in notebook environment, None otherwise
+        The plot object for inline display in notebooks, None for browser display
     """
     if _is_notebook_environment():
-        # Display inline in notebook
-        try:
-            # For Jupyter notebooks, just return the plot object -
-            # holoviews will handle the display automatically
+        # In notebook environments, return the plot object for inline display
+        # For Jupyter notebooks, holoviews/panel objects display automatically when returned
+        if layout is not None:
+            # Return the layout object which will display inline in notebooks
+            return layout
+        else:
+            # Return the plot object directly for holoviews automatic display
             return plot_object
-        except Exception:
-            # Fallback to panel display for other notebook environments
-            if layout is not None:
-                return layout
-            else:
-                # Create a simple layout if none provided
-                simple_layout = panel.Column(plot_object)
-                return simple_layout
     else:
         # Display in browser (original behavior)
         if layout is not None:
@@ -512,7 +519,7 @@ def plot_2d(
         feats = feats.to_pandas()
         # if ms2_scans is not null, keep only the first element of the list
         feats["ms2_scans"] = feats["ms2_scans"].apply(
-            lambda x: x[0] if type(x) == list else x,
+            lambda x: x[0] if isinstance(x, list) else x,
         )
     if mz_range is not None:
         feats = feats[(feats["mz"] >= mz_range[0]) & (feats["mz"] <= mz_range[1])]
@@ -707,8 +714,6 @@ def plot_2d(
         class MarkerSizeController(param.Parameterized):
             size_slider = param.Number(default=markersize, bounds=(1, 20), step=0.5)

-        controller = MarkerSizeController()
-
         # Create a function that generates just the feature overlays with different sizes
         def create_feature_overlay(size_val):
             feature_overlay = None
@@ -808,7 +813,17 @@ def plot_2d(
         # Create layout
         layout = on.Column(slider_widget, reactive_plot, sizing_mode="stretch_width")

-        return layout
+        # Handle filename saving for slider mode
+        if filename is not None:
+            if filename.endswith(".html"):
+                layout.save(filename, embed=True)
+            else:
+                # For slider plots, save the current state
+                hv.save(create_feature_overlay(markersize), filename, fmt="png")
+            return None
+        else:
+            # For notebook display, return the interactive layout
+            return _display_plot(layout, layout)
     else:
         # Create a panel layout without slider
         layout = panel.Column(overlay)
@@ -819,17 +834,11 @@ def plot_2d(
                 layout.save(filename, embed=True)
             else:
                 # save the panel layout as a png
-                if use_slider_sizing:
-                    # For slider plots, save the current state of the param_plot
-                    hv.save(create_feature_overlay(markersize), filename, fmt="png")
-                else:
-                    hv.save(overlay, filename, fmt="png")
+                hv.save(overlay, filename, fmt="png")
+            return None
         else:
             # Check if we're in a notebook environment and display appropriately
-            if use_slider_sizing:
-                return _display_plot(layout, layout)
-            else:
-                return _display_plot(overlay, layout)
+            return _display_plot(overlay, layout)


 def plot_2d_oracle(
@@ -982,7 +991,7 @@ def plot_2d_oracle(
         oracle_data = pd.read_csv(
             os.path.join(oracle_folder, "diag", "summary_by_feature.csv"),
         )
-    except:
+    except Exception:
        print(f"Could not read {oracle_folder}/diag/summary_by_feature.csv")
        return

@@ -18,6 +18,9 @@ class study_defaults:
         log_level (str): Logging level to be set for the logger. Default is "INFO".
         log_label (Optional[str]): Optional label for the logger. Default is None.
         log_sink (str): Output sink for logging. Default is "sys.stdout".
+        polarity (str): Polarity of the study (positive/negative). Default is "positive".
+        eic_mz_tol (float): Default m/z tolerance for EIC extraction and consensus selection. Default is 0.01.
+        eic_rt_tol (float): Default RT tolerance for EIC extraction and consensus selection. Default is 10.0.
     """

     folder: Optional[str] = None
@@ -27,6 +30,9 @@ class study_defaults:
     log_sink: str = "sys.stdout"
     polarity: str = "positive"

+    eic_mz_tol: float = 0.01
+    eic_rt_tol: float = 10.0
+
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
             "folder": {
@@ -61,6 +67,20 @@ class study_defaults:
                 "default": "positive",
                 "allowed_values": ["positive", "negative", "pos", "neg"],
             },
+            "eic_mz_tol": {
+                "dtype": float,
+                "description": "Default m/z tolerance for EIC extraction and consensus selection (Da)",
+                "default": 0.01,
+                "min_value": 0.001,
+                "max_value": 1.0,
+            },
+            "eic_rt_tol": {
+                "dtype": float,
+                "description": "Default RT tolerance for EIC extraction and consensus selection (seconds)",
+                "default": 10.0,
+                "min_value": 0.2,
+                "max_value": 60.0,
+            },
         },
         repr=False,
     )
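
The two new `study_defaults` entries (the defaults file's path is not shown in this diff excerpt) follow the existing `_param_metadata` pattern: each tunable carries a dtype, description, default, and bounds. How masster enforces those bounds is not shown here; the sketch below is an illustrative validator over a similarly shaped dataclass, not masster's own code:

```python
from dataclasses import dataclass, field
from typing import Any


@dataclass
class EicDefaults:
    """Toy stand-in mirroring the shape of the new study_defaults fields."""
    eic_mz_tol: float = 0.01
    eic_rt_tol: float = 10.0
    _param_metadata: dict[str, dict[str, Any]] = field(
        default_factory=lambda: {
            "eic_mz_tol": {"dtype": float, "min_value": 0.001, "max_value": 1.0},
            "eic_rt_tol": {"dtype": float, "min_value": 0.2, "max_value": 60.0},
        },
        repr=False,
    )

    def validate(self) -> None:
        # Reject values outside the declared bounds.
        for name, meta in self._param_metadata.items():
            value = getattr(self, name)
            if not meta["min_value"] <= value <= meta["max_value"]:
                raise ValueError(f"{name}={value} outside [{meta['min_value']}, {meta['max_value']}]")


defaults = EicDefaults(eic_rt_tol=5.0)
defaults.validate()  # passes; EicDefaults(eic_rt_tol=120.0).validate() would raise ValueError
```
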
masster/study/h5.py CHANGED
@@ -695,19 +695,59 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     if schema_columns is None:
         schema_columns = []

-    # First pass: load all existing columns
+    # Get available columns from HDF5 file
+    hdf5_columns = list(group.keys())
+    logger.debug(f"HDF5 columns available: {hdf5_columns}")
+
+    # Handle column name migrations for backward compatibility first
+    if df_name == "samples_df":
+        # Migrate old column names to new names
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+
+        # Create a mapping of what's actually available after migrations
+        effective_columns = hdf5_columns.copy()
+        for old_name, new_name in column_migrations.items():
+            if old_name in effective_columns:
+                logger.info(f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility")
+                # Add the new name to effective columns and optionally remove old name
+                effective_columns.append(new_name)
+
+    # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
-        if col not in group:
+        source_col = col
+
+        # Check if we need to load from a migrated column name
+        if df_name == "samples_df":
+            column_migrations = {
+                "size": "num_features",
+                "file_source": "sample_source",
+                "ms1": "num_ms1",
+                "ms2": "num_ms2"
+            }
+            # Reverse lookup - find old name for new name
+            reverse_migrations = {v: k for k, v in column_migrations.items()}
+            if col in reverse_migrations:
+                old_name = reverse_migrations[col]
+                if old_name in group:
+                    source_col = old_name
+                    logger.info(f"Loading '{col}' from old column name '{old_name}'")
+
+        if source_col not in group:
             missing_columns.append(col)
             continue

         dtype = schema[df_name]["columns"][col].get("dtype", "native")
         if dtype == "pl.Object" or col in object_columns:
             # Handle object columns specially
-            data[col] = _reconstruct_object_column(group[col][:], col)
+            data[col] = _reconstruct_object_column(group[source_col][:], col)
         else:
             # Regular columns
-            column_data = group[col][:]
+            column_data = group[source_col][:]

             # Convert -123 sentinel values back to None for numeric columns
             if len(column_data) > 0:
@@ -759,17 +799,43 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Second pass: handle missing columns
     for col in missing_columns:
         logger.warning(f"Column '{col}' not found in {df_name}.")
-        # For missing columns, create appropriately sized array of None values
+        # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
             logger.debug(f"Created missing object column '{col}' with length {expected_length}")
         else:
-            data[col] = [None] * expected_length
-            logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            # Provide specific default values for new columns for backward compatibility
+            if df_name == "samples_df":
+                if col == "sample_group":
+                    data[col] = [""] * expected_length  # Empty string default
+                    logger.debug(f"Created missing column '{col}' with empty string defaults")
+                elif col == "sample_batch":
+                    data[col] = [1] * expected_length  # Batch 1 default
+                    logger.debug(f"Created missing column '{col}' with batch 1 defaults")
+                elif col == "sample_sequence":
+                    # Create increasing sequence numbers
+                    data[col] = list(range(1, expected_length + 1))
+                    logger.debug(f"Created missing column '{col}' with sequence 1-{expected_length}")
+                else:
+                    data[col] = [None] * expected_length
+                    logger.debug(f"Created missing regular column '{col}' with length {expected_length}")
+            else:
+                data[col] = [None] * expected_length
+                logger.debug(f"Created missing regular column '{col}' with length {expected_length}")

     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
-    hdf5_columns = list(group.keys())
-    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
+    # But skip the old column names we already migrated
+    migrated_old_names = set()
+    if df_name == "samples_df":
+        column_migrations = {
+            "size": "num_features",
+            "file_source": "sample_source",
+            "ms1": "num_ms1",
+            "ms2": "num_ms2"
+        }
+        migrated_old_names = set(column_migrations.keys())
+
+    extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1320,9 +1386,12 @@ def _load_study5(self, filename=None):
             "sample_type": [],
             "size": [],
             "map_id": [],
-            "file_source": [],
-            "ms1": [],
-            "ms2": [],
+            "sample_source": [],
+            "num_ms1": [],
+            "num_ms2": [],
+            "sample_group": [],
+            "sample_batch": [],
+            "sample_sequence": [],
         },
         schema={
             "sample_uid": pl.Int64,
@@ -1330,10 +1399,13 @@ def _load_study5(self, filename=None):
             "sample_path": pl.Utf8,
             "sample_type": pl.Utf8,
             "size": pl.Int64,
-            "map_id": pl.Utf8,
-            "file_source": pl.Utf8,
-            "ms1": pl.Int64,
-            "ms2": pl.Int64,
+            "map_id": pl.Int64,
+            "sample_source": pl.Utf8,
+            "num_ms1": pl.Int64,
+            "num_ms2": pl.Int64,
+            "sample_group": pl.Utf8,
+            "sample_batch": pl.Int64,
+            "sample_sequence": pl.Int64,
         },
     )
     pbar.update(1)
@@ -1354,9 +1426,12 @@ def _load_study5(self, filename=None):
             "sample_type": [],
             "size": [],
             "map_id": [],
-            "file_source": [],
-            "ms1": [],
-            "ms2": [],
+            "sample_source": [],
+            "num_ms1": [],
+            "num_ms2": [],
+            "sample_group": [],
+            "sample_batch": [],
+            "sample_sequence": [],
         },
         schema={
             "sample_uid": pl.Int64,
@@ -1364,10 +1439,13 @@ def _load_study5(self, filename=None):
             "sample_path": pl.Utf8,
             "sample_type": pl.Utf8,
             "size": pl.Int64,
-            "map_id": pl.Utf8,
-            "file_source": pl.Utf8,
-            "ms1": pl.Int64,
-            "ms2": pl.Int64,
+            "map_id": pl.Int64,
+            "sample_source": pl.Utf8,
+            "num_ms1": pl.Int64,
+            "num_ms2": pl.Int64,
+            "sample_group": pl.Utf8,
+            "sample_batch": pl.Int64,
+            "sample_sequence": pl.Int64,
         },
     )
     pbar.update(1)
@@ -1463,4 +1541,23 @@ def _load_study5(self, filename=None):
         self.consensus_ms2 = None
     pbar.update(1)

+    # Check and migrate old string-based map_id to integer indices
+    if (self.samples_df is not None and
+            not self.samples_df.is_empty() and
+            self.samples_df['map_id'].dtype == pl.Utf8):
+        self.logger.info("Detected old string-based map_id format, migrating to integer indices")
+
+        # Convert string-based map_id to integer indices
+        sample_count = len(self.samples_df)
+        new_map_ids = list(range(sample_count))
+
+        self.samples_df = self.samples_df.with_columns(
+            pl.lit(new_map_ids).alias("map_id")
+        )
+
+        # Ensure the column is Int64 type
+        self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
+
+        self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
+
     self.logger.debug("Study loaded")
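
Taken together, the loader-side migrations amount to two operations on legacy `samples_df` data: renaming old columns (`size` → `num_features`, `file_source` → `sample_source`, `ms1` → `num_ms1`, `ms2` → `num_ms2`) and replacing string `map_id` values with 0-based integer indices. A standalone polars sketch of both steps on toy data (not the h5 loader itself):

```python
import polars as pl

# Old-to-new column names, as listed in the diff above.
column_migrations = {
    "size": "num_features",
    "file_source": "sample_source",
    "ms1": "num_ms1",
    "ms2": "num_ms2",
}

old = pl.DataFrame({
    "sample_uid": [1, 2],
    "file_source": ["a.raw", "b.raw"],
    "ms1": [1000, 1200],
    "ms2": [300, 280],
    "map_id": ["map_a", "map_b"],  # legacy string identifiers
})

# Rename whichever legacy columns are present.
migrated = old.rename({k: v for k, v in column_migrations.items() if k in old.columns})

# Replace string map_id values with 0-based integer indices, as the loader now does.
if migrated["map_id"].dtype == pl.Utf8:
    migrated = migrated.with_columns(pl.Series("map_id", range(len(migrated)), dtype=pl.Int64))

print(migrated)
```
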