PyPI - masster - Versions diffs - 0.5.3__tar.gz → 0.5.4__tar.gz - Mend

masster 0.5.3.tar.gz → 0.5.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (94)

{masster-0.5.3 → masster-0.5.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.3
+Version: 0.5.4
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.5.3 → masster-0.5.4}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.5.3"
+version = "0.5.4"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.5.3 → masster-0.5.4}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.5.4"
+__version__ = "0.5.5"
 def get_version():

{masster-0.5.3 → masster-0.5.4}/src/masster/sample/adducts.py RENAMED Viewed

@@ -473,7 +473,7 @@ def find_adducts(self, **kwargs):
     self.logger.debug(f"Min probability threshold: {min_probability}")
     # Generate comprehensive adduct specifications using the Sample method
-    adducts_df = self._get_adducts(
+    adducts_df = _get_adducts(self,
         adducts_list=adducts_list,
         charge_min=charge_min,
         charge_max=charge_max,

{masster-0.5.3 → masster-0.5.4}/src/masster/sample/h5.py RENAMED Viewed

@@ -62,8 +62,8 @@ def _save_sample5(
             return
     # synchronize feature_map if it exists
-    if hasattr(self, "_feature_map") and self._feature_map is not None:
-        self._features_sync()
+    #if hasattr(self, "_feature_map") and self._feature_map is not None:
+    #    self._features_sync()
     # if no extension is given, add .sample5
     if not filename.endswith(".sample5"):
@@ -1057,15 +1057,15 @@ def _load_sample5(self, filename: str, map: bool = False):
         # Parameters are now loaded from metadata JSON (see above)
         # Lib and lib_match are no longer saved/loaded
-    if map:
-        featureXML = filename.replace(".sample5", ".featureXML")
-        if os.path.exists(featureXML):
-            self._load_featureXML(featureXML)
-            self._features_sync()
-        else:
-            self.logger.warning(
-                f"Feature XML file {featureXML} not found, skipping loading.",
-            )
+    #if map:
+    #    featureXML = filename.replace(".sample5", ".featureXML")
+    #    if os.path.exists(featureXML):
+    #        self._load_featureXML(featureXML)
+    #        #self._features_sync()
+    #    else:
+    #        self.logger.warning(
+    #            f"Feature XML file {featureXML} not found, skipping loading.",
+    #        )
     # set self.file_path to *.sample5
     self.file_path = filename

{masster-0.5.3 → masster-0.5.4}/src/masster/sample/helpers.py RENAMED Viewed

@@ -569,7 +569,7 @@ def select(
         self.logger.info(f"Selected features. Features remaining: {len(feats)}")
     return feats
+'''
 def _features_sync(self):
     """
     Synchronizes the cached FeatureMap with features_df.
@@ -675,7 +675,7 @@ def _features_sync(self):
         self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
     except Exception as e:
         self.logger.error(f"Error during feature synchronization: {e}")
+'''
 def features_delete(self, features: list | None = None):
     """

{masster-0.5.3 → masster-0.5.4}/src/masster/sample/load.py RENAMED Viewed

@@ -46,6 +46,7 @@ import polars as pl
 from tqdm import tqdm
 from masster.chromatogram import Chromatogram
+from .h5 import _load_sample5
 from masster.spectrum import Spectrum
 # Suppress pyOpenMS warnings globally
@@ -96,13 +97,13 @@ def load(
     # check if file is mzML
     if filename.lower().endswith(".mzml"):
-        self._load_mzML(filename)
+        _load_mzML(self, filename)
     elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
-        self._load_wiff(filename)
+        _load_wiff(self, filename)
     elif filename.lower().endswith(".raw"):
-        self._load_raw(filename)
+        _load_raw(self, filename)
     elif filename.lower().endswith(".sample5"):
-        self._load_sample5(filename)
+        _load_sample5(self, filename)
     # elif filename.lower().endswith(".h5"):
     #    self._load_h5(filename)
     else:

{masster-0.5.3 → masster-0.5.4}/src/masster/sample/processing.py RENAMED Viewed

@@ -795,7 +795,7 @@ def find_features(self, **kwargs):
     )
     self.features_df = df
-    self._features_sync()
+    #self._features_sync()
     self.logger.info(f"Feature detection completed. Total features: {len(df)}")
     # store params

{masster-0.5.3 → masster-0.5.4}/src/masster/sample/sample.py RENAMED Viewed

@@ -48,9 +48,9 @@ from masster.sample.defaults.find_ms2_def import find_ms2_defaults
 from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 # Sample-specific imports - keeping these private, only for internal use
-# from masster.sample.h5 import _load_sample5
+from masster.sample.h5 import _load_sample5
 # from masster.sample.h5 import _load_sample5_study
-# from masster.sample.h5 import _save_sample5
+from masster.sample.h5 import _save_sample5
 # from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
 from masster.sample.helpers import _get_scan_uids
@@ -263,12 +263,16 @@ class Sample:
     _get_feature_map = _get_feature_map
     # Additional method assignments for all imported functions
-    # Removed internal-only methods: _load_sample5, _load_sample5_study, _save_sample5, _delete_ms2, _features_sync
+    # Removed internal-only methods: _load_sample5_study, _delete_ms2, _features_sync
     _estimate_memory_usage = _estimate_memory_usage
     _get_scan_uids = _get_scan_uids
     _get_feature_uids = _get_feature_uids
     features_delete = features_delete
     features_filter = features_filter
+    _save_sample5 = _save_sample5
+    _load_sample5 = _load_sample5
     # Removed internal-only load methods: _load_featureXML, _load_ms2data, _load_mzML, _load_raw, _load_wiff
     chrom_extract = chrom_extract
     _index_file = _index_file  # Renamed from index_file to be internal-only

{masster-0.5.3 → masster-0.5.4}/src/masster/study/h5.py RENAMED Viewed

@@ -304,6 +304,30 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(item.to_json())
                 else:
                     serialized_chunk.append("None")
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
         else:
             logger.warning(
                 f"Unknown object column '{col_name}', using default serialization",
@@ -564,6 +588,34 @@ def _save_dataframe_column_legacy(
                 else:
                     data_as_str.append("None")
             group.create_dataset(col, data=data_as_str, compression=compression)
+        elif col == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            data_as_json_strings = []
+            for item in data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        data_as_json_strings.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    data_as_json_strings.append("None")
+            group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
+        elif col == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            data_as_json_strings = []
+            for item in data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        data_as_json_strings.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    data_as_json_strings.append("None")
+            group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
         else:
             logger.warning(
                 f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
@@ -666,6 +718,24 @@ def _reconstruct_object_column(data_col, col_name: str):
                             },
                         )
                 reconstructed_data.append(converted_adducts)
+            elif col_name == "iso":
+                # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+                try:
+                    import numpy as np
+                    iso_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(iso_data) if iso_data else None)
+                except (json.JSONDecodeError, ValueError, ImportError):
+                    reconstructed_data.append(None)
+            elif col_name == "ms1_spec":
+                # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+                try:
+                    import numpy as np
+                    ms1_spec_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
+                except (json.JSONDecodeError, ValueError, ImportError):
+                    reconstructed_data.append(None)
             else:
                 # Unknown object column
                 reconstructed_data.append(None)

{masster-0.5.3 → masster-0.5.4}/src/masster/study/plot.py RENAMED Viewed

@@ -603,7 +603,7 @@ def plot_consensus_2d(
                 pl.when(
                     (pl.col(sizeby).is_not_null()) & (pl.col(sizeby).is_finite()) & (pl.col(sizeby) > 0),
                 )
-                .then((pl.col(sizeby).log10() * markersize / 12).pow(2))
+                .then((pl.col(sizeby).log10() * markersize / 12).pow(1.5))
                 .otherwise(markersize)
                 .alias("markersize"),
             ])
@@ -1421,46 +1421,60 @@ def plot_rt_correction(
     p.xaxis.axis_label = f"Retention Time ({rt_unit})"
     p.yaxis.axis_label = "RT - RT_original (s)"
-    samples_info = None
+    # Create sample name lookup dictionary from samples_df (all in Polars)
+    sample_names_dict = {}
     if hasattr(self, "samples_df") and self.samples_df is not None:
         try:
-            samples_info = self.samples_df.to_pandas()
+            sample_name_mapping = (
+                self.samples_df
+                .filter(pl.col("sample_uid").is_in(sample_uids))
+                .select(["sample_uid", "sample_name"])
+            )
+            sample_names_dict = dict(zip(
+                sample_name_mapping["sample_uid"].to_list(),
+                sample_name_mapping["sample_name"].to_list()
+            ))
         except Exception:
-            samples_info = None
+            pass
     renderers = []
-    # Iterate samples and build curves
-    for uid in sample_uids:
-        # Select features belonging to this sample
-        try:
-            if "sample_uid" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_uid") == uid)
-            elif "sample_name" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_name") == uid)
-            else:
-                self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
-                continue
-        except Exception as e:
-            self.logger.debug(f"Error filtering features for sample {uid}: {e}")
-            continue
+    # Check sample identifier column
+    if "sample_uid" not in self.features_df.columns:
+        if "sample_name" in self.features_df.columns:
+            sample_id_col = "sample_name"
+        else:
+            self.logger.debug("No sample identifier column in features_df")
+            return
+    else:
+        sample_id_col = "sample_uid"
-        if sample_feats.is_empty():
-            continue
+    # OPTIMIZED: Filter once, group once instead of per-sample filtering
+    try:
+        # Filter all data once for selected samples and required conditions
+        all_sample_feats = self.features_df.filter(
+            pl.col(sample_id_col).is_in(sample_uids)
+        )
+        if all_sample_feats.is_empty():
+            self.logger.warning("No features found for the selected samples.")
+            return
-        # Filter to only use features with filled==False
-        if "filled" in sample_feats.columns:
-            sample_feats = sample_feats.filter(~pl.col("filled"))
-            if sample_feats.is_empty():
-                continue
+        # Filter to only use features with filled==False if column exists
+        if "filled" in all_sample_feats.columns:
+            all_sample_feats = all_sample_feats.filter(~pl.col("filled"))
+            if all_sample_feats.is_empty():
+                self.logger.warning("No non-filled features found for the selected samples.")
+                return
-        # Stay in Polars - much faster than pandas conversion!
-        if "rt" not in sample_feats.columns or "rt_original" not in sample_feats.columns:
-            continue
+        # Check required columns
+        if "rt" not in all_sample_feats.columns or "rt_original" not in all_sample_feats.columns:
+            self.logger.error("Required columns 'rt' or 'rt_original' not found in features_df.")
+            return
-        # Filter nulls and add delta column in Polars
-        sample_feats = (
-            sample_feats
+        # Filter nulls, add delta column, and sort - all in one operation
+        all_sample_feats = (
+            all_sample_feats
             .filter(
                 pl.col("rt").is_not_null() &
                 pl.col("rt_original").is_not_null()
@@ -1468,33 +1482,36 @@ def plot_rt_correction(
             .with_columns([
                 (pl.col("rt") - pl.col("rt_original")).alias("delta")
             ])
-            .sort("rt")
+            .sort([sample_id_col, "rt"])
         )
-        if sample_feats.is_empty():
-            continue
+        if all_sample_feats.is_empty():
+            self.logger.warning("No valid RT data found for the selected samples.")
+            return
-        # Extract arrays directly from Polars
-        rt = sample_feats["rt"].to_numpy()
-        delta = sample_feats["delta"].to_numpy()
+        # Group by sample and process each group (much faster than individual filtering)
+        for (sample_uid,), sample_group in all_sample_feats.group_by(sample_id_col):
+            if sample_group.is_empty():
+                continue
-        sample_name = str(uid)
-        if samples_info is not None:
-            try:
-                row = samples_info[samples_info["sample_uid"] == uid]
-                if not row.empty:
-                    sample_name = row.iloc[0].get("sample_name", sample_name)
-            except Exception:
-                pass
+            # Extract arrays directly from Polars
+            rt = sample_group["rt"].to_numpy()
+            delta = sample_group["delta"].to_numpy()
-        color = color_map.get(uid, "#000000")
+            # Get sample name efficiently from pre-built dictionary
+            sample_name = sample_names_dict.get(sample_uid, str(sample_uid))
+            color = color_map.get(sample_uid, "#000000")
-        data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
-        src = ColumnDataSource(data)
+            data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
+            src = ColumnDataSource(data)
-        r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
-        p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
-        renderers.append(r_line)
+            r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
+            p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
+            renderers.append(r_line)
+    except Exception as e:
+        self.logger.error(f"Error in optimized RT correction plotting: {e}")
+        return
     if not renderers:
         self.logger.warning("No RT correction curves to plot for the selected samples.")

{masster-0.5.3 → masster-0.5.4}/uv.lock RENAMED Viewed

@@ -1393,7 +1393,7 @@ wheels = [
 [[package]]
 name = "masster"
-version = "0.5.3"
+version = "0.5.4"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },