PyPI - masster - Versions diffs - 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

masster 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (22) hide show

masster/_version.py +1 -1
masster/sample/adducts.py +1 -1
masster/sample/load.py +10 -9
masster/sample/plot.py +1 -1
masster/sample/processing.py +4 -4
masster/sample/sample.py +29 -32
masster/study/analysis.py +1762 -0
masster/study/export.py +5 -3
masster/study/helpers.py +153 -80
masster/study/id.py +3 -3
masster/study/load.py +17 -52
masster/study/merge.py +316 -313
masster/study/parameters.py +3 -3
masster/study/plot.py +398 -43
masster/study/processing.py +4 -4
masster/study/save.py +8 -4
masster/study/study.py +97 -139
{masster-0.4.22.dist-info → masster-0.5.0.dist-info}/METADATA +54 -14
{masster-0.4.22.dist-info → masster-0.5.0.dist-info}/RECORD +22 -21
{masster-0.4.22.dist-info → masster-0.5.0.dist-info}/WHEEL +0 -0
{masster-0.4.22.dist-info → masster-0.5.0.dist-info}/entry_points.txt +0 -0
{masster-0.4.22.dist-info → masster-0.5.0.dist-info}/licenses/LICENSE +0 -0

masster/study/export.py CHANGED Viewed

@@ -60,7 +60,7 @@ def _get_mgf_df(self, **kwargs):
     # end of parameter initialization
     # Store parameters in the Study object
-    self.store_history(["get_mgf"], params.to_dict())
+    self.update_history(["get_mgf"], params.to_dict())
     self.logger.debug("Parameters stored to get_mgf")
     # Get parameter values for use in the method
@@ -442,7 +442,8 @@ def export_mgf(self, **kwargs):
         None: Writes MGF file to disk.
     """
     # Get mgf data as DataFrame
-    mgf_data = self._get_mgf_df(**kwargs)
+    from masster.study.export import _get_mgf_df
+    mgf_data = _get_mgf_df(self, **kwargs)
     if mgf_data is None or len(mgf_data) == 0:
         self.logger.warning("No MGF data generated.")
@@ -559,7 +560,8 @@ def export_mztab(self, filename: str | None = None, include_mgf=True, **kwargs)
     mgf_data = None
     mgf_mapping: dict[str, list[int]] = {}
     if include_mgf:
-        mgf_data = self._get_mgf_df(**kwargs)
+        from masster.study.export import _get_mgf_df
+        mgf_data = _get_mgf_df(self, **kwargs)
         # Create mapping from feature_uid to MGF indexes
         if mgf_data is not None and len(mgf_data) > 0:
             for row in mgf_data.iter_rows(named=True):

masster/study/helpers.py CHANGED Viewed

@@ -49,7 +49,7 @@ def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
         s = owner
     else:
         # owner is expected to be a Study
-        s = get_sample(owner, sample)
+        s = get_samples(owner, sample)
     if s is None:
         raise ValueError("Could not resolve sample for BPC computation")
@@ -189,7 +189,7 @@ def get_tic(owner, sample=None, label=None):
     if hasattr(owner, "ms1_df"):
         s = owner
     else:
-        s = get_sample(owner, sample)
+        s = get_samples(owner, sample)
     if s is None:
         raise ValueError("Could not resolve sample for TIC computation")
@@ -278,7 +278,7 @@ def get_eic(owner, sample=None, mz=None, mz_tol=None, rt_unit="s", label=None):
         s = owner
     else:
         # owner is expected to be a Study
-        s = get_sample(owner, sample)
+        s = get_samples(owner, sample)
     if s is None:
         raise ValueError("Could not resolve sample for EIC computation")
@@ -360,7 +360,7 @@ def get_chrom(self, uids=None, samples=None):
         return None
     ids = self._get_consensus_uids(uids)
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if self.consensus_map is None:
         self.logger.error("No consensus map found.")
@@ -467,7 +467,7 @@ def get_chrom(self, uids=None, samples=None):
 # =====================================================================================
-def set_folder(self, folder):
+def set_study_folder(self, folder):
     """
     Set the folder for saving and loading files.
     """
@@ -492,7 +492,8 @@ def align_reset(self):
     )
     # Ensure column order is maintained after with_columns operation
-    self._ensure_features_df_schema_order()
+    from masster.study.helpers import _ensure_features_df_schema_order
+    _ensure_features_df_schema_order(self)
 # =====================================================================================
@@ -722,7 +723,7 @@ def fill_reset(self):
     )
-def _get_feature_uids(self, uids=None, seed=42):
+def _get_features_uids(self, uids=None, seed=42):
     """
     Helper function to get feature_uids from features_df based on input uids.
     If uids is None, returns all feature_uids.
@@ -806,7 +807,7 @@ def _get_consensus_uids(self, uids=None, seed=42):
         return consensus_uids
-def _get_sample_uids(self, samples=None, seed=42):
+def _get_samples_uids(self, samples=None, seed=42):
     """
     Helper function to get sample_uids from samples_df based on input samples.
     If samples is None, returns all sample_uids.
@@ -847,7 +848,7 @@ def _get_sample_uids(self, samples=None, seed=42):
         return sample_uids
-def get_sample(self, sample):
+def get_samples(self, sample):
     """
     Return a `Sample` object corresponding to the provided sample identifier.
@@ -911,6 +912,138 @@ def get_orphans(self):
     return not_in_consensus
+def get_sample_stats(self):
+    """
+    Get statistics for all samples in the study.
+    Returns:
+        pl.DataFrame: DataFrame with the following columns:
+            - sample_uid: Sample unique identifier
+            - num_features: Total number of features per sample
+            - num_ms1: Number of MS1 features per sample
+            - num_ms2: Number of MS2 features per sample
+            - num_linked_ms1: Number of non-filled features present in consensus_mapping_df
+            - num_orphans: Number of non-filled features not present in consensus_mapping_df
+            - max_rt_correction: Maximum RT correction applied
+            - average_rt_correction: Average RT correction applied
+            - num_linked_ms2: Number of linked MS2 spectra from consensus_ms2_df
+    """
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.warning("No samples found in study.")
+        return pl.DataFrame()
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.warning("No features found in study.")
+        return pl.DataFrame()
+    # Get base sample information
+    sample_uids = self.samples_df["sample_uid"].to_list()
+    stats_data = []
+    for sample_uid in sample_uids:
+        # Filter features for this sample
+        sample_features = self.features_df.filter(pl.col("sample_uid") == sample_uid)
+        if sample_features.is_empty():
+            # Sample has no features
+            stats_data.append({
+                "sample_uid": sample_uid,
+                "num_features": 0,
+                "num_ms1": 0,
+                "num_ms2": 0,
+                "num_linked_ms1": 0,
+                "num_orphans": 0,
+                "max_rt_correction": None,
+                "average_rt_correction": None,
+                "num_linked_ms2": 0
+            })
+            continue
+        # Basic feature counts
+        num_features = len(sample_features)
+        # Count MS1 and MS2 features
+        # Assume features with ms_level=1 or missing ms_level are MS1
+        num_ms1 = sample_features.filter(
+            pl.col("ms_level").is_null() | (pl.col("ms_level") == 1)
+        ).height if "ms_level" in sample_features.columns else num_features
+        num_ms2 = sample_features.filter(
+            pl.col("ms_level") == 2
+        ).height if "ms_level" in sample_features.columns else 0
+        # Get non-filled features for this sample
+        if "filled" in sample_features.columns:
+            non_filled_features = sample_features.filter(~pl.col("filled") | pl.col("filled").is_null())
+        else:
+            non_filled_features = sample_features
+        # Count linked MS1 features (non-filled and present in consensus_mapping_df)
+        num_linked_ms1 = 0
+        if not self.consensus_mapping_df.is_empty() and not non_filled_features.is_empty():
+            linked_feature_uids = self.consensus_mapping_df.filter(
+                pl.col("sample_uid") == sample_uid
+            )["feature_uid"].to_list()
+            num_linked_ms1 = non_filled_features.filter(
+                pl.col("feature_uid").is_in(linked_feature_uids)
+            ).height
+        # Count orphan features (non-filled and NOT present in consensus_mapping_df)
+        num_orphans = len(non_filled_features) - num_linked_ms1
+        # Calculate RT correction statistics
+        max_rt_correction = None
+        average_rt_correction = None
+        if "rt" in sample_features.columns and "rt_original" in sample_features.columns:
+            rt_corrections = sample_features.with_columns(
+                (pl.col("rt") - pl.col("rt_original")).alias("rt_correction")
+            ).filter(
+                pl.col("rt_correction").is_not_null()
+            )["rt_correction"]
+            if not rt_corrections.is_empty():
+                max_rt_correction = rt_corrections.abs().max()
+                average_rt_correction = rt_corrections.abs().mean()
+        # Count linked MS2 spectra from consensus_ms2_df
+        num_linked_ms2 = 0
+        if hasattr(self, 'consensus_ms2') and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
+            if "sample_uid" in self.consensus_ms2.columns:
+                num_linked_ms2 = self.consensus_ms2.filter(
+                    pl.col("sample_uid") == sample_uid
+                ).height
+        stats_data.append({
+            "sample_uid": sample_uid,
+            "num_features": num_features,
+            "num_ms1": num_ms1,
+            "num_ms2": num_ms2,
+            "num_linked_ms1": num_linked_ms1,
+            "num_orphans": num_orphans,
+            "max_rt_correction": max_rt_correction,
+            "average_rt_correction": average_rt_correction,
+            "num_linked_ms2": num_linked_ms2
+        })
+    # Create DataFrame with proper schema
+    return pl.DataFrame(
+        stats_data,
+        schema={
+            "sample_uid": pl.UInt64,
+            "num_features": pl.UInt32,
+            "num_ms1": pl.UInt32,
+            "num_ms2": pl.UInt32,
+            "num_linked_ms1": pl.UInt32,
+            "num_orphans": pl.UInt32,
+            "max_rt_correction": pl.Float64,
+            "average_rt_correction": pl.Float64,
+            "num_linked_ms2": pl.UInt32
+        }
+    )
 # =====================================================================================
 # DATA COMPRESSION AND RESTORATION FUNCTIONS
 # =====================================================================================
@@ -995,7 +1128,7 @@ def restore_features(self, samples=None, maps=False):
         return
     # Get sample_uids to process
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
@@ -1154,7 +1287,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         return
     # Get sample_uids to process
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return
@@ -1610,7 +1743,7 @@ def sample_name_reset(self):
     )
-def set_source(self, filename):
+def set_samples_source(self, filename):
     """
     Reassign file_source for all samples in samples_df. If filename contains only a path,
     keep the current basename and build an absolute path. Check that the new file exists
@@ -3301,7 +3434,7 @@ def samples_delete(self, samples):
 # =====================================================================================
-def sample_color(self, by=None, palette="Turbo256"):
+def set_samples_color(self, by=None, palette="Turbo256"):
     """
     Set sample colors in the sample_color column of samples_df.
@@ -3344,13 +3477,13 @@ def sample_color(self, by=None, palette="Turbo256"):
     Example:
         # Set colors based on sample type
-        study.sample_color(by='sample_type', palette='Set1')
+        study.set_samples_color(by='sample_type', palette='Set1')
         # Set colors using a custom color list
-        study.sample_color(by=['#FF0000', '#00FF00', '#0000FF'])
+        study.set_samples_color(by=['#FF0000', '#00FF00', '#0000FF'])
         # Reset to default Turbo256 sequential colors
-        study.sample_color()
+        study.set_samples_color()
     """
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
@@ -3473,67 +3606,7 @@ def sample_color(self, by=None, palette="Turbo256"):
         self.logger.debug(f"Set sample colors based on {by} using {palette} palette")
-def sample_color_reset(self):
-    """
-    Reset sample colors to default coloring using the 'turbo' colormap.
-    This function assigns colors by distributing samples evenly across the full
-    turbo colormap range, ensuring maximum color diversity and visual distinction
-    between samples.
-    Returns:
-        None (modifies self.samples_df in place)
-    """
-    if self.samples_df is None or len(self.samples_df) == 0:
-        self.logger.warning("No samples found in study.")
-        return
-    try:
-        from cmap import Colormap
-        # Use turbo colormap
-        cm = Colormap("turbo")
-        # Get sample count and assign colors evenly distributed across colormap
-        n_samples = len(self.samples_df)
-        colors = []
-        # Distribute samples evenly across the full colormap range
-        for i in range(n_samples):
-            # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
-            normalized_value = (
-                i + 0.5
-            ) / n_samples  # +0.5 to center samples in their bins
-            # Optionally, map to a subset of colormap to avoid extreme colors
-            # Use 10% to 90% of colormap range for better color diversity
-            normalized_value = 0.1 + (normalized_value * 0.8)
-            color_rgba = cm(normalized_value)
-            # Convert RGBA to hex
-            if len(color_rgba) >= 3:
-                r, g, b = color_rgba[:3]
-                # Convert to 0-255 range if needed
-                if max(color_rgba[:3]) <= 1.0:
-                    r, g, b = int(r * 255), int(g * 255), int(b * 255)
-                hex_color = f"#{r:02x}{g:02x}{b:02x}"
-                colors.append(hex_color)
-        # Update the sample_color column
-        self.samples_df = self.samples_df.with_columns(
-            pl.Series("sample_color", colors).alias("sample_color"),
-        )
-        self.logger.debug(
-            f"Reset sample colors using turbo colormap with even distribution ({n_samples} samples)",
-        )
-    except ImportError:
-        self.logger.error(
-            "cmap library is required for sample color reset. Install with: uv add cmap",
-        )
-    except Exception as e:
-        self.logger.error(f"Failed to reset sample colors: {e}")
 def _get_color_palette(palette_name):
@@ -3634,7 +3707,7 @@ def _get_color_palette(palette_name):
 def _sample_colors_from_colormap(palette_name, n_colors):
     """
-    Sample colors evenly from the whole colormap range, similar to sample_color_reset.
+    Sample colors evenly from the whole colormap range, similar to set_samples_color(by=None).
     Parameters:
         palette_name (str): Name of the palette/colormap
@@ -3686,7 +3759,7 @@ def _sample_colors_from_colormap(palette_name, n_colors):
         colors = []
-        # Distribute samples evenly across the full colormap range (same approach as sample_color_reset)
+        # Distribute samples evenly across the full colormap range (same approach as set_samples_color(by=None))
         for i in range(n_colors):
             # Evenly distribute samples across colormap (avoiding endpoints to prevent white/black)
             normalized_value = (
@@ -3818,7 +3891,7 @@ def restore_ms2(self, samples=None, **kwargs):
         return
     # Get sample_uids to process
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return
@@ -3888,7 +3961,7 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
         return
     # Get sample_uids to process
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.warning("No valid samples specified.")
         return

masster/study/id.py CHANGED Viewed

@@ -124,8 +124,8 @@ def lib_load(
                 study.lib_df = pl.DataFrame()
     # Store this operation in history
-    if hasattr(study, "store_history"):
-        study.store_history(
+    if hasattr(study, "update_history"):
+        study.update_history(
             ["lib_load"],
             {"lib_source": str(lib_source), "polarity": polarity, "adducts": adducts},
         )
@@ -385,7 +385,7 @@ def _store_identification_history(study, effective_mz_tol, effective_rt_tol, tar
             history_params["params"] = params.to_dict()
         if kwargs:
             history_params["kwargs"] = kwargs
-        study.store_history(["identify"], history_params)
+        study.update_history(["identify"], history_params)
 def _validate_identify_inputs(study, logger=None):

masster/study/load.py CHANGED Viewed

@@ -213,18 +213,19 @@ def load(self, filename=None):
             return
     # self.logger.info(f"Loading study from {filename}")
-    self._load_study5(filename)
+    from masster.study.h5 import _load_study5
+    _load_study5(self, filename)
     # After loading the study, check if we have consensus features before loading consensus XML
-    if (self.consensus_df is not None and not self.consensus_df.is_empty()):
-        consensus_xml_path = filename.replace(".study5", ".consensusXML")
-        if os.path.exists(consensus_xml_path):
-            self._load_consensusXML(filename=consensus_xml_path)
+    #if (self.consensus_df is not None and not self.consensus_df.is_empty()):
+    #    consensus_xml_path = filename.replace(".study5", ".consensusXML")
+    #    if os.path.exists(consensus_xml_path):
+    #        self._load_consensusXML(filename=consensus_xml_path)
             # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
-        else:
-            self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    else:
-        self.logger.debug("No consensus features found, skipping consensusXML loading")
+    #    else:
+    #        self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
+    #else:
+    #    self.logger.debug("No consensus features found, skipping consensusXML loading")
     self.filename = filename
@@ -559,7 +560,7 @@ def fill_single(self, **kwargs):
     # end of parameter initialization
     # Store parameters in the Study object
-    self.store_history(["fill_single"], params.to_dict())
+    self.update_history(["fill_single"], params.to_dict())
     self.logger.debug("Parameters stored to fill_single")
     # Call the original fill_chrom_single function with extracted parameters
@@ -979,7 +980,7 @@ def fill(self, **kwargs):
     # end of parameter initialization
     # Store parameters in the Study object
-    self.store_history(["fill"], params.to_dict())
+    self.update_history(["fill"], params.to_dict())
     self.logger.debug("Parameters stored to fill")
     # Call the original fill_chrom function with extracted parameters
@@ -1115,7 +1116,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
         return missing_combinations
-def sanitize(self):
+def _sanitize(self):
     """
     Sanitize features DataFrame to ensure all complex objects are properly typed.
     Convert serialized objects back to their proper types (Chromatogram, Spectrum).
@@ -1209,7 +1210,7 @@ def sanitize(self):
         self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
-def load_features(self):
+def _load_features(self):
     """
     Load features by reconstructing FeatureMaps from the processed features_df data.
@@ -1630,7 +1631,7 @@ def _add_sample_optimized(
     # - No _ensure_features_df_schema_order()
     # - No complex column alignment
     # - No type casting loops
-    # - No sample_color_reset()
+    # - No set_samples_color(by=None) call needed
     self.logger.debug(
         f"Added sample {sample_name} with {ddaobj._oms_features_map.size()} features (optimized)",
@@ -1914,42 +1915,6 @@ def _add_sample_standard(
 def _sample_color_reset_optimized(self):
     """
-    Optimized version of sample_color_reset that caches colormap initialization.
+    Optimized version of sample color reset using set_samples_color.
     """
-    if self.samples_df is None or len(self.samples_df) == 0:
-        self.logger.warning("No samples found in study.")
-        return
-    # Cache the colormap if not already cached
-    if not hasattr(self, "_cached_colormap"):
-        try:
-            from cmap import Colormap
-            self._cached_colormap = Colormap("turbo")
-        except ImportError:
-            self.logger.warning("cmap package not available, using default colors")
-            return
-    cm = self._cached_colormap
-    n_samples = len(self.samples_df)
-    # Pre-allocate colors list for better performance
-    colors = [None] * n_samples
-    # Vectorized color generation
-    for i in range(n_samples):
-        normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
-        color_rgba = cm(normalized_value)
-        if len(color_rgba) >= 3:
-            r, g, b = color_rgba[:3]
-            if max(color_rgba[:3]) <= 1.0:
-                r, g, b = int(r * 255), int(g * 255), int(b * 255)
-            colors[i] = f"#{r:02x}{g:02x}{b:02x}"
-    # Update the sample_color column efficiently
-    self.samples_df = self.samples_df.with_columns(
-        pl.Series("sample_color", colors).alias("sample_color"),
-    )
-    self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
+    return self.set_samples_color(by=None)

masster 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl

Potentially problematic release.

masster 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl