masster 0.4.14__tar.gz → 0.4.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- {masster-0.4.14 → masster-0.4.16}/PKG-INFO +1 -1
- {masster-0.4.14 → masster-0.4.16}/pyproject.toml +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/__init__.py +2 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/_version.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/__init__.py +1 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/find_consensus_def.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/merge_def.py +69 -25
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py +65 -106
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/id.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/load.py +11 -6
- masster-0.4.16/src/masster/study/merge.py +1607 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/processing.py +0 -902
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/save.py +1 -1
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/study.py +28 -31
- masster-0.4.16/src/masster/wizard/README.md +373 -0
- masster-0.4.16/src/masster/wizard/__init__.py +11 -0
- masster-0.4.16/src/masster/wizard/example.py +223 -0
- masster-0.4.16/src/masster/wizard/test_structure.py +49 -0
- masster-0.4.16/src/masster/wizard/test_wizard.py +285 -0
- masster-0.4.16/src/masster/wizard/wizard.py +1175 -0
- masster-0.4.16/src/masster/wizard.py +1175 -0
- {masster-0.4.14 → masster-0.4.16}/uv.lock +1 -1
- {masster-0.4.14 → masster-0.4.16}/.github/workflows/publish.yml +0 -0
- {masster-0.4.14 → masster-0.4.16}/.github/workflows/security.yml +0 -0
- {masster-0.4.14 → masster-0.4.16}/.github/workflows/test.yml +0 -0
- {masster-0.4.14 → masster-0.4.16}/.gitignore +0 -0
- {masster-0.4.14 → masster-0.4.16}/.pre-commit-config.yaml +0 -0
- {masster-0.4.14 → masster-0.4.16}/LICENSE +0 -0
- {masster-0.4.14 → masster-0.4.16}/Makefile +0 -0
- {masster-0.4.14 → masster-0.4.16}/README.md +0 -0
- {masster-0.4.14 → masster-0.4.16}/TESTING.md +0 -0
- {masster-0.4.14 → masster-0.4.16}/demo/example_batch_process.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/demo/example_sample_process.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/chromatogram.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/libs/ccm.csv +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/libs/urine.csv +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/lib/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/lib/lib.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/logger.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/adducts.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/h5.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/helpers.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/lib.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/load.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/parameters.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/plot.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/processing.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/quant.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/sample.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/save.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/sample/sciex.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/spectrum.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/align_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/fill_chrom_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/fill_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/identify_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/study_def.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/export.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/helpers.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/parameters.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/plot.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/src/masster/study/study5_schema.json +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/conftest.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_chromatogram.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_defaults.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_imports.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_integration.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_logger.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_parameters.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_sample.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_spectrum.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_study.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tests/test_version.py +0 -0
- {masster-0.4.14 → masster-0.4.16}/tox.ini +0 -0
{masster-0.4.14 → masster-0.4.16}/src/masster/__init__.py

@@ -16,6 +16,7 @@ from masster.lib import Lib
 from masster.sample.sample import Sample
 from masster.spectrum import Spectrum
 from masster.study.study import Study
+from masster.wizard import Wizard, wizard_def
 
 
 __all__ = [
@@ -24,6 +25,7 @@ __all__ = [
     "Sample",
     "Spectrum",
     "Study",
+    "Wizard",
     "__version__",
     # "get_version",
 ]
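The two hunks above re-export the new wizard module at the package top level. A minimal sketch of what that enables, assuming masster 0.4.16 is installed; nothing beyond the imports themselves is implied about the Wizard API:

```python
# Quick check that the new top-level exports resolve; the Wizard API itself
# is not shown in this diff, so only the imports are exercised here.
import masster
from masster import Wizard, wizard_def  # new in 0.4.16

print(masster.__version__)
print("Wizard" in masster.__all__)  # True after this change
```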
{masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/find_consensus_def.py

@@ -32,7 +32,7 @@ class find_consensus_defaults:
                 "dtype": str,
                 "description": "Feature grouping algorithm",
                 "default": "qt",
-                "allowed_values": ["qt", "kd", "unlabeled", "
+                "allowed_values": ["qt", "kd", "unlabeled", "kd-nowarp"],
             },
             "min_samples": {
                 "dtype": int,
{masster-0.4.14 → masster-0.4.16}/src/masster/study/defaults/merge_def.py

@@ -9,55 +9,99 @@ class merge_defaults:
     """
     Parameter class for Study merge method.
 
-    This class encapsulates parameters for
-
+    This class encapsulates parameters for all merge algorithms including
+    method selection, grouping tolerances, and algorithm-specific parameters.
 
     Attributes:
-
-        min_samples (int): Minimum number of samples for a consensus feature. Default is
+        method (str): Merge method to use ('kd', 'qt', 'kd-nowarp', 'chunked'). Default is "kd".
+        min_samples (int): Minimum number of samples for a consensus feature. Default is 50.
+        rt_tol (float): RT tolerance for grouping (seconds). Default is 2.0.
+        mz_tol (float): m/z tolerance for grouping (Da for all methods). Default is 0.01.
+        chunk_size (int): Chunk size for 'chunked' method. Default is 500.
+        nr_partitions (int): Number of partitions in m/z dimension for KD algorithms. Default is 500.
+        min_rel_cc_size (float): Minimum relative connected component size for conflict resolution. Default is 0.3.
+        max_pairwise_log_fc (float): Maximum pairwise log fold change for conflict resolution. Default is 0.5.
+        max_nr_conflicts (int): Maximum number of conflicts allowed in consensus feature. Default is 0.
         link_ms2 (bool): Whether to link MS2 spectra to consensus features. Default is True.
-        mz_tol (float): m/z tolerance for grouping (Da). Default is 0.01.
-        rt_tol (float): RT tolerance for grouping (seconds). Default is 1.0.
     """
 
-
-    min_samples: int =
-
+    method: str = "kd"
+    min_samples: int = 10
+    rt_tol: float = 5.0
     mz_tol: float = 0.01
-
+    chunk_size: int = 300
+    nr_partitions: int = 1000
+    min_rel_cc_size: float = 0.2
+    max_pairwise_log_fc: float = -1.0
+    max_nr_conflicts: int = 0
+    link_ms2: bool = True
 
     _param_metadata: dict[str, dict[str, Any]] = field(
         default_factory=lambda: {
-            "
+            "method": {
                 "dtype": str,
-                "description": "
-                "default": "
-                "allowed_values": ["
+                "description": "Merge method (algorithm) to use",
+                "default": "kd",
+                "allowed_values": ["kd", "qt", "kd-nowarp", "chunked"],
             },
             "min_samples": {
                 "dtype": int,
                 "description": "Minimum number of samples for a consensus feature",
-                "default":
+                "default": 50,
                 "min_value": 1,
             },
-            "
-                "dtype":
-                "description": "
-                "default":
+            "rt_tol": {
+                "dtype": float,
+                "description": "RT tolerance for grouping (seconds)",
+                "default": 2.0,
+                "min_value": 0.1,
+                "max_value": 60.0,
             },
             "mz_tol": {
                 "dtype": float,
-                "description": "m/z tolerance for grouping (Da)",
+                "description": "m/z tolerance for grouping (Da for all methods)",
                 "default": 0.01,
                 "min_value": 0.001,
                 "max_value": 1.0,
             },
-            "
+            "chunk_size": {
+                "dtype": int,
+                "description": "Chunk size for 'chunked' method",
+                "default": 500,
+                "min_value": 10,
+            },
+            "nr_partitions": {
+                "dtype": int,
+                "description": "Number of partitions in m/z dimension for KD algorithms",
+                "default": 500,
+                "min_value": 10,
+                "max_value": 10000,
+            },
+            "min_rel_cc_size": {
                 "dtype": float,
-                "description": "
-                "default":
-                "min_value": 0.
-                "max_value":
+                "description": "Minimum relative connected component size for conflict resolution",
+                "default": 0.3,
+                "min_value": 0.0,
+                "max_value": 1.0,
+            },
+            "max_pairwise_log_fc": {
+                "dtype": float,
+                "description": "Maximum pairwise log fold change for conflict resolution",
+                "default": 0.5,
+                "min_value": 0.0,
+                "max_value": 10.0,
+            },
+            "max_nr_conflicts": {
+                "dtype": int,
+                "description": "Maximum number of conflicts allowed in consensus feature",
+                "default": 0,
+                "min_value": 0,
+                "max_value": 1000,
+            },
+            "link_ms2": {
+                "dtype": bool,
+                "description": "Whether to link MS2 spectra to consensus features",
+                "default": True,
             },
         },
         repr=False,
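Since merge_def.py now carries the full parameter set for every merge algorithm, a hedged usage sketch may help. It assumes merge_defaults is an ordinary dataclass importable from the path shown in the file listing; the validate_method helper is illustrative and not part of masster:

```python
# Sketch only: construct the new merge parameter object and check the chosen
# method against the metadata the class now declares. validate_method() is a
# hypothetical helper, not a masster API.
from masster.study.defaults.merge_def import merge_defaults

params = merge_defaults(method="chunked", rt_tol=2.0, mz_tol=0.01, chunk_size=500)

def validate_method(p: merge_defaults) -> None:
    meta = p._param_metadata["method"]
    if p.method not in meta["allowed_values"]:
        raise ValueError(f"method must be one of {meta['allowed_values']}, got {p.method!r}")

validate_method(params)
print(params.method, params.min_samples, params.nr_partitions)
```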
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -56,6 +56,45 @@ def _decode_bytes_attr(attr_value):
     return str(attr_value) if attr_value is not None else ""
 
 
+def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFrame:
+    """Create an empty DataFrame with the correct schema based on study5_schema.json."""
+    if df_name not in schema:
+        # Fallback to basic empty DataFrame if schema not found
+        return pl.DataFrame()
+
+    df_schema = schema[df_name]["columns"]
+    empty_data = {}
+    polars_schema = {}
+
+    for col_name, col_info in df_schema.items():
+        dtype_str = col_info["dtype"]
+        # Convert string representation to actual Polars dtype
+        if dtype_str == "pl.Int64":
+            polars_dtype = pl.Int64
+        elif dtype_str == "pl.Int32":
+            polars_dtype = pl.Int32
+        elif dtype_str == "pl.Float64":
+            polars_dtype = pl.Float64
+        elif dtype_str == "pl.Utf8":
+            polars_dtype = pl.Utf8
+        elif dtype_str == "pl.String":
+            polars_dtype = pl.String
+        elif dtype_str == "pl.Boolean":
+            polars_dtype = pl.Boolean
+        elif dtype_str == "pl.Object":
+            polars_dtype = pl.Object
+        elif dtype_str == "pl.Null":
+            polars_dtype = pl.Null
+        else:
+            # Fallback to string if unknown type
+            polars_dtype = pl.String
+
+        empty_data[col_name] = []
+        polars_schema[col_name] = polars_dtype
+
+    return pl.DataFrame(empty_data, schema=polars_schema)
+
+
 def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
     """
     Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
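The helper added above maps the string dtype names stored in study5_schema.json onto Polars dtypes. A standalone sketch of that mapping, using a hypothetical two-column schema rather than the real schema file:

```python
# Standalone illustration of the dtype-string -> Polars dtype mapping the new
# helper performs. The schema dict below is a made-up two-column example.
import polars as pl

DTYPE_MAP = {
    "pl.Int64": pl.Int64,
    "pl.Float64": pl.Float64,
    "pl.Utf8": pl.Utf8,
    "pl.Boolean": pl.Boolean,
}

schema = {
    "samples_df": {
        "columns": {
            "sample_uid": {"dtype": "pl.Int64"},
            "sample_name": {"dtype": "pl.Utf8"},
        }
    }
}

columns = schema["samples_df"]["columns"]
# Unknown dtype strings fall back to a string column, mirroring the helper.
polars_schema = {name: DTYPE_MAP.get(info["dtype"], pl.String) for name, info in columns.items()}
empty = pl.DataFrame({name: [] for name in columns}, schema=polars_schema)
print(empty.shape, empty.schema)  # empty, but correctly typed
```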
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -1080,7 +1119,7 @@ def _save_study5_compressed(self, filename):
     if not filename.endswith(".study5"):
         filename += ".study5"
 
-    self.logger.debug(f"
+    self.logger.debug(f"Save study")
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -1132,7 +1171,7 @@ def _save_study5_compressed(self, filename):
 
     with tqdm(
         total=total_steps,
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study ({sum(count for _, count in dataframes_to_save)} total rows)",
         disable=tdqm_disable,
     ) as pbar:
         # Create groups for organization
@@ -1186,8 +1225,11 @@ def _save_study5_compressed(self, filename):
                 )
                 pbar.update(1)
 
-
+            # Store features_df - use fast method that skips chrom and ms2_specs columns
             if self.features_df is not None and not self.features_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving features ({len(self.features_df)} rows, compressed)"
+                )
                 self.logger.debug(
                     f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
                 )
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -1411,7 +1453,7 @@ def _save_study5(self, filename):
     if not filename.endswith(".study5"):
         filename += ".study5"
 
-    self.logger.info(
+    self.logger.info("Save study...")
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -1463,7 +1505,7 @@ def _save_study5(self, filename):
 
     with tqdm(
         total=total_steps,
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study ({sum(count for _, count in dataframes_to_save)} total rows)",
         disable=tdqm_disable,
     ) as pbar:
         # Create groups for organization
@@ -1498,12 +1540,12 @@ def _save_study5(self, filename):
                 metadata_group.create_dataset("parameters", data="")
 
             pbar.update(1)
-            pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
-            )
 
             # Store samples_df - use optimized batch processing
             if self.samples_df is not None and not self.samples_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving samples ({len(self.samples_df)} rows)"
+                )
                 samples_group = f.create_group("samples")
                 self.logger.debug(
                     f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
@@ -1519,6 +1561,9 @@ def _save_study5(self, filename):
 
             # Store features_df - use optimized batch processing
             if self.features_df is not None and not self.features_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving features ({len(self.features_df)} rows)"
+                )
                 self.logger.debug(
                     f"Saving features_df with {len(self.features_df)} rows using optimized method",
                 )
@@ -1533,6 +1578,9 @@ def _save_study5(self, filename):
 
             # Store consensus_df - use optimized batch processing
            if self.consensus_df is not None and not self.consensus_df.is_empty():
+                pbar.set_description(
+                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving consensus ({len(self.consensus_df)} rows)"
+                )
                 self.logger.debug(
                     f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
                 )
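Both save paths now fold row counts into the progress-bar text and update the description per dataframe. A self-contained sketch of that reporting pattern; the dataframe names and counts below are made up:

```python
# Illustrative only: mimics the timestamped, row-count-aware tqdm descriptions
# the save functions now emit. Dataframe names and counts are hypothetical.
import time
from datetime import datetime
from tqdm import tqdm

dataframes_to_save = [("samples", 12), ("features", 48210), ("consensus", 3150)]
total_rows = sum(count for _, count in dataframes_to_save)

with tqdm(total=len(dataframes_to_save), desc=f"Saving study ({total_rows} total rows)", unit="step") as pbar:
    for name, rows in dataframes_to_save:
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        pbar.set_description(f"{ts} | INFO | Saving {name} ({rows} rows)")
        time.sleep(0.05)  # stand-in for the actual HDF5 write
        pbar.update(1)
```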
{masster-0.4.14 → masster-0.4.16}/src/masster/study/h5.py

@@ -1690,8 +1738,9 @@ def _load_study5(self, filename=None):
     # Use progress bar to show loading progress
     with tqdm(
         total=len(loading_steps),
-        desc=
+        desc="Loading study",
         disable=tdqm_disable,
+        unit="step"
     ) as pbar:
         # Load metadata
         pbar.set_description(
@@ -1792,83 +1841,7 @@ def _load_study5(self, filename=None):
                 self.logger.debug(
                     "No samples data found in study5 file. Initializing empty samples_df.",
                 )
-                self.samples_df =
-                    {
-                        "sample_uid": [],
-                        "sample_name": [],
-                        "sample_path": [],
-                        "sample_type": [],
-                        "size": [],
-                        "map_id": [],
-                        "sample_source": [],
-                        "num_ms1": [],
-                        "num_ms2": [],
-                        "sample_group": [],
-                        "sample_batch": [],
-                        "sample_sequence": [],
-                    },
-                    schema={
-                        "sample_uid": pl.Int64,
-                        "sample_name": pl.Utf8,
-                        "sample_path": pl.Utf8,
-                        "sample_type": pl.Utf8,
-                        "size": pl.Int64,
-                        "map_id": pl.Int64,
-                        "sample_source": pl.Utf8,
-                        "num_ms1": pl.Int64,
-                        "num_ms2": pl.Int64,
-                        "sample_group": pl.Utf8,
-                        "sample_batch": pl.Int64,
-                        "sample_sequence": pl.Int64,
-                    },
-                )
-            pbar.update(1)
-            # Load samples_df
-            pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
-            )
-            if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
-                    f["samples"],
-                    schema,
-                    "samples_df",
-                    self.logger,
-                )
-            else:
-                # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
-                    "No samples data found in study5 file. Initializing empty samples_df.",
-                )
-                self.samples_df = pl.DataFrame(
-                    {
-                        "sample_uid": [],
-                        "sample_name": [],
-                        "sample_path": [],
-                        "sample_type": [],
-                        "size": [],
-                        "map_id": [],
-                        "sample_source": [],
-                        "num_ms1": [],
-                        "num_ms2": [],
-                        "sample_group": [],
-                        "sample_batch": [],
-                        "sample_sequence": [],
-                    },
-                    schema={
-                        "sample_uid": pl.Int64,
-                        "sample_name": pl.Utf8,
-                        "sample_path": pl.Utf8,
-                        "sample_type": pl.Utf8,
-                        "size": pl.Int64,
-                        "map_id": pl.Int64,
-                        "sample_source": pl.Utf8,
-                        "num_ms1": pl.Int64,
-                        "num_ms2": pl.Int64,
-                        "sample_group": pl.Utf8,
-                        "sample_batch": pl.Int64,
-                        "sample_sequence": pl.Int64,
-                    },
-                )
+                self.samples_df = _create_empty_dataframe_from_schema("samples_df", schema)
             pbar.update(1)
 
             # Load features_df
@@ -1885,7 +1858,7 @@ def _load_study5(self, filename=None):
                     object_columns,
                 )
             else:
-                self.features_df =
+                self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
             pbar.update(1)
 
             # Load consensus_df
@@ -1942,7 +1915,7 @@ def _load_study5(self, filename=None):
                     ],
                 )
             else:
-                self.consensus_df =
+                self.consensus_df = _create_empty_dataframe_from_schema("consensus_df", schema)
             pbar.update(1)
 
             # Load consensus_mapping_df
@@ -1957,21 +1930,7 @@ def _load_study5(self, filename=None):
                    self.logger,
                )
            else:
-                self.consensus_mapping_df =
-            pbar.update(1)
-            # Load consensus_mapping_df
-            pbar.set_description(
-                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
-            )
-            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
-                self.consensus_mapping_df = _load_dataframe_from_group(
-                    f["consensus_mapping"],
-                    schema,
-                    "consensus_mapping_df",
-                    self.logger,
-                )
-            else:
-                self.consensus_mapping_df = None
+                self.consensus_mapping_df = _create_empty_dataframe_from_schema("consensus_mapping_df", schema)
             pbar.update(1)
 
             # Load consensus_ms2
@@ -1988,7 +1947,7 @@ def _load_study5(self, filename=None):
                     object_columns,
                 )
             else:
-                self.consensus_ms2 =
+                self.consensus_ms2 = _create_empty_dataframe_from_schema("consensus_ms2", schema)
             pbar.update(1)
 
             # Load lib_df
@@ -2004,7 +1963,7 @@ def _load_study5(self, filename=None):
                     [],
                 )
             else:
-                self.lib_df =
+                self.lib_df = _create_empty_dataframe_from_schema("lib_df", schema)
             pbar.update(1)
 
             # Load id_df
@@ -2020,7 +1979,7 @@ def _load_study5(self, filename=None):
                     [],
                 )
             else:
-                self.id_df =
+                self.id_df = _create_empty_dataframe_from_schema("id_df", schema)
             pbar.update(1)
 
             # Check and migrate old string-based map_id to integer indices
{masster-0.4.14 → masster-0.4.16}/src/masster/study/id.py

@@ -1291,7 +1291,7 @@ def _get_adducts(study, adducts_list: list = None, **kwargs):
 
     logger = getattr(study, "logger", None)
     if logger:
-        logger.
+        logger.trace(
             f"Study adducts: generated {adducts_before_filter}, filtered to {adducts_after_filter} (min_prob={min_probability})",
         )
 
{masster-0.4.14 → masster-0.4.16}/src/masster/study/load.py

@@ -214,13 +214,18 @@ def load(self, filename=None):
 
     # self.logger.info(f"Loading study from {filename}")
     self._load_study5(filename)
-
-
-    if
-
-
+
+    # After loading the study, check if we have consensus features before loading consensus XML
+    if (self.consensus_df is not None and not self.consensus_df.is_empty()):
+        consensus_xml_path = filename.replace(".study5", ".consensusXML")
+        if os.path.exists(consensus_xml_path):
+            self._load_consensusXML(filename=consensus_xml_path)
+            # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
+        else:
+            self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
     else:
-        self.logger.
+        self.logger.debug("No consensus features found, skipping consensusXML loading")
+
     self.filename = filename
 
 
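The load() change above auto-loads a consensusXML file sitting next to the .study5 file whenever the loaded study already contains consensus features, and warns if that sibling file is missing. A small sketch of the sibling-path convention; the paths are examples only:

```python
# Sketch of the sibling-file lookup load() now performs; the study path is hypothetical.
import os

study_path = "experiments/plasma_batch1.study5"  # example path
consensus_xml_path = study_path.replace(".study5", ".consensusXML")

if os.path.exists(consensus_xml_path):
    print(f"load() would also call _load_consensusXML on {consensus_xml_path}")
else:
    print(f"load() would warn that no consensus XML exists at {consensus_xml_path}")
```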