PyPI - masster - Versions diffs - 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

masster 0.2.1 (py3-none-any.whl) → 0.2.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (10)

masster/_version.py +1 -1
masster/study/h5.py +317 -137
masster/study/load.py +23 -134
masster/study/save.py +0 -6
masster/study/study.py +26 -3
{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/METADATA +8 -151
{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/RECORD +10 -10
{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/WHEEL +0 -0
{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/entry_points.txt +0 -0
{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/licenses/LICENSE +0 -0

masster/_version.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.2.1"
+__version__ = "0.2.3"
 def get_version():

masster/study/h5.py CHANGED Viewed

@@ -2,25 +2,7 @@
 _study_h5.py
 This module provides HDF5-based save/load functionality for the Study class.
-It handles seria        elif col == "chrom":
-            #        elif col == "spectrum":
-            # Handle single Spectrum objects
-            data_as_str = []
-            for item in data:
-                if item is not None:
-                    data_as_str.append(item.to_json())
-                else:
-                    data_as_str.append("None")
-            group.create_dataset(col, data=data_as_str, **optimal_compression)hromatogram objects
-            data_as_str = []
-            for item in data:
-                if item is not None:
-                    data_as_str.append(item.to_json())
-                else:
-                    data_as_str.append("None")
-            group.create_dataset(col, data=data_as_str, **optimal_compression)           else:
-                    data_as_str.append("null")
-            group.create_dataset(col, data=data_as_str, **optimal_compression)n and deserialization of Polars DataFrames with complex objects
+It handles serialization and deserialization of Polars DataFrames with complex objects
 like Chromatogram and Spectrum instances.
 Key Features:
@@ -449,7 +431,7 @@ def _save_study5(self, filename=None):
     if not filename.endswith(".study5"):
         filename += ".study5"
-    self.logger.debug(f"Saving study to {filename}")
+    self.logger.info(f"Saving study to {filename}")
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -529,7 +511,7 @@ def _save_study5(self, filename=None):
                 data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
                 _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)
-    self.logger.info(f"Study saved to {filename}")
+    self.logger.debug(f"Save completed for {filename}")
 def _load_study5(self, filename=None):
@@ -551,6 +533,11 @@ def _load_study5(self, filename=None):
         - Properly handles MS2 scan lists and spectrum lists
         - Restores parameters dictionary from JSON serialization
     """
+    from datetime import datetime
+    from tqdm import tqdm
+    self.logger.info(f"Loading study from {filename}")
     # Handle default filename
     if filename is None:
         if self.default_folder is not None:
@@ -573,134 +560,327 @@ def _load_study5(self, filename=None):
     if not schema:
         self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
-    with h5py.File(filename, "r") as f:
-        # Load metadata
-        if "metadata" in f:
-            metadata = f["metadata"]
-            self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
-            if hasattr(self, "label"):
-                self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
-            # Load parameters from JSON
-            if "parameters" in metadata:
-                try:
-                    parameters_data = metadata["parameters"][()]
-                    if isinstance(parameters_data, bytes):
-                        parameters_data = parameters_data.decode("utf-8")
+    # Define loading steps for progress tracking
+    loading_steps = [
+        "metadata",
+        "samples_df",
+        "features_df",
+        "consensus_df",
+        "consensus_mapping_df",
+        "consensus_ms2"
+    ]
+    # Check if progress bar should be disabled based on log level
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-                    if parameters_data and parameters_data != "":
-                        self.history = json.loads(parameters_data)
-                    else:
+    with h5py.File(filename, "r") as f:
+        # Use progress bar to show loading progress
+        with tqdm(
+            total=len(loading_steps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading study",
+            disable=tdqm_disable,
+        ) as pbar:
+            # Load metadata
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading metadata")
+            if "metadata" in f:
+                metadata = f["metadata"]
+                self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+                if hasattr(self, "label"):
+                    self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+                # Load parameters from JSON
+                if "parameters" in metadata:
+                    try:
+                        parameters_data = metadata["parameters"][()]
+                        if isinstance(parameters_data, bytes):
+                            parameters_data = parameters_data.decode("utf-8")
+                        if parameters_data and parameters_data != "":
+                            self.history = json.loads(parameters_data)
+                        else:
+                            self.history = {}
+                    except (json.JSONDecodeError, ValueError, TypeError) as e:
+                        self.logger.warning(f"Failed to deserialize parameters: {e}")
                         self.history = {}
-                except (json.JSONDecodeError, ValueError, TypeError) as e:
-                    self.logger.warning(f"Failed to deserialize parameters: {e}")
+                else:
                     self.history = {}
-            else:
-                self.history = {}
-            # Reconstruct self.parameters from loaded history
-            from masster.study.defaults.study_def import study_defaults
-            # Always create a fresh study_defaults object to ensure we have all defaults
-            self.parameters = study_defaults()
-            # Update parameters from loaded history if available
-            if self.history and "study" in self.history:
-                study_params = self.history["study"]
-                if isinstance(study_params, dict):
-                    failed_params = self.parameters.set_from_dict(study_params, validate=False)
-                    if failed_params:
-                        self.logger.debug(f"Could not set study parameters: {failed_params}")
+                # Reconstruct self.parameters from loaded history
+                from masster.study.defaults.study_def import study_defaults
+                # Always create a fresh study_defaults object to ensure we have all defaults
+                self.parameters = study_defaults()
+                # Update parameters from loaded history if available
+                if self.history and "study" in self.history:
+                    study_params = self.history["study"]
+                    if isinstance(study_params, dict):
+                        failed_params = self.parameters.set_from_dict(study_params, validate=False)
+                        if failed_params:
+                            self.logger.debug(f"Could not set study parameters: {failed_params}")
+                        else:
+                            self.logger.debug("Successfully updated parameters from loaded history")
                     else:
-                        self.logger.debug("Successfully updated parameters from loaded history")
+                        self.logger.debug("Study parameters in history are not a valid dictionary")
                 else:
-                    self.logger.debug("Study parameters in history are not a valid dictionary")
+                    self.logger.debug("No study parameters found in history, using defaults")
+                # Synchronize instance attributes with parameters (similar to __init__)
+                # Note: default_folder and label are already loaded from metadata attributes above
+                # but we ensure they match the parameters for consistency
+                if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
+                    self.default_folder = self.parameters.default_folder
+                if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+                    self.label = self.parameters.label
+                if hasattr(self.parameters, 'log_level'):
+                    self.log_level = self.parameters.log_level
+                if hasattr(self.parameters, 'log_label'):
+                    self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
+                if hasattr(self.parameters, 'log_sink'):
+                    self.log_sink = self.parameters.log_sink
+            pbar.update(1)
+            # Load samples_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading samples")
+            if "samples" in f and len(f["samples"].keys()) > 0:
+                self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
             else:
-                self.logger.debug("No study parameters found in history, using defaults")
-            # Synchronize instance attributes with parameters (similar to __init__)
-            # Note: default_folder and label are already loaded from metadata attributes above
-            # but we ensure they match the parameters for consistency
-            if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
-                self.default_folder = self.parameters.default_folder
-            if hasattr(self.parameters, 'label') and self.parameters.label is not None:
-                self.label = self.parameters.label
-            if hasattr(self.parameters, 'log_level'):
-                self.log_level = self.parameters.log_level
-            if hasattr(self.parameters, 'log_label'):
-                self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
-            if hasattr(self.parameters, 'log_sink'):
-                self.log_sink = self.parameters.log_sink
-        # Load samples_df
-        if "samples" in f and len(f["samples"].keys()) > 0:
-            self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
+                # Initialize empty samples_df with the correct schema if no data exists
+                self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+                self.samples_df = pl.DataFrame(
+                    {
+                        "sample_uid": [],
+                        "sample_name": [],
+                        "sample_path": [],
+                        "sample_type": [],
+                        "size": [],
+                        "map_id": [],
+                    },
+                    schema={
+                        "sample_uid": pl.Int64,
+                        "sample_name": pl.Utf8,
+                        "sample_path": pl.Utf8,
+                        "sample_type": pl.Utf8,
+                        "size": pl.Int64,
+                        "map_id": pl.Utf8,
+                    },
+                )
+            pbar.update(1)
+            # Load features_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading features")
+            if "features" in f and len(f["features"].keys()) > 0:
+                object_columns = ["chrom", "ms2_scans", "ms2_specs"]
+                self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+            else:
+                self.features_df = None
+            pbar.update(1)
+            # Load consensus_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading consensus")
+            if "consensus" in f and len(f["consensus"].keys()) > 0:
+                self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
+            else:
+                self.consensus_df = None
+            pbar.update(1)
+            # Load consensus_mapping_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading consensus mapping")
+            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+                self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+            else:
+                self.consensus_mapping_df = None
+            pbar.update(1)
+            # Load consensus_ms2
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading consensus MS2")
+            if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
+                object_columns = ["spec"]
+                self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+            else:
+                self.consensus_ms2 = None
+            pbar.update(1)
+    self.logger.info(f"Study loaded from {filename}")
+def _load_h5(self, filename=None):
+    """
+    Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
+    This is a legacy method for loading older HDF5 format files. For new files,
+    use _load_study5() which has improved schema handling and performance.
+    Args:
+        filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
+    Returns:
+        None (modifies self in place)
+    Notes:
+        - Legacy format loader with basic DataFrame reconstruction
+        - Includes progress bar for loading steps
+        - For new projects, prefer _load_study5() method
+    """
+    from datetime import datetime
+    from tqdm import tqdm
+    # Handle default filename
+    if filename is None:
+        if self.default_folder is not None:
+            filename = os.path.join(self.default_folder, "study.h5")
         else:
-            # Initialize empty samples_df with the correct schema if no data exists
-            self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
-            self.samples_df = pl.DataFrame(
-                {
-                    "sample_uid": [],
-                    "sample_name": [],
-                    "sample_path": [],
-                    "sample_type": [],
-                    "size": [],
-                    "map_id": [],
-                },
-                schema={
-                    "sample_uid": pl.Int64,
-                    "sample_name": pl.Utf8,
-                    "sample_path": pl.Utf8,
-                    "sample_type": pl.Utf8,
-                    "size": pl.Int64,
-                    "map_id": pl.Utf8,
-                },
-            )
-            # Initialize empty samples_df with the correct schema if no data exists
-            self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
-            self.samples_df = pl.DataFrame(
-                {
+            self.logger.error("Either filename or default_folder must be provided")
+            return
+    # Add .h5 extension if not provided
+    if not filename.endswith(".h5"):
+        filename += ".h5"
+    if not os.path.exists(filename):
+        self.logger.error(f"File {filename} does not exist")
+        return
+    # Define loading steps for progress tracking
+    loading_steps = [
+        "metadata",
+        "samples_df",
+        "features_df",
+        "consensus_df",
+        "consensus_mapping_df"
+    ]
+    # Check if progress bar should be disabled based on log level
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+    with h5py.File(filename, "r") as f:
+        # Use progress bar to show loading progress
+        with tqdm(
+            total=len(loading_steps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading legacy study",
+            disable=tdqm_disable,
+        ) as pbar:
+            # Load metadata
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading metadata")
+            if "metadata" in f:
+                metadata = f["metadata"]
+                self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+                if hasattr(self, "label"):
+                    self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+                # Load parameters from JSON if available
+                if "parameters" in metadata:
+                    try:
+                        parameters_data = metadata["parameters"][()]
+                        if isinstance(parameters_data, bytes):
+                            parameters_data = parameters_data.decode("utf-8")
+                        if parameters_data and parameters_data != "":
+                            self.history = json.loads(parameters_data)
+                        else:
+                            self.history = {}
+                    except (json.JSONDecodeError, ValueError, TypeError) as e:
+                        self.logger.warning(f"Failed to deserialize parameters: {e}")
+                        self.history = {}
+                else:
+                    self.history = {}
+            pbar.update(1)
+            # Load samples_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading samples")
+            if "samples" in f and len(f["samples"].keys()) > 0:
+                samples_data = {}
+                for col in f["samples"].keys():
+                    column_data = f["samples"][col][:]
+                    # Handle byte strings
+                    if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                        column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                    samples_data[col] = column_data
+                if samples_data:
+                    self.samples_df = pl.DataFrame(samples_data)
+                else:
+                    # Initialize empty samples_df
+                    self.samples_df = pl.DataFrame({
+                        "sample_uid": [],
+                        "sample_name": [],
+                        "sample_path": [],
+                        "sample_type": [],
+                        "size": [],
+                        "map_id": [],
+                    })
+            else:
+                self.samples_df = pl.DataFrame({
                     "sample_uid": [],
                     "sample_name": [],
                     "sample_path": [],
                     "sample_type": [],
                     "size": [],
                     "map_id": [],
-                },
-                schema={
-                    "sample_uid": pl.Int64,
-                    "sample_name": pl.Utf8,
-                    "sample_path": pl.Utf8,
-                    "sample_type": pl.Utf8,
-                    "size": pl.Int64,
-                    "map_id": pl.Utf8,
-                },
-            )
-        # Load features_df
-        if "features" in f and len(f["features"].keys()) > 0:
-            object_columns = ["chrom", "ms2_scans", "ms2_specs"]
-            self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
-        else:
-            self.features_df = None
-        # Load consensus_df
-        if "consensus" in f and len(f["consensus"].keys()) > 0:
-            self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
-        else:
-            self.consensus_df = None
-        # Load consensus_mapping_df
-        if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
-            self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
-        else:
-            self.consensus_mapping_df = None
-        # Load consensus_ms2
-        if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
-            object_columns = ["spec"]
-            self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
-        else:
-            self.consensus_ms2 = None
+                })
+            pbar.update(1)
+            # Load features_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading features")
+            if "features" in f and len(f["features"].keys()) > 0:
+                features_data = {}
+                for col in f["features"].keys():
+                    column_data = f["features"][col][:]
+                    # Handle special object columns
+                    if col in ["chrom", "ms2_specs"]:
+                        reconstructed_data = _reconstruct_object_column(column_data, col)
+                        features_data[col] = reconstructed_data
+                    else:
+                        # Handle byte strings
+                        if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                            column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                        features_data[col] = column_data
+                if features_data:
+                    # Create DataFrame with Object columns handled properly
+                    object_columns = ["chrom", "ms2_specs"]
+                    self.features_df = _create_dataframe_with_objects(features_data, object_columns)
+                else:
+                    self.features_df = None
+            else:
+                self.features_df = None
+            pbar.update(1)
+            # Load consensus_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading consensus")
+            if "consensus" in f and len(f["consensus"].keys()) > 0:
+                consensus_data = {}
+                for col in f["consensus"].keys():
+                    column_data = f["consensus"][col][:]
+                    # Handle byte strings
+                    if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                        column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                    consensus_data[col] = column_data
+                if consensus_data:
+                    self.consensus_df = pl.DataFrame(consensus_data)
+                else:
+                    self.consensus_df = None
+            else:
+                self.consensus_df = None
+            pbar.update(1)
+            # Load consensus_mapping_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO     | {self.log_label}Loading consensus mapping")
+            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+                mapping_data = {}
+                for col in f["consensus_mapping"].keys():
+                    column_data = f["consensus_mapping"][col][:]
+                    mapping_data[col] = column_data
+                if mapping_data:
+                    self.consensus_mapping_df = pl.DataFrame(mapping_data)
+                else:
+                    self.consensus_mapping_df = None
+            else:
+                self.consensus_mapping_df = None
+            pbar.update(1)
-    self.logger.info(f"Study loaded from {filename}")
+    self.logger.info(f"Legacy study loaded from {filename}")

masster/study/load.py CHANGED Viewed

@@ -256,7 +256,8 @@ def load(self, filename=None):
         else:
             self.logger.error("Either filename or default_folder must be provided")
             return
+    self.logger.info(f"Loading study from {filename}")
     self._load_study5(filename)
     # After loading the study, check if consensus XML exists and load it
     consensus_xml_path = filename.replace(".study5", ".consensusXML")
@@ -267,13 +268,13 @@ def load(self, filename=None):
         self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-def fill_chrom(
+def fill_chrom_single(
     self,
-    uids=fill_chrom_defaults().uids,
-    mz_tol=fill_chrom_defaults().mz_tol,
-    rt_tol=fill_chrom_defaults().rt_tol,
-    min_samples_rel=fill_chrom_defaults().min_samples_rel,
-    min_samples_abs=fill_chrom_defaults().min_samples_abs,
+    uids=None,
+    mz_tol: float = 0.010,
+    rt_tol: float = 10.0,
+    min_samples_rel: float = 0.0,
+    min_samples_abs: int = 2,
 ):
     """Fill missing chromatograms by extracting from raw data.
@@ -281,10 +282,10 @@ def fill_chrom(
     Args:
         uids: Consensus UIDs to process (default: all)
-        mz_tol: m/z tolerance for extraction
-        rt_tol: RT tolerance for extraction
-        min_samples_rel: Relative minimum sample threshold
-        min_samples_abs: Absolute minimum sample threshold
+        mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+        rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+        min_samples_rel: Relative minimum sample threshold (default: 0.0)
+        min_samples_abs: Absolute minimum sample threshold (default: 2)
     """
     uids = self._get_consensus_uids(uids)
@@ -685,28 +686,28 @@ def _process_sample_for_parallel_fill(
     return new_features, new_mapping, counter
-def fill_chrom_parallel(
+def fill_chrom(
     self,
-    uids=fill_chrom_defaults().uids,
-    mz_tol=fill_chrom_defaults().mz_tol,
-    rt_tol=fill_chrom_defaults().rt_tol,
-    min_samples_rel=fill_chrom_defaults().min_samples_rel,
-    min_samples_abs=fill_chrom_defaults().min_samples_abs,
+    uids=None,
+    mz_tol: float = 0.010,
+    rt_tol: float = 10.0,
+    min_samples_rel: float = 0.0,
+    min_samples_abs: int = 2,
     num_workers=4,
 ):
     """Fill missing chromatograms by extracting from raw data using parallel processing.
     Args:
         uids: Consensus UIDs to process (default: all)
-        mz_tol: m/z tolerance for extraction
-        rt_tol: RT tolerance for extraction
-        min_samples_rel: Relative minimum sample threshold
-        min_samples_abs: Absolute minimum sample threshold
+        mz_tol: m/z tolerance for extraction (default: 0.010 Da)
+        rt_tol: RT tolerance for extraction (default: 10.0 seconds)
+        min_samples_rel: Relative minimum sample threshold (default: 0.0)
+        min_samples_abs: Absolute minimum sample threshold (default: 2)
         num_workers: Number of parallel workers (default: 4)
     """
     uids = self._get_consensus_uids(uids)
-    self.logger.info("Gap filling...")
+    self.logger.info(f"Gap filling with {num_workers} workers...")
     self.logger.debug(
         f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
     )
@@ -1075,115 +1076,3 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
-"""def find_features(
-    self,
-    reset=None,
-    chrom_peak_snr=None,
-    noise=None,
-    chrom_fwhm=None,
-    chrom_coherence=None,
-    prominence_scaled=None,
-    link_ms2=None,
-    save_mgf=None,
-    save_stats=None,
-):
-    self.logger.debug("Finding features for all samples in the study.")
-    # Initialize default parameters inside the function
-    if reset is None:
-        reset = False
-    if chrom_peak_snr is None:
-        chrom_peak_snr = 10.0
-    if noise is None:
-        noise = 200
-    # Create parameter object and update with provided values
-    params = fill_chrom_defaults()
-    # Set explicit parameters
-    params.set('uids', uids, validate=True)
-    params.set('mz_tol', mz_tol, validate=True)
-    params.set('rt_tol', rt_tol, validate=True)
-    params.set('min_samples_rel', min_samples_rel, validate=True)
-    params.set('min_samples_abs', min_samples_abs, validate=True)
-    # Store parameters in the Study object
-    self.store_history(["fill_chrom"], params.to_dict())
-    self.logger.debug("Parameters stored to fill_chrom")
-    if chrom_fwhm is None:
-        chrom_fwhm = 1.0
-    if chrom_coherence is None:
-        chrom_coherence = 0.3
-    if prominence_scaled is None:
-        prominence_scaled = 1.0
-    if link_ms2 is None:
-        link_ms2 = True
-    if save_mgf is None:
-        save_mgf = False
-    if save_stats is None:
-        save_stats = False
-    # iterate over all samples in samples_df - using Polars iteration
-    for index, row_dict in enumerate(self.samples_df.iter_rows(named=True)):
-        # check if features_maps is None
-        if self.features_maps[index] is not None and not reset:
-            # skip this sample
-            continue
-        if self.features_maps[index] is not None and not reset:
-            # skip this sample
-            continue
-        # load the sample
-        ddaobj = Sample(row_dict["sample_path"])
-        # find features
-        ddaobj.find_features(
-            chrom_peak_snr=chrom_peak_snr,
-            noise=noise,
-            chrom_fwhm=chrom_fwhm,
-        )
-        ddaobj.filter_features(
-            prominence_scaled=prominence_scaled,
-            coherence=chrom_coherence,
-        )
-        # link MS2
-        if link_ms2:
-            ddaobj.find_ms2()
-        # add to features_maps at the index of the sample
-        self.features_maps[index] = ddaobj.features
-        # add to features_df
-        f_df = ddaobj.features_df.clone()
-        # add column 'feature_uid' with the uid as uint64
-        f_df = f_df.with_columns(pl.lit(row_dict["sample_uid"]).alias("sample_uid"))
-        # move sample_uid to the first column
-        other_cols = [col for col in f_df.columns if col != "sample_uid"]
-        f_df = f_df.select(["sample_uid"] + other_cols)
-        offset = (
-            self.features_df.get_column("feature_uid").max() + 1
-            if not self.features_df.is_empty()
-            else 1
-        )
-        f_df = f_df.with_columns(
-            pl.int_range(offset, offset + len(f_df)).alias("feature_uid"),
-        )
-        # remove all rows with sample_uid=row_dict['sample_uid']
-        self.features_df = self.features_df.filter(
-            pl.col("sample_uid") != row_dict["sample_uid"],
-        )
-        self.features_df = pl.concat([self.features_df, f_df])
-        if self.default_folder is not None:
-            bname = os.path.join(self.default_folder, row_dict["sample_name"])
-            ddaobj.save(filename=bname + ".mzpkl")
-            ddaobj.save_features(filename=bname + ".featureXML")
-        else:
-            bname = row_dict["sample_path"].replace(".mzpkl", "").replace(".wiff", "")
-            ddaobj.save(filename=bname + ".mzpkl")
-            ddaobj.save_features(filename=bname + ".featureXML")
-        if save_stats:
-            ddaobj.save_stats(filename=bname + "_stats.csv")
-        if save_mgf:
-            ddaobj.save_mgf(filename=bname + ".mgf", include_all_ms1=True)
-"""

masster/study/save.py CHANGED Viewed

@@ -122,12 +122,6 @@ def _save_consensusXML(self, filename:str):
         return
     fh = oms.ConsensusXMLFile()
-    # check if filename includes any path
-    if not os.path.isabs(filename):
-        if self.default_folder is not None:
-            filename = os.path.join(self.default_folder, filename)
-        else:
-            filename = os.path.join(os.getcwd(), filename)
     fh.store(filename, self.consensus_map)
     self.logger.info(f"Saved consensus map to {filename}")

masster/study/study.py CHANGED Viewed

@@ -71,8 +71,8 @@ from masster.study.helpers import set_default_folder
 from masster.study.load import add_folder
 from masster.study.load import add_sample
 from masster.study.load import (
+    fill_chrom_single,
     fill_chrom,
-    fill_chrom_parallel,
     _process_sample_for_parallel_fill,
 )
 from masster.study.load import _get_missing_consensus_sample_combinations
@@ -147,6 +147,7 @@ class Study:
     def __init__(
         self,
+        filename=None,
         **kwargs,
     ):
         """
@@ -156,6 +157,10 @@ class Study:
         data storage, and processing parameters used for study-level analysis.
         Parameters:
+            filename (str, optional): Path to a .study5 file to load automatically.
+                                    If provided, the default_folder will be set to the
+                                    directory containing this file, and the study will
+                                    be loaded automatically.
             **kwargs: Keyword arguments for setting study parameters. Can include:
                      - A study_defaults instance to set all parameters at once (pass as params=study_defaults(...))
                      - Individual parameter names and values (see study_defaults for available parameters)
@@ -172,6 +177,20 @@ class Study:
         """
         # Initialize default parameters
+        # Handle filename parameter for automatic loading
+        auto_load_filename = None
+        if filename is not None:
+            if not filename.endswith('.study5'):
+                raise ValueError("filename must be a .study5 file")
+            if not os.path.exists(filename):
+                raise FileNotFoundError(f"Study file not found: {filename}")
+            # Set default_folder to the directory containing the file if not already specified
+            if 'default_folder' not in kwargs:
+                kwargs['default_folder'] = os.path.dirname(os.path.abspath(filename))
+            auto_load_filename = filename
         # Check if a study_defaults instance was passed
         if "params" in kwargs and isinstance(kwargs["params"], study_defaults):
             params = kwargs.pop("params")
@@ -234,6 +253,10 @@ class Study:
             sink=self.log_sink
         )
+        # Auto-load study file if filename was provided
+        if auto_load_filename is not None:
+            self.load(filename=auto_load_filename)
     # Attach module functions as class methods
@@ -242,7 +265,7 @@ class Study:
     save_consensus = save_consensus
     save_samples = save_samples
     align = align
-    fill_chrom = fill_chrom
+    fill_chrom_single = fill_chrom_single
     find_consensus = find_consensus
     find_ms2 = find_ms2
     integrate_chrom = integrate_chrom
@@ -276,7 +299,7 @@ class Study:
     get_gaps_stats = get_gaps_stats
     get_orphans = get_orphans
     set_default_folder = set_default_folder
-    fill_chrom_parallel = fill_chrom_parallel
+    fill_chrom = fill_chrom
     _process_sample_for_parallel_fill = _process_sample_for_parallel_fill
     _get_missing_consensus_sample_combinations = _get_missing_consensus_sample_combinations
     _load_consensusXML = _load_consensusXML

{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.2.1
+Version: 0.2.3
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
@@ -730,18 +730,11 @@ Description-Content-Type: text/markdown
 # MASSter
-**MASSter** is a comprehensive Python package for mass spectrometry data analysis, designed for metabolomics and LC-MS data processing. It provides tools for feature detection, alignment, consensus building, and interactive visualization of mass spectrometry datasets.
+**MASSter** is a comprehensive Python package for mass spectrometry data analysis, designed for metabolomics and LC-MS data processing. It provides tools for feature detection, alignment, consensus building, and interactive visualization of mass spectrometry datasets. It is designed for DDA data; functionality for DIA and ZTScan DIA data is present but hidden.
 Most core processing functions are derived from OpenMS. We use the same nomenclature and refer to their documentation for an explanation of the parameters. To a large extent, however, you should be able to use the defaults (=no parameters) when calling processing steps.
-## Features
-- **Mass spectrometry data processing**: Support for multiple file formats (.wiff, .mzML, .raw, .mzpkl)
-- **Feature detection and alignment**: Automated chromatographic peak detection and retention time alignment
-- **Consensus feature building**: Identification of features across multiple samples
-- **Interactive visualizations**: 2D plots, chromatograms, and statistical dashboards
-- **Batch processing**: Process entire studies with multiple samples
-- **Export capabilities**: MGF export for spectral library searches
+This is a poorly documented, stable branch of the development codebase in use in the Zamboni lab. Novel functionalities will be added based on need and requests.
 ## Installation
@@ -749,9 +742,7 @@ Most core processing functions are derived from OpenMS. We use the same nomencla
 pip install masster
 ```
-## Quick Start
-### Basic Workflow
+### Basic workflow for analyzing an LC-MS study with two or more samples
 ```python
 import masster
@@ -769,146 +760,19 @@ study.align(rt_max_diff=2.0)
 study.find_consensus(min_samples=3)
 # Retrieve missing data for quantification
-study.fill_chrom_parallel()
+study.fill_chrom(abs_)
 # Integrate according to consensus metadata
 study.integrate_chrom()
-# link MS2 across the whole study
+# link MS2 across the whole study and export them
 study.find_ms2()
-# Export MGF file
 study.export_mgf()
-# Save the study
+# Save the study to .study5
 study.save()
 ```
-### Single Sample Processing
-```python
-from masster.sample import Sample
-# Load a single sample (mzML, RAW, WIFF)
-sample = Sample("path/to/your/file.mzML")
-# Detect features
-sample.find_features(chrom_peak_snr=10, noise=500, chrom_fwhm=1.0)
-# Detect adducts
-sample.find_adducts()
-# Find MS2 spectra
-sample.find_ms2()
-# Save results
-sample.save()
-```
-## Visualization Examples
-Masster provides extensive plotting capabilities for data exploration and quality control:
-### 2D Data Visualization
-```python
-# Plot 2D overview of MS data with detected features
-sample.plot_2d(
-    filename="overview_2d.html",
-    show_features=True,
-    show_ms2=True,
-    title="MS Data Overview"
-)
-# Plot with feature filtering
-sample.plot_2d(
-    filename="features_ms2_only.html",
-    show_only_features_with_ms2=True,
-    markersize=8
-)
-```
-### Study-Level Plots
-```python
-# Plot features from multiple samples
-study.plot_samples_2d(
-    samples=None,  # Use all samples
-    filename="multi_sample_overview.html",
-    markersize=3,
-    alpha_max=0.8
-)
-# Plot consensus features
-study.plot_consensus_2d(
-    filename="consensus_features.html",
-    colorby="number_samples",
-    sizeby="inty_mean"
-)
-# Plot chromatograms for specific features
-study.plot_chrom(
-    uids=[1, 2, 3],  # Feature UIDs
-    filename="chromatograms.html",
-    aligned=True
-)
-```
-### Quality Control Plots
-```python
-# Plot DDA acquisition statistics
-sample.plot_dda_stats(filename="dda_stats.html")
-# Plot feature statistics
-sample.plot_feature_stats(filename="feature_stats.html")
-# Plot total ion chromatogram
-sample.plot_tic(filename="tic.html")
-```
-### Advanced Plotting Options
-```python
-# Plot with Oracle annotation data
-sample.plot_2d_oracle(
-    oracle_folder="path/to/oracle/results",
-    colorby="hg",  # Color by chemical class
-    filename="annotated_features.html"
-)
-# Plot MS2 cycle view
-sample.plot_ms2_cycle(
-    cycle=100,
-    filename="ms2_cycle.html",
-    centroid=True
-)
-# Plot extracted ion chromatogram
-sample.plot_eic(
-    feature_uid=123,
-    rt_tol=10,
-    mz_tol=0.005,
-    filename="eic.html"
-)
-```
-## File Format Support
-- **Input formats**: .wiff, .mzML, .raw files
-- **Intermediate formats**: .sample5 and .study5 (HDF5) for fast loading
-- **Export formats**: .mgf, .csv
-- **Visualization**: .html (interactive), .png, .svg
-## Advanced Features
-### Batch Processing
-Use the command-line interface for processing multiple files:
-```bash
-python -m masster.demo.example_batch_process input_directory --recursive --dest output_directory
-```
 ## Requirements
 - Python ≥ 3.11
@@ -919,13 +783,6 @@ python -m masster.demo.example_batch_process input_directory --recursive --dest
 GNU Affero General Public License v3
-## Contributing
-Contributions are welcome! Please see our contributing guidelines and code of conduct.
 ## Citation
-If you use Masster in your research, please cite:
-```
-[Citation details to be added]
-```
+If you use Masster in your research, please cite this repository.

{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 masster/__init__.py,sha256=xeh-hwR_2umE0CpRXn8t22wbkt4IT-FBEzeJknL8J6c,670
-masster/_version.py,sha256=yivSeSaLoFmSzFJ3xhHAIjpI_6_SVEIqEZxVZ-NVYPU,239
+masster/_version.py,sha256=-QmvlpTZa_4FtjijQydS9z8bCyNLc0Gv3QiTHg5Ncro,239
 masster/chromatogram.py,sha256=f25rMrNvCQN0A93wp9QPdG3H4FiOlYPbRY3H4yd7Q5Y,18910
 masster/logger.py,sha256=9uzuVEPwQkVlnsqT_eVvh33FZY_FIm3Wn2TaJcGhZP8,10674
 masster/spectrum.py,sha256=XiClDcN1uiG-_2TIr7Bqp7x8gWvHPbC5oh3zUu3fr6Y,46789
@@ -26,14 +26,14 @@ masster/sample/defaults/get_spectrum_def.py,sha256=hy3t3zbIVvKRQmVQl8xAXrmQ4LSDb
 masster/sample/defaults/sample_def.py,sha256=WHjw-jsYinPKCC02J2Fn5SGB2OW12ntEQn-sHmqESqs,13758
 masster/study/__init__.py,sha256=bTbxmTgBAL_1iB73JE8fKdo9wik9m4dcmMppElU0V18,157
 masster/study/export.py,sha256=xmT2WhAuSGGcqHw8Wa44r6g5ud1mzzywOc3TnNqNh8E,12624
-masster/study/h5.py,sha256=IwNvqgFw9aRMH6tgfxotE5gb0i_ug0siIal0im_v3mk,30762
+masster/study/h5.py,sha256=BPpcEV_fZ3dJCEkzEga_V1zUkKQEj_kxAeMSF56sSts,39260
 masster/study/helpers.py,sha256=ePh5hPgSAgfu7-crsm4th0QYGeQbHk9kNj7OyHMclpQ,15860
-masster/study/load.py,sha256=SptaAH3L1jAk_tbSY6WpuLeekrcqjIL5HuF2NH5cfQc,42626
+masster/study/load.py,sha256=rTmm5E-UsTg0SJqwa4i4II5ca82m8OEn05yWW2G_YPc,38718
 masster/study/parameters.py,sha256=iKCIf7_bivi0Jkz4hreKmCyusXpQX5IIuuhnmS52-Q4,3177
 masster/study/plot.py,sha256=nY6zWKUOhlyDHra4BI0c8dx7PX5fHFW8v2Ma9YpscvU,21437
 masster/study/processing.py,sha256=PjfpsVASaR0uSE4vqKzBppq4jM3HexzbGw_bn5kDwdA,42552
-masster/study/save.py,sha256=hfbYoGMaBwKPvoTm5eV3OJoSw7o3Rbed68S4RaEz1I8,5053
-masster/study/study.py,sha256=9n-u_7mNynDOTAjwN_sm6AixpApKLVoImeNF56ryIQ4,20382
+masster/study/save.py,sha256=_DmnAwhlZQRNeVDLNER63pXVhinV-poKMvJlIz6Bt-Y,4791
+masster/study/study.py,sha256=gXc1j4wljbw-Zx-JPsyYO86EoXPaR0N7D2GepJZOPhA,21530
 masster/study/study5_schema.json,sha256=7LfsgI-dZGpoaPiAy0kh6gDJL4yKuA7-7PHbo9j4A6E,4630
 masster/study/defaults/__init__.py,sha256=wkul1Qq83nPHI5XebWvu3yKjp5tF8OdZDJJho8r2_qA,569
 masster/study/defaults/align_def.py,sha256=8Itwit6gaqVhF9A3w9V-uqgKlcQE6uCXyC3ul_gPWFo,8872
@@ -43,8 +43,8 @@ masster/study/defaults/find_consensus_def.py,sha256=artvErq4w07SfHB0WHi68ZjxGg0X
 masster/study/defaults/find_ms2_def.py,sha256=k-GmnCKgQuVO6M-EAjzGOqgdFrqZviRaNAdiFmwVujY,4907
 masster/study/defaults/integrate_chrom_def.py,sha256=FY9QdJpdWe18sYucrwNKoZYY0eoOo0a_hcdkZHm_W00,7107
 masster/study/defaults/study_def.py,sha256=SzUzd2YTGDGCHNMR-Dw57j5PprEnPhpITonv7wx6HQA,9035
-masster-0.2.1.dist-info/METADATA,sha256=b2SrmjarfUyfV7Vh5FHIZEgZRH_R9rKa2VleZbs7EoQ,47257
-masster-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-masster-0.2.1.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
-masster-0.2.1.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-masster-0.2.1.dist-info/RECORD,,
+masster-0.2.3.dist-info/METADATA,sha256=hYc0JozT_r5KPMj4znX9ee0omRbd1p8sK9SU9OaIEm8,44324
+masster-0.2.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+masster-0.2.3.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+masster-0.2.3.dist-info/licenses/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+masster-0.2.3.dist-info/RECORD,,

{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{masster-0.2.1.dist-info → masster-0.2.3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

masster 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

Potentially problematic release.

masster 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl