masster 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/h5.py +18 -2
- masster/sample/sample5_schema.json +76 -58
- masster/study/h5.py +317 -138
- masster/study/helpers.py +6 -39
- masster/study/load.py +23 -134
- masster/study/study.py +29 -11
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/METADATA +31 -55
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/RECORD +12 -12
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/WHEEL +0 -0
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/entry_points.txt +0 -0
- {masster-0.2.0.dist-info → masster-0.2.2.dist-info}/licenses/LICENSE +0 -0
masster/study/h5.py
CHANGED
@@ -2,25 +2,7 @@
 _study_h5.py
 
 This module provides HDF5-based save/load functionality for the Study class.
-It handles
-# elif col == "spectrum":
-# Handle single Spectrum objects
-data_as_str = []
-for item in data:
-    if item is not None:
-        data_as_str.append(item.to_json())
-    else:
-        data_as_str.append("None")
-group.create_dataset(col, data=data_as_str, **optimal_compression)hromatogram objects
-data_as_str = []
-for item in data:
-    if item is not None:
-        data_as_str.append(item.to_json())
-    else:
-        data_as_str.append("None")
-group.create_dataset(col, data=data_as_str, **optimal_compression) else:
-    data_as_str.append("null")
-group.create_dataset(col, data=data_as_str, **optimal_compression)n and deserialization of Polars DataFrames with complex objects
+It handles serialization and deserialization of Polars DataFrames with complex objects
 like Chromatogram and Spectrum instances.
 
 Key Features:
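The removed lines above were leftover merge debris in the 0.2.0 module docstring; 0.2.2 restores the intended sentence. For readers unfamiliar with the pattern those fragments came from, here is a minimal, hypothetical sketch of storing an object column (Spectrum/Chromatogram) as JSON strings in a compressed HDF5 dataset. The `save_object_column` helper and the `optimal_compression` settings are illustrative assumptions, not masster's actual internals.

import h5py

# Hypothetical sketch of the object-column pattern visible in the removed
# fragments: objects exposing .to_json() are serialized to a string dataset.
optimal_compression = {"compression": "gzip", "compression_opts": 4}  # assumed settings

def save_object_column(group: h5py.Group, col: str, data: list) -> None:
    data_as_str = []
    for item in data:
        if item is not None:
            data_as_str.append(item.to_json())  # e.g. Spectrum/Chromatogram -> JSON
        else:
            data_as_str.append("None")  # sentinel string for missing objects
    # h5py stores a list of Python strings as a variable-length string dataset
    group.create_dataset(col, data=data_as_str, **optimal_compression)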
@@ -449,7 +431,7 @@ def _save_study5(self, filename=None):
     if not filename.endswith(".study5"):
         filename += ".study5"
 
-    self.logger.
+    self.logger.info(f"Saving study to {filename}")
 
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -529,8 +511,7 @@ def _save_study5(self, filename=None):
         data = consensus_ms2[col] if dtype == "object" else consensus_ms2[col].to_list()
         _save_dataframe_column(consensus_ms2_group, col, data, dtype, self.logger)
 
-    self.logger.
-    self.logger.info(f"Study saved to {filename}")
+    self.logger.debug(f"Save completed for {filename}")
 
 
 def _load_study5(self, filename=None):
@@ -552,6 +533,11 @@ def _load_study5(self, filename=None):
     - Properly handles MS2 scan lists and spectrum lists
     - Restores parameters dictionary from JSON serialization
     """
+    from datetime import datetime
+    from tqdm import tqdm
+
+    self.logger.info(f"Loading study from {filename}")
+
     # Handle default filename
     if filename is None:
         if self.default_folder is not None:
@@ -574,134 +560,327 @@ def _load_study5(self, filename=None):
     if not schema:
         self.logger.warning(f"Schema file {schema_path} not found. Using default types.")
 
-
-
-
-
-
-
-
-
-
-
-
-
-                    if isinstance(parameters_data, bytes):
-                        parameters_data = parameters_data.decode("utf-8")
+    # Define loading steps for progress tracking
+    loading_steps = [
+        "metadata",
+        "samples_df",
+        "features_df",
+        "consensus_df",
+        "consensus_mapping_df",
+        "consensus_ms2"
+    ]
+
+    # Check if progress bar should be disabled based on log level
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
-
-
-
+    with h5py.File(filename, "r") as f:
+        # Use progress bar to show loading progress
+        with tqdm(
+            total=len(loading_steps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
+            disable=tdqm_disable,
+        ) as pbar:
+
+            # Load metadata
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+            if "metadata" in f:
+                metadata = f["metadata"]
+                self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+                if hasattr(self, "label"):
+                    self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+                # Load parameters from JSON
+                if "parameters" in metadata:
+                    try:
+                        parameters_data = metadata["parameters"][()]
+                        if isinstance(parameters_data, bytes):
+                            parameters_data = parameters_data.decode("utf-8")
+
+                        if parameters_data and parameters_data != "":
+                            self.history = json.loads(parameters_data)
+                        else:
+                            self.history = {}
+                    except (json.JSONDecodeError, ValueError, TypeError) as e:
+                        self.logger.warning(f"Failed to deserialize parameters: {e}")
                         self.history = {}
-
-                        self.logger.warning(f"Failed to deserialize parameters: {e}")
+                else:
                     self.history = {}
-            else:
-                self.history = {}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Reconstruct self.parameters from loaded history
+            from masster.study.defaults.study_def import study_defaults
+
+            # Always create a fresh study_defaults object to ensure we have all defaults
+            self.parameters = study_defaults()
+
+            # Update parameters from loaded history if available
+            if self.history and "study" in self.history:
+                study_params = self.history["study"]
+                if isinstance(study_params, dict):
+                    failed_params = self.parameters.set_from_dict(study_params, validate=False)
+                    if failed_params:
+                        self.logger.debug(f"Could not set study parameters: {failed_params}")
+                    else:
+                        self.logger.debug("Successfully updated parameters from loaded history")
                 else:
-                    self.logger.debug("
+                    self.logger.debug("Study parameters in history are not a valid dictionary")
             else:
-                self.logger.debug("
+                self.logger.debug("No study parameters found in history, using defaults")
+
+            # Synchronize instance attributes with parameters (similar to __init__)
+            # Note: default_folder and label are already loaded from metadata attributes above
+            # but we ensure they match the parameters for consistency
+            if hasattr(self.parameters, 'default_folder') and self.parameters.default_folder is not None:
+                self.default_folder = self.parameters.default_folder
+            if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+                self.label = self.parameters.label
+            if hasattr(self.parameters, 'log_level'):
+                self.log_level = self.parameters.log_level
+            if hasattr(self.parameters, 'log_label'):
+                self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
+            if hasattr(self.parameters, 'log_sink'):
+                self.log_sink = self.parameters.log_sink
+            pbar.update(1)
+
+            # Load samples_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+            if "samples" in f and len(f["samples"].keys()) > 0:
+                self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                # Initialize empty samples_df with the correct schema if no data exists
+                self.logger.debug("No samples data found in study5 file. Initializing empty samples_df.")
+                self.samples_df = pl.DataFrame(
+                    {
+                        "sample_uid": [],
+                        "sample_name": [],
+                        "sample_path": [],
+                        "sample_type": [],
+                        "size": [],
+                        "map_id": [],
+                    },
+                    schema={
+                        "sample_uid": pl.Int64,
+                        "sample_name": pl.Utf8,
+                        "sample_path": pl.Utf8,
+                        "sample_type": pl.Utf8,
+                        "size": pl.Int64,
+                        "map_id": pl.Utf8,
+                    },
+                )
+            pbar.update(1)
+
+            # Load features_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+            if "features" in f and len(f["features"].keys()) > 0:
+                object_columns = ["chrom", "ms2_scans", "ms2_specs"]
+                self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+            else:
+                self.features_df = None
+            pbar.update(1)
+
+            # Load consensus_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+            if "consensus" in f and len(f["consensus"].keys()) > 0:
+                self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger)
+            else:
+                self.consensus_df = None
+            pbar.update(1)
+
+            # Load consensus_mapping_df
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+                self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+            else:
+                self.consensus_mapping_df = None
+            pbar.update(1)
+
+            # Load consensus_ms2
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
+            if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
+                object_columns = ["spec"]
+                self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+            else:
+                self.consensus_ms2 = None
+            pbar.update(1)
+
+    self.logger.info(f"Study loaded from {filename}")
+
+
+def _load_h5(self, filename=None):
+    """
+    Load Study instance data from a legacy .h5 HDF5 file with progress tracking.
+
+    This is a legacy method for loading older HDF5 format files. For new files,
+    use _load_study5() which has improved schema handling and performance.
+
+    Args:
+        filename (str, optional): Path to the .h5 HDF5 file to load. If None, uses default.
+
+    Returns:
+        None (modifies self in place)
+
+    Notes:
+        - Legacy format loader with basic DataFrame reconstruction
+        - Includes progress bar for loading steps
+        - For new projects, prefer _load_study5() method
+    """
+    from datetime import datetime
+    from tqdm import tqdm
+
+    # Handle default filename
+    if filename is None:
+        if self.default_folder is not None:
+            filename = os.path.join(self.default_folder, "study.h5")
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            self.logger.error("Either filename or default_folder must be provided")
+            return
+
+    # Add .h5 extension if not provided
+    if not filename.endswith(".h5"):
+        filename += ".h5"
+
+    if not os.path.exists(filename):
+        self.logger.error(f"File {filename} does not exist")
+        return
+
+    # Define loading steps for progress tracking
+    loading_steps = [
+        "metadata",
+        "samples_df",
+        "features_df",
+        "consensus_df",
+        "consensus_mapping_df"
+    ]
+
+    # Check if progress bar should be disabled based on log level
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    with h5py.File(filename, "r") as f:
+        # Use progress bar to show loading progress
+        with tqdm(
+            total=len(loading_steps),
+            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading legacy study",
+            disable=tdqm_disable,
+        ) as pbar:
+
+            # Load metadata
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+            if "metadata" in f:
+                metadata = f["metadata"]
+                self.default_folder = _decode_bytes_attr(metadata.attrs.get("default_folder", ""))
+                if hasattr(self, "label"):
+                    self.label = _decode_bytes_attr(metadata.attrs.get("label", ""))
+
+                # Load parameters from JSON if available
+                if "parameters" in metadata:
+                    try:
+                        parameters_data = metadata["parameters"][()]
+                        if isinstance(parameters_data, bytes):
+                            parameters_data = parameters_data.decode("utf-8")
+
+                        if parameters_data and parameters_data != "":
+                            self.history = json.loads(parameters_data)
+                        else:
+                            self.history = {}
+                    except (json.JSONDecodeError, ValueError, TypeError) as e:
+                        self.logger.warning(f"Failed to deserialize parameters: {e}")
+                        self.history = {}
+                else:
+                    self.history = {}
+            pbar.update(1)
+
+            # Load samples_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+            if "samples" in f and len(f["samples"].keys()) > 0:
+                samples_data = {}
+                for col in f["samples"].keys():
+                    column_data = f["samples"][col][:]
+                    # Handle byte strings
+                    if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                        column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                    samples_data[col] = column_data
+
+                if samples_data:
+                    self.samples_df = pl.DataFrame(samples_data)
+                else:
+                    # Initialize empty samples_df
+                    self.samples_df = pl.DataFrame({
+                        "sample_uid": [],
+                        "sample_name": [],
+                        "sample_path": [],
+                        "sample_type": [],
+                        "size": [],
+                        "map_id": [],
+                    })
+            else:
+                self.samples_df = pl.DataFrame({
                     "sample_uid": [],
                     "sample_name": [],
                     "sample_path": [],
                     "sample_type": [],
                     "size": [],
                     "map_id": [],
-                }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                })
+            pbar.update(1)
+
+            # Load features_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+            if "features" in f and len(f["features"].keys()) > 0:
+                features_data = {}
+                for col in f["features"].keys():
+                    column_data = f["features"][col][:]
+                    # Handle special object columns
+                    if col in ["chrom", "ms2_specs"]:
+                        reconstructed_data = _reconstruct_object_column(column_data, col)
+                        features_data[col] = reconstructed_data
+                    else:
+                        # Handle byte strings
+                        if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                            column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                        features_data[col] = column_data
+
+                if features_data:
+                    # Create DataFrame with Object columns handled properly
+                    object_columns = ["chrom", "ms2_specs"]
+                    self.features_df = _create_dataframe_with_objects(features_data, object_columns)
+                else:
+                    self.features_df = None
+            else:
+                self.features_df = None
+            pbar.update(1)
+
+            # Load consensus_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+            if "consensus" in f and len(f["consensus"].keys()) > 0:
+                consensus_data = {}
+                for col in f["consensus"].keys():
+                    column_data = f["consensus"][col][:]
+                    # Handle byte strings
+                    if len(column_data) > 0 and isinstance(column_data[0], bytes):
+                        column_data = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
+                    consensus_data[col] = column_data
+
+                if consensus_data:
+                    self.consensus_df = pl.DataFrame(consensus_data)
+                else:
+                    self.consensus_df = None
+            else:
+                self.consensus_df = None
+            pbar.update(1)
+
+            # Load consensus_mapping_df (legacy format)
+            pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+            if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
+                mapping_data = {}
+                for col in f["consensus_mapping"].keys():
+                    column_data = f["consensus_mapping"][col][:]
+                    mapping_data[col] = column_data
+
+                if mapping_data:
+                    self.consensus_mapping_df = pl.DataFrame(mapping_data)
+                else:
+                    self.consensus_mapping_df = None
+            else:
+                self.consensus_mapping_df = None
+            pbar.update(1)
 
-    self.logger.info(f"
+    self.logger.info(f"Legacy study loaded from {filename}")
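Both loaders now share the same progress-reporting pattern: a tqdm bar whose description mimics the logger's "timestamp | LEVEL | label" line format and which is silenced whenever the configured log level would not show INFO output (the release spells the flag `tdqm_disable`). A standalone sketch of that pattern; the `log_level` and `log_label` values here are assumptions for illustration, not masster's configuration API.

from datetime import datetime
from tqdm import tqdm

log_level = "INFO"      # assumed configuration value
log_label = "study | "  # assumed logger prefix

steps = ["metadata", "samples_df", "features_df"]
disable = log_level not in ["TRACE", "DEBUG", "INFO"]  # hide the bar for quiet levels

with tqdm(total=len(steps), disable=disable) as pbar:
    for step in steps:
        # Match the logger's "timestamp | LEVEL | label" line format
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        pbar.set_description(f"{ts} | INFO | {log_label}Loading {step}")
        # ... load the step here ...
        pbar.update(1)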
masster/study/helpers.py
CHANGED
@@ -114,45 +114,6 @@ def get_chrom(self, uids=None, samples=None):
     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
     return df2_pivoted
 
-'''
-def migrate_adduct_columns(self):
-    """
-    Migrate adduct_right and adduct_mass_right columns to adduct and adduct_mass.
-    This fixes an issue where join operations created _right suffixed columns.
-    """
-    if self.features_df.is_empty():
-        return
-
-    # Check if we have the _right suffixed columns
-    has_adduct_right = "adduct_right" in self.features_df.columns
-    has_adduct_mass_right = "adduct_mass_right" in self.features_df.columns
-    has_adduct = "adduct" in self.features_df.columns
-    has_adduct_mass = "adduct_mass" in self.features_df.columns
-
-    if has_adduct_right or has_adduct_mass_right:
-        self.logger.info("Migrating adduct column names...")
-
-        # Start with all columns except those we're replacing/dropping
-        columns_to_keep = [
-            col
-            for col in self.features_df.columns
-            if col not in ["adduct_right", "adduct_mass_right", "adduct", "adduct_mass"]
-        ]
-
-        # Add the migrated columns
-        if has_adduct_right:
-            columns_to_keep.append(pl.col("adduct_right").alias("adduct"))
-        if has_adduct_mass_right:
-            columns_to_keep.append(pl.col("adduct_mass_right").alias("adduct_mass"))
-
-        # Apply the migration
-        self.features_df = self.features_df.select(columns_to_keep)
-
-        self.logger.success("Adduct column migration completed.")
-    else:
-        self.logger.info("No adduct column migration needed.")
-'''
-
 def set_default_folder(self, folder):
     """
     Set the default folder for saving and loading files.
@@ -448,6 +409,12 @@ def _get_sample_uids(self, samples=None, seed=42):
     sample_uids = list(set(sample_uids))
     return sample_uids
 
+def get_orphans(self):
+    """
+    Get all features that are not in the consensus mapping.
+    """
+    not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+    return not_in_consensus
 
 def compress(self):
     """