masster 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.

Potentially problematic release: this version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -59,10 +59,10 @@ def _decode_bytes_attr(attr_value):
  def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=10000):
  """
  Save an entire DataFrame to HDF5 with optimized batch processing and memory efficiency.
-
+
  This function replaces individual column processing with batch operations for much
  better performance on large datasets (300+ samples).
-
+
  Args:
  df: Polars DataFrame to save
  group: HDF5 group to save to
@@ -73,17 +73,17 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
  """
  if df is None or df.is_empty():
  return
-
+
  try:
  # Reorder columns according to schema
  df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
  total_rows = len(df_ordered)
-
+
  # Group columns by processing type for batch optimization
  numeric_cols = []
  string_cols = []
  object_cols = []
-
+
  for col in df_ordered.columns:
  dtype = str(df_ordered[col].dtype).lower()
  if dtype == "object":
@@ -92,23 +92,25 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
  string_cols.append(col)
  else:
  numeric_cols.append(col)
-
- logger.debug(f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns")
-
+
+ logger.debug(
+ f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+ )
+
  # Process numeric columns in batch (most efficient)
  if numeric_cols:
  for col in numeric_cols:
  _save_numeric_column_fast(group, col, df_ordered[col], logger)
-
- # Process string columns in batch
+
+ # Process string columns in batch
  if string_cols:
  for col in string_cols:
  _save_string_column_fast(group, col, df_ordered[col], logger)
-
+
  # Process object columns with optimized serialization
  if object_cols:
  _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
-
+
  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
  # Fallback to old method for safety
@@ -119,20 +121,20 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  """Fast numeric column saving with optimal compression."""
  try:
  import numpy as np
-
+
  # Get compression settings based on column name
  if col in ["consensus_uid", "feature_uid", "scan_id", "rt", "mz", "intensity"]:
  compression_kwargs = {"compression": "lzf", "shuffle": True}
  else:
  compression_kwargs = {"compression": "lzf"}
-
+
  # Convert to numpy array efficiently
  try:
  data_array = data_series.to_numpy()
  except Exception:
  # Fallback for complex data types
  data_array = np.array(data_series.to_list())
-
+
  # Handle None/null values efficiently
  if data_array.dtype == object:
  # Check if this is actually a list/array column that should be treated as object
@@ -141,13 +143,13 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  if val is not None:
  sample_value = val
  break
-
+
  # If sample value is a list/array, treat as object column
  if isinstance(sample_value, (list, tuple, np.ndarray)):
  logger.debug(f"Column '{col}' contains array-like data, treating as object")
  _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
  return
-
+
  # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
  try:
  data_array = np.array([(-123 if x is None else float(x)) for x in data_array])
@@ -156,9 +158,9 @@ def _save_numeric_column_fast(group, col, data_series, logger):
  logger.debug(f"Column '{col}' is not numeric, treating as object")
  _save_dataframe_column_legacy_single(group, col, data_series.to_list(), "object", logger)
  return
-
+
  group.create_dataset(col, data=data_array, **compression_kwargs)
-
+
  except Exception as e:
  logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
  # Fallback to old method
@@ -170,10 +172,10 @@ def _save_string_column_fast(group, col, data_series, logger):
  try:
  # Convert to string array efficiently
  string_data = ["None" if x is None else str(x) for x in data_series.to_list()]
-
+
  compression_kwargs = {"compression": "gzip", "compression_opts": 6}
  group.create_dataset(col, data=string_data, **compression_kwargs)
-
+
  except Exception as e:
  logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
  # Fallback to old method
@@ -183,11 +185,11 @@ def _save_string_column_fast(group, col, data_series, logger):
  def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  """Optimized object column processing with chunking and parallel serialization."""
  import json
-
+
  def serialize_chunk(col_name, chunk_data):
  """Serialize a chunk of object data."""
  serialized_chunk = []
-
+
  if col_name == "chrom":
  # Handle Chromatogram objects
  for item in chunk_data:
@@ -233,19 +235,19 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  logger.warning(f"Unknown object column '{col_name}', using default serialization")
  for item in chunk_data:
  serialized_chunk.append(str(item) if item is not None else "None")
-
+
  return serialized_chunk
-
+
  # Process each object column
  for col in object_cols:
  try:
  data_list = df[col].to_list()
  total_items = len(data_list)
-
+
  if total_items == 0:
  group.create_dataset(col, data=[], compression="gzip", compression_opts=6)
  continue
-
+
  # For small datasets, process directly
  if total_items <= chunk_size:
  serialized_data = serialize_chunk(col, data_list)
@@ -253,19 +255,19 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  else:
  # For large datasets, use chunked processing with parallel serialization
  logger.debug(f"Processing large object column '{col}' with {total_items} items in chunks")
-
+
  all_serialized = []
  num_chunks = (total_items + chunk_size - 1) // chunk_size
-
+
  # Use thread pool for parallel serialization of chunks
  with ThreadPoolExecutor(max_workers=min(4, num_chunks)) as executor:
  futures = {}
-
+
  for i in range(0, total_items, chunk_size):
- chunk = data_list[i:i + chunk_size]
+ chunk = data_list[i : i + chunk_size]
  future = executor.submit(serialize_chunk, col, chunk)
  futures[future] = i
-
+
  # Collect results in order
  results = {}
  for future in as_completed(futures):
@@ -274,18 +276,20 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
  chunk_result = future.result()
  results[chunk_start] = chunk_result
  except Exception as e:
- logger.warning(f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}")
+ logger.warning(
+ f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}"
+ )
  # Fallback to simple string conversion for this chunk
- chunk = data_list[chunk_start:chunk_start + chunk_size]
+ chunk = data_list[chunk_start : chunk_start + chunk_size]
  results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]
-
+
  # Reassemble in correct order
  for i in range(0, total_items, chunk_size):
  if i in results:
  all_serialized.extend(results[i])
-
+
  group.create_dataset(col, data=all_serialized, compression="gzip", compression_opts=6)
-
+
  except Exception as e:
  logger.warning(f"Failed to save object column '{col}' with optimization: {e}")
  # Fallback to old method
@@ -430,7 +434,9 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
  data_as_str.append("None")
  group.create_dataset(col, data=data_as_str, compression=compression)
  else:
- logger.warning(f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.")
+ logger.warning(
+ f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column."
+ )
  elif dtype == "string":
  # Handle string columns
  string_data = ["None" if x is None else str(x) for x in data]
@@ -479,6 +485,7 @@ def _reconstruct_object_column(data_col, col_name: str):
  # Handle non-string data (e.g., float32 NaN from corrupted compression)
  if not isinstance(item, str):
  import numpy as np
+
  if isinstance(item, (float, np.floating)) and np.isnan(item):
  reconstructed_data.append(None)
  continue
@@ -594,16 +601,16 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  expected_length = None
  if regular_data:
  for values in regular_data.values():
- if values is not None and hasattr(values, '__len__'):
+ if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
  break
-
+
  if expected_length is None and object_data:
  for values in object_data.values():
- if values is not None and hasattr(values, '__len__'):
+ if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
  break
-
+
  if expected_length is None:
  expected_length = 0

@@ -611,7 +618,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  for col in object_columns:
  if col in object_data:
  values = object_data[col]
- if values is None or (hasattr(values, '__len__') and len(values) == 0):
+ if values is None or (hasattr(values, "__len__") and len(values) == 0):
  object_data[col] = [None] * expected_length
  # print(f"DEBUG: Fixed object column '{col}' to have length {expected_length}")

@@ -624,12 +631,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns([pl.Series(col, values, dtype=pl.List(pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64)
- ])))])
+ df = df.with_columns([
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
+ ),
+ ),
+ ])
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -640,12 +655,20 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
  # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
  if col == "adducts":
  # Handle adducts as List(Struct) - now contains dicts
- df = df.with_columns([pl.Series(col, values, dtype=pl.List(pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64)
- ])))])
+ df = df.with_columns([
+ pl.Series(
+ col,
+ values,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
+ ),
+ ),
+ ])
  else:
  # Other object columns stay as Object
  df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -713,11 +736,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  # Determine expected DataFrame length from loaded columns
  expected_length = None
  for col, values in data.items():
- if values is not None and hasattr(values, '__len__'):
+ if values is not None and hasattr(values, "__len__"):
  expected_length = len(values)
  logger.debug(f"Determined expected_length={expected_length} from loaded column '{col}'")
  break
-
+
  # If no data loaded yet, try HDF5 columns directly
  if expected_length is None:
  hdf5_columns = list(group.keys())
@@ -727,7 +750,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  expected_length = len(col_data)
  logger.debug(f"Determined expected_length={expected_length} from HDF5 column '{col}'")
  break
-
+
  # Default to 0 if no data found
  if expected_length is None:
  expected_length = 0
@@ -747,25 +770,25 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  # Check for columns in HDF5 file that are not in schema (for backward compatibility)
  hdf5_columns = list(group.keys())
  extra_columns = [col for col in hdf5_columns if col not in (schema_columns or [])]
-
+
  for col in extra_columns:
  logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
  column_data = group[col][:]
-
+
  # Try to determine if this should be treated as an object column
  # by checking if the data looks like JSON strings
  if len(column_data) > 0 and isinstance(column_data[0], bytes):
  try:
  # Check if it looks like JSON
- test_decode = column_data[0].decode('utf-8')
- if test_decode.startswith('[') or test_decode.startswith('{'):
+ test_decode = column_data[0].decode("utf-8")
+ if test_decode.startswith("[") or test_decode.startswith("{"):
  # Looks like JSON, treat as object column
  data[col] = _reconstruct_object_column(column_data, col)
  if col not in object_columns:
  object_columns.append(col)
  else:
  # Regular string data
- data[col] = [item.decode('utf-8') if isinstance(item, bytes) else item for item in column_data]
+ data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
  except Exception:
  # If decoding fails, treat as regular data
  data[col] = column_data
@@ -784,7 +807,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
  dtype_str = schema[df_name]["columns"][col]["dtype"]
  should_be_string = dtype_str == "pl.Utf8"
-
+
  if should_be_string:
  processed_values = []
  for val in values:
@@ -815,11 +838,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
  def _save_study5_compressed(self, filename=None):
  """
  Compressed save identical to _save_study5 but skips serialization of chrom and ms2_specs columns in features_df.
-
+
  This version maintains full compatibility with _load_study5() while providing performance benefits
  by skipping the serialization of heavy object columns (chrom and ms2_specs) in features_df.
  """
-
+
  # if no extension is given, add .study5
  if not filename.endswith(".study5"):
  filename += ".study5"
@@ -849,18 +872,17 @@ def _save_study5_compressed(self, filename=None):
  dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-
+
  total_steps = len(dataframes_to_save) + 1 # +1 for metadata
-
+
  # Show progress for large saves
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-
+
  with tqdm(
  total=total_steps,
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Fast saving study",
  disable=tdqm_disable,
  ) as pbar:
-
  # Create groups for organization
  metadata_group = f.create_group("metadata")
  features_group = f.create_group("features")
@@ -883,9 +905,11 @@ def _save_study5_compressed(self, filename=None):
  metadata_group.create_dataset("parameters", data="")
  else:
  metadata_group.create_dataset("parameters", data="")
-
+
  pbar.update(1)
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+ )

  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
@@ -896,7 +920,9 @@ def _save_study5_compressed(self, filename=None):

  # Store features_df - use fast method that skips chrom and ms2_specs columns
  if self.features_df is not None and not self.features_df.is_empty():
- self.logger.debug(f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)")
+ self.logger.debug(
+ f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)"
+ )
  _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
  pbar.update(1)

@@ -932,10 +958,10 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
  def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_size=10000):
  """
  Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
-
+
  This function is identical to _save_dataframe_optimized but excludes heavy object columns
  (chrom and ms2_specs) when saving features_df to improve performance.
-
+
  Args:
  df: Polars DataFrame to save
  group: HDF5 group to save to
@@ -946,24 +972,24 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
  """
  if df is None or df.is_empty():
  return
-
+
  try:
  # Reorder columns according to schema
  df_ordered = _reorder_columns_by_schema(df.clone(), schema, df_name)
-
+
  # Skip chrom and ms2_specs columns for features_df
  if df_name == "features_df":
  skip_columns = ["chrom", "ms2_specs"]
  df_ordered = df_ordered.select([col for col in df_ordered.columns if col not in skip_columns])
  logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
-
+
  total_rows = len(df_ordered)
-
+
  # Group columns by processing type for batch optimization
  numeric_cols = []
  string_cols = []
  object_cols = []
-
+
  for col in df_ordered.columns:
  dtype = str(df_ordered[col].dtype).lower()
  if dtype == "object":
@@ -972,23 +998,25 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
  string_cols.append(col)
  else:
  numeric_cols.append(col)
-
- logger.debug(f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns")
-
+
+ logger.debug(
+ f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+ )
+
  # Process numeric columns in batch (most efficient)
  if numeric_cols:
  for col in numeric_cols:
  _save_numeric_column_fast(group, col, df_ordered[col], logger)
-
- # Process string columns in batch
+
+ # Process string columns in batch
  if string_cols:
  for col in string_cols:
  _save_string_column_fast(group, col, df_ordered[col], logger)
-
+
  # Process object columns with optimized serialization
  if object_cols:
  _save_object_columns_optimized(group, df_ordered, object_cols, logger, chunk_size)
-
+
  except Exception as e:
  logger.error(f"Failed to save DataFrame {df_name}: {e}")
  # Fallback to old method for safety
@@ -1054,18 +1082,17 @@ def _save_study5(self, filename=None):
  dataframes_to_save.append(("consensus_mapping", len(self.consensus_mapping_df)))
  if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
  dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-
+
  total_steps = len(dataframes_to_save) + 1 # +1 for metadata
-
+
  # Show progress for large saves
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
-
+
  with tqdm(
  total=total_steps,
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving study",
  disable=tdqm_disable,
  ) as pbar:
-
  # Create groups for organization
  metadata_group = f.create_group("metadata")
  features_group = f.create_group("features")
@@ -1088,9 +1115,11 @@ def _save_study5(self, filename=None):
  metadata_group.create_dataset("parameters", data="")
  else:
  metadata_group.create_dataset("parameters", data="")
-
+
  pbar.update(1)
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+ )

  # Store samples_df - use optimized batch processing
  if self.samples_df is not None and not self.samples_df.is_empty():
@@ -1099,7 +1128,7 @@ def _save_study5(self, filename=None):
  _save_dataframe_optimized(self.samples_df, samples_group, schema, "samples_df", self.logger)
  pbar.update(1)

- # Store features_df - use optimized batch processing
+ # Store features_df - use optimized batch processing
  if self.features_df is not None and not self.features_df.is_empty():
  self.logger.debug(f"Saving features_df with {len(self.features_df)} rows using optimized method")
  _save_dataframe_optimized(self.features_df, features_group, schema, "features_df", self.logger)
@@ -1154,7 +1183,7 @@ def _load_study5(self, filename=None):
  - Properly handles MS2 scan lists and spectrum lists
  - Restores parameters dictionary from JSON serialization
  """
-
+
  self.logger.info(f"Loading study from {filename}")

  # Handle default filename
@@ -1182,26 +1211,26 @@ def _load_study5(self, filename=None):
  # Define loading steps for progress tracking
  loading_steps = [
  "metadata",
- "samples_df",
+ "samples_df",
  "features_df",
  "consensus_df",
  "consensus_mapping_df",
- "consensus_ms2"
+ "consensus_ms2",
  ]
-
+
  # Check if progress bar should be disabled based on log level
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

  # Define loading steps for progress tracking
  loading_steps = [
  "metadata",
- "samples_df",
+ "samples_df",
  "features_df",
  "consensus_df",
  "consensus_mapping_df",
- "consensus_ms2"
+ "consensus_ms2",
  ]
-
+
  # Check if progress bar should be disabled based on log level
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

@@ -1212,9 +1241,10 @@ def _load_study5(self, filename=None):
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading study",
  disable=tdqm_disable,
  ) as pbar:
-
  # Load metadata
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata"
+ )
  if "metadata" in f:
  metadata = f["metadata"]
  self.folder = _decode_bytes_attr(metadata.attrs.get("folder", ""))
@@ -1240,10 +1270,10 @@ def _load_study5(self, filename=None):

  # Reconstruct self.parameters from loaded history
  from masster.study.defaults.study_def import study_defaults
-
+
  # Always create a fresh study_defaults object to ensure we have all defaults
  self.parameters = study_defaults()
-
+
  # Update parameters from loaded history if available
  if self.history and "study" in self.history:
  study_params = self.history["study"]
@@ -1257,24 +1287,26 @@ def _load_study5(self, filename=None):
  self.logger.debug("Study parameters in history are not a valid dictionary")
  else:
  self.logger.debug("No study parameters found in history, using defaults")
-
+
  # Synchronize instance attributes with parameters (similar to __init__)
  # Note: folder and label are already loaded from metadata attributes above
  # but we ensure they match the parameters for consistency
- if hasattr(self.parameters, 'folder') and self.parameters.folder is not None:
+ if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
  self.folder = self.parameters.folder
- if hasattr(self.parameters, 'label') and self.parameters.label is not None:
+ if hasattr(self.parameters, "label") and self.parameters.label is not None:
  self.label = self.parameters.label
- if hasattr(self.parameters, 'log_level'):
+ if hasattr(self.parameters, "log_level"):
  self.log_level = self.parameters.log_level
- if hasattr(self.parameters, 'log_label'):
+ if hasattr(self.parameters, "log_label"):
  self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
- if hasattr(self.parameters, 'log_sink'):
+ if hasattr(self.parameters, "log_sink"):
  self.log_sink = self.parameters.log_sink
  pbar.update(1)

  # Load samples_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+ )
  if "samples" in f and len(f["samples"].keys()) > 0:
  self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
@@ -1306,7 +1338,9 @@ def _load_study5(self, filename=None):
  )
  pbar.update(1)
  # Load samples_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+ )
  if "samples" in f and len(f["samples"].keys()) > 0:
  self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
  else:
@@ -1339,66 +1373,92 @@ def _load_study5(self, filename=None):
  pbar.update(1)

  # Load features_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features"
+ )
  if "features" in f and len(f["features"].keys()) > 0:
  object_columns = ["chrom", "ms2_scans", "ms2_specs"]
- self.features_df = _load_dataframe_from_group(f["features"], schema, "features_df", self.logger, object_columns)
+ self.features_df = _load_dataframe_from_group(
+ f["features"], schema, "features_df", self.logger, object_columns
+ )
  else:
  self.features_df = None
  pbar.update(1)

  # Load consensus_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus"
+ )
  if "consensus" in f and len(f["consensus"].keys()) > 0:
  # Only include adducts in object_columns if it actually exists in the file
  object_columns = []
  if "adducts" in f["consensus"]:
  object_columns.append("adducts")
-
- self.consensus_df = _load_dataframe_from_group(f["consensus"], schema, "consensus_df", self.logger, object_columns)
-
+
+ self.consensus_df = _load_dataframe_from_group(
+ f["consensus"], schema, "consensus_df", self.logger, object_columns
+ )
+
  # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
  if self.consensus_df is not None:
  if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
  self.logger.info("Adding missing 'adducts' column for backward compatibility")
  empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]
-
+
  # If column exists but is Null, drop it first
  if "adducts" in self.consensus_df.columns:
  self.consensus_df = self.consensus_df.drop("adducts")
-
+
  self.consensus_df = self.consensus_df.with_columns([
- pl.Series("adducts", empty_adducts, dtype=pl.List(pl.Struct([
- pl.Field("adduct", pl.Utf8),
- pl.Field("count", pl.Int64),
- pl.Field("percentage", pl.Float64),
- pl.Field("mass", pl.Float64)
- ])))
+ pl.Series(
+ "adducts",
+ empty_adducts,
+ dtype=pl.List(
+ pl.Struct([
+ pl.Field("adduct", pl.Utf8),
+ pl.Field("count", pl.Int64),
+ pl.Field("percentage", pl.Float64),
+ pl.Field("mass", pl.Float64),
+ ]),
+ ),
+ ),
  ])
  else:
  self.consensus_df = None
  pbar.update(1)

  # Load consensus_mapping_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+ )
  if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+ self.consensus_mapping_df = _load_dataframe_from_group(
+ f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+ )
  else:
  self.consensus_mapping_df = None
  pbar.update(1)
  # Load consensus_mapping_df
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+ )
  if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
- self.consensus_mapping_df = _load_dataframe_from_group(f["consensus_mapping"], schema, "consensus_mapping_df", self.logger)
+ self.consensus_mapping_df = _load_dataframe_from_group(
+ f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+ )
  else:
  self.consensus_mapping_df = None
  pbar.update(1)

  # Load consensus_ms2
- pbar.set_description(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2")
+ pbar.set_description(
+ f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2"
+ )
  if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
  object_columns = ["spec"]
- self.consensus_ms2 = _load_dataframe_from_group(f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns)
+ self.consensus_ms2 = _load_dataframe_from_group(
+ f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns
+ )
  else:
  self.consensus_ms2 = None
  pbar.update(1)
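
The changes above are line-wrapping and formatting only, but the functions they touch follow one recurring pattern: columns are grouped by dtype and written to HDF5 with a per-type strategy (LZF-compressed numeric datasets, gzip-compressed string datasets, serialized object columns). The sketch below is a minimal standalone condensation of that pattern for readers who want to see it without the logging, chunking, and fallback paths. It is not the packaged implementation: the function name save_dataframe_grouped, the simplified dtype checks, and the demo.h5 example file are illustrative assumptions.

# Minimal sketch (not the packaged code) of the dtype-grouped HDF5 save pattern
# visible in _save_dataframe_optimized: pick a storage strategy per column.
import json

import h5py
import numpy as np
import polars as pl


def save_dataframe_grouped(df: pl.DataFrame, group: h5py.Group) -> None:
    """Write each DataFrame column to its own HDF5 dataset, chosen by dtype."""
    for col in df.columns:
        dtype = str(df[col].dtype).lower()
        if dtype == "object":
            # Object columns: serialize each value to a string (assumes JSON-friendly data).
            data = [json.dumps(v) if v is not None else "None" for v in df[col].to_list()]
            group.create_dataset(col, data=data, compression="gzip", compression_opts=6)
        elif "str" in dtype or "utf8" in dtype:
            # String columns: store as strings with gzip compression.
            data = ["None" if v is None else str(v) for v in df[col].to_list()]
            group.create_dataset(col, data=data, compression="gzip", compression_opts=6)
        else:
            # Numeric columns: store as a numpy array with fast LZF compression.
            arr = np.asarray(df[col].to_numpy(), dtype=float)
            group.create_dataset(col, data=arr, compression="lzf", shuffle=True)


if __name__ == "__main__":
    # Hypothetical example; column names and the output file are illustrative only.
    df = pl.DataFrame({"mz": [100.05, 200.10], "label": ["feat_1", "feat_2"]})
    with h5py.File("demo.h5", "w") as f:
        save_dataframe_grouped(df, f.create_group("features"))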