masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster has been flagged as possibly problematic by the registry diff service.
- masster/chromatogram.py +2 -2
- masster/data/libs/urine.csv +3 -3
- masster/logger.py +8 -8
- masster/sample/adducts.py +337 -263
- masster/sample/defaults/find_adducts_def.py +21 -8
- masster/sample/h5.py +557 -278
- masster/sample/helpers.py +131 -75
- masster/sample/lib.py +2 -2
- masster/sample/load.py +25 -11
- masster/sample/plot.py +5 -5
- masster/sample/processing.py +115 -85
- masster/sample/sample.py +28 -15
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +34 -11
- masster/spectrum.py +2 -2
- masster/study/defaults/align_def.py +5 -1
- masster/study/defaults/identify_def.py +3 -1
- masster/study/defaults/study_def.py +58 -25
- masster/study/export.py +354 -204
- masster/study/h5.py +557 -155
- masster/study/helpers.py +487 -194
- masster/study/id.py +536 -347
- masster/study/load.py +228 -138
- masster/study/plot.py +68 -68
- masster/study/processing.py +455 -253
- masster/study/save.py +14 -4
- masster/study/study.py +122 -40
- masster/study/study5_schema.json +149 -149
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0
masster/study/h5.py
CHANGED
@@ -109,7 +109,13 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
 
         # Process object columns with optimized serialization
         if object_cols:
-            _save_object_columns_optimized(
+            _save_object_columns_optimized(
+                group,
+                df_ordered,
+                object_cols,
+                logger,
+                chunk_size,
+            )
 
     except Exception as e:
         logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -146,17 +152,33 @@ def _save_numeric_column_fast(group, col, data_series, logger)
 
         # If sample value is a list/array, treat as object column
         if isinstance(sample_value, (list, tuple, np.ndarray)):
-            logger.debug(
-
+            logger.debug(
+                f"Column '{col}' contains array-like data, treating as object",
+            )
+            _save_dataframe_column_legacy_single(
+                group,
+                col,
+                data_series.to_list(),
+                "object",
+                logger,
+            )
             return
 
         # Otherwise, convert None values to -123 sentinel for mixed-type numeric columns
         try:
-            data_array = np.array(
+            data_array = np.array(
+                [(-123 if x is None else float(x)) for x in data_array],
+            )
         except (ValueError, TypeError):
             # If conversion fails, this is not a numeric column
             logger.debug(f"Column '{col}' is not numeric, treating as object")
-            _save_dataframe_column_legacy_single(
+            _save_dataframe_column_legacy_single(
+                group,
+                col,
+                data_series.to_list(),
+                "object",
+                logger,
+            )
             return
 
         group.create_dataset(col, data=data_array, **compression_kwargs)
@@ -164,7 +186,13 @@ def _save_numeric_column_fast(group, col, data_series, logger)
     except Exception as e:
         logger.warning(f"Failed to save numeric column '{col}' efficiently: {e}")
         # Fallback to old method
-        _save_dataframe_column_legacy_single(
+        _save_dataframe_column_legacy_single(
+            group,
+            col,
+            data_series.to_list(),
+            str(data_series.dtype),
+            logger,
+        )
 
 
 def _save_string_column_fast(group, col, data_series, logger):
@@ -179,7 +207,13 @@ def _save_string_column_fast(group, col, data_series, logger)
     except Exception as e:
         logger.warning(f"Failed to save string column '{col}' efficiently: {e}")
         # Fallback to old method
-        _save_dataframe_column_legacy_single(
+        _save_dataframe_column_legacy_single(
+            group,
+            col,
+            data_series.to_list(),
+            "string",
+            logger,
+        )
 
 
 def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
@@ -232,7 +266,9 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size)
                 else:
                     serialized_chunk.append("None")
             else:
-                logger.warning(
+                logger.warning(
+                    f"Unknown object column '{col_name}', using default serialization",
+                )
                 for item in chunk_data:
                     serialized_chunk.append(str(item) if item is not None else "None")
 
@@ -245,16 +281,28 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size)
             total_items = len(data_list)
 
             if total_items == 0:
-                group.create_dataset(
+                group.create_dataset(
+                    col,
+                    data=[],
+                    compression="gzip",
+                    compression_opts=6,
+                )
                 continue
 
             # For small datasets, process directly
             if total_items <= chunk_size:
                 serialized_data = serialize_chunk(col, data_list)
-                group.create_dataset(
+                group.create_dataset(
+                    col,
+                    data=serialized_data,
+                    compression="gzip",
+                    compression_opts=6,
+                )
             else:
                 # For large datasets, use chunked processing with parallel serialization
-                logger.debug(
+                logger.debug(
+                    f"Processing large object column '{col}' with {total_items} items in chunks",
+                )
 
                 all_serialized = []
                 num_chunks = (total_items + chunk_size - 1) // chunk_size
@@ -281,28 +329,58 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size)
                         )
                         # Fallback to simple string conversion for this chunk
                         chunk = data_list[chunk_start : chunk_start + chunk_size]
-                        results[chunk_start] = [
+                        results[chunk_start] = [
+                            str(item) if item is not None else "None"
+                            for item in chunk
+                        ]
 
                 # Reassemble in correct order
                 for i in range(0, total_items, chunk_size):
                     if i in results:
                         all_serialized.extend(results[i])
 
-                group.create_dataset(
+                group.create_dataset(
+                    col,
+                    data=all_serialized,
+                    compression="gzip",
+                    compression_opts=6,
+                )
 
         except Exception as e:
-            logger.warning(
+            logger.warning(
+                f"Failed to save object column '{col}' with optimization: {e}",
+            )
             # Fallback to old method
-            _save_dataframe_column_legacy_single(
+            _save_dataframe_column_legacy_single(
+                group,
+                col,
+                df[col].to_list(),
+                "object",
+                logger,
+            )
 
 
-def _save_dataframe_column_legacy_single(
+def _save_dataframe_column_legacy_single(
+    group,
+    col: str,
+    data,
+    dtype: str,
+    logger,
+    compression="gzip",
+):
     """Legacy single column save method for fallback."""
     # This is the original _save_dataframe_column method for compatibility
     return _save_dataframe_column_legacy(group, col, data, dtype, logger, compression)
 
 
-def _save_dataframe_column_legacy(
+def _save_dataframe_column_legacy(
+    group,
+    col: str,
+    data,
+    dtype: str,
+    logger,
+    compression="gzip",
+):
     """
     Save a single DataFrame column to an HDF5 group with optimized compression.
 
@@ -327,7 +405,10 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
 
     # Optimized compression configuration
     COMPRESSION_CONFIG = {
-        "fast_access": {
+        "fast_access": {
+            "compression": "lzf",
+            "shuffle": True,
+        },  # Fast I/O for IDs, rt, mz
         "numeric": {"compression": "lzf"},  # Standard numeric data
         "string": {"compression": "gzip", "compression_opts": 6},  # String data
         "json": {"compression": "gzip", "compression_opts": 6},  # JSON objects
@@ -350,11 +431,22 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
             return COMPRESSION_CONFIG["fast_access"]
 
         # JSON object columns (complex serialized data)
-        elif column_name in [
+        elif column_name in [
+            "spectrum",
+            "chromatogram",
+            "chromatograms",
+            "ms2_specs",
+            "chrom",
+        ]:
             return COMPRESSION_CONFIG["json"]
 
         # String/text columns
-        elif data_type in ["string", "object"] and column_name in [
+        elif data_type in ["string", "object"] and column_name in [
+            "sample_name",
+            "file_path",
+            "label",
+            "file_type",
+        ]:
             return COMPRESSION_CONFIG["string"]
 
         # Large bulk numeric data
@@ -524,12 +616,16 @@ def _reconstruct_object_column(data_col, col_name: str)
             for adduct_row in adducts_list:
                 if len(adduct_row) >= 3:
                     # Convert from [adduct, count, percentage] to dict structure
-                    converted_adducts.append(
-
-
-
-
-
+                    converted_adducts.append(
+                        {
+                            "adduct": str(adduct_row[0]),
+                            "count": int(float(adduct_row[1])),
+                            "percentage": float(adduct_row[2]),
+                            "mass": float(adduct_row[3])
+                            if len(adduct_row) > 3
+                            else 0.0,
+                        },
+                    )
             reconstructed_data.append(converted_adducts)
         else:
             # Unknown object column
@@ -544,9 +640,14 @@ def _clean_string_nulls(df: pl.DataFrame) -> pl.DataFrame
     """Convert string null representations to proper nulls."""
     for col in df.columns:
         if df[col].dtype == pl.Utf8:
-            df = df.with_columns(
-
-
+            df = df.with_columns(
+                [
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ],
+            )
     return df
 
 
@@ -587,7 +688,11 @@ def _apply_schema_casting(df: pl.DataFrame, schema: dict, df_name: str) -> pl.Da
     return df
 
 
-def _reorder_columns_by_schema(
+def _reorder_columns_by_schema(
+    df: pl.DataFrame,
+    schema: dict,
+    df_name: str,
+) -> pl.DataFrame:
     """Reorder DataFrame columns to match schema order."""
     if df_name not in schema or "columns" not in schema[df_name]:
         return df
@@ -641,20 +746,24 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
             # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
             if col == "adducts":
                 # Handle adducts as List(Struct) - now contains dicts
-                df = df.with_columns(
-
-
-
-
-                    pl.
-                    pl.
-
-
-
-
+                df = df.with_columns(
+                    [
+                        pl.Series(
+                            col,
+                            values,
+                            dtype=pl.List(
+                                pl.Struct(
+                                    [
+                                        pl.Field("adduct", pl.Utf8),
+                                        pl.Field("count", pl.Int64),
+                                        pl.Field("percentage", pl.Float64),
+                                        pl.Field("mass", pl.Float64),
+                                    ],
+                                ),
+                            ),
+                        ),
                         ),
-
-
+                    ],
+                )
             else:
                 # Other object columns stay as Object
                 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -665,20 +774,24 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
             # print(f"DEBUG: Creating object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
             if col == "adducts":
                 # Handle adducts as List(Struct) - now contains dicts
-                df = df.with_columns(
-
-
-
-
-                    pl.
-                    pl.
-
-
-
-
+                df = df.with_columns(
+                    [
+                        pl.Series(
+                            col,
+                            values,
+                            dtype=pl.List(
+                                pl.Struct(
+                                    [
+                                        pl.Field("adduct", pl.Utf8),
+                                        pl.Field("count", pl.Int64),
+                                        pl.Field("percentage", pl.Float64),
+                                        pl.Field("mass", pl.Float64),
+                                    ],
+                                ),
+                            ),
+                        ),
                         ),
-
-
+                    ],
+                )
             else:
                 # Other object columns stay as Object
                 df = df.with_columns([pl.Series(col, values, dtype=pl.Object)])
@@ -686,7 +799,13 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
     return df
 
 
-def _load_dataframe_from_group(
+def _load_dataframe_from_group(
+    group,
+    schema: dict,
+    df_name: str,
+    logger,
+    object_columns: list | None = None,
+) -> pl.DataFrame:
     """Load a DataFrame from HDF5 group using schema."""
     if object_columns is None:
         object_columns = []
@@ -700,7 +819,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     )
     schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
     logger.debug(f"Schema section for {df_name}: {schema_section}")
-    schema_columns =
+    schema_columns = (
+        schema_section.get("columns", []) if isinstance(schema_section, dict) else []
+    )
     logger.debug(f"Schema columns for {df_name}: {schema_columns}")
     if schema_columns is None:
         schema_columns = []
@@ -723,7 +844,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     effective_columns = hdf5_columns.copy()
     for old_name, new_name in column_migrations.items():
         if old_name in effective_columns:
-            logger.info(
+            logger.info(
+                f"Will migrate column '{old_name}' to '{new_name}' for backward compatibility",
+            )
             # Add the new name to effective columns and optionally remove old name
             effective_columns.append(new_name)
 
@@ -788,7 +911,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     for col, values in data.items():
         if values is not None and hasattr(values, "__len__"):
             expected_length = len(values)
-            logger.debug(
+            logger.debug(
+                f"Determined expected_length={expected_length} from loaded column '{col}'",
+            )
             break
 
     # If no data loaded yet, try HDF5 columns directly
@@ -798,7 +923,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
             col_data = group[col][:]
             if expected_length is None:
                 expected_length = len(col_data)
-                logger.debug(
+                logger.debug(
+                    f"Determined expected_length={expected_length} from HDF5 column '{col}'",
+                )
                 break
 
     # Default to 0 if no data found
@@ -812,26 +939,38 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
-            logger.debug(
+            logger.debug(
+                f"Created missing object column '{col}' with length {expected_length}",
+            )
         else:
             # Provide specific default values for new columns for backward compatibility
             if df_name == "samples_df":
                 if col == "sample_group":
                     data[col] = [""] * expected_length  # Empty string default
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing column '{col}' with empty string defaults",
+                    )
                 elif col == "sample_batch":
                     data[col] = [1] * expected_length  # Batch 1 default
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing column '{col}' with batch 1 defaults",
+                    )
                 elif col == "sample_sequence":
                     # Create increasing sequence numbers
                     data[col] = list(range(1, expected_length + 1))
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing column '{col}' with sequence 1-{expected_length}",
+                    )
                 else:
                     data[col] = [None] * expected_length
-                    logger.debug(
+                    logger.debug(
+                        f"Created missing regular column '{col}' with length {expected_length}",
+                    )
             else:
                 data[col] = [None] * expected_length
-                logger.debug(
+                logger.debug(
+                    f"Created missing regular column '{col}' with length {expected_length}",
+                )
 
     # Check for columns in HDF5 file that are not in schema (for backward compatibility)
     # But skip the old column names we already migrated
@@ -845,7 +984,11 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     }
     migrated_old_names = set(column_migrations.keys())
 
-    extra_columns = [
+    extra_columns = [
+        col
+        for col in hdf5_columns
+        if col not in (schema_columns or []) and col not in migrated_old_names
+    ]
 
     for col in extra_columns:
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -864,7 +1007,10 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
                     object_columns.append(col)
                 else:
                     # Regular string data
-                    data[col] = [
+                    data[col] = [
+                        item.decode("utf-8") if isinstance(item, bytes) else item
+                        for item in column_data
+                    ]
             except Exception:
                 # If decoding fails, treat as regular data
                 data[col] = column_data
@@ -877,10 +1023,19 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Handle byte string conversion for non-object columns
    # Only convert to strings for columns that should actually be strings
     for col, values in data.items():
-        if
+        if (
+            col not in object_columns
+            and values is not None
+            and len(values) > 0
+            and isinstance(values[0], bytes)
+        ):
            # Check schema to see if this should be a string column
            should_be_string = False
-            if
+            if (
+                df_name in schema
+                and "columns" in schema[df_name]
+                and col in schema[df_name]["columns"]
+            ):
                dtype_str = schema[df_name]["columns"][col]["dtype"]
                should_be_string = dtype_str == "pl.Utf8"
 
@@ -898,7 +1053,9 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
         logger.debug(f"Creating DataFrame with object columns: {object_columns}")
         for col in object_columns:
             if col in data:
-                logger.debug(
+                logger.debug(
+                    f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
+                )
         df = _create_dataframe_with_objects(data, object_columns)
     else:
         df = pl.DataFrame(data)
@@ -944,19 +1101,34 @@ def _save_study5_compressed(self, filename)
         dataframes_to_save.append(("features", len(self.features_df)))
     if self.consensus_df is not None and not self.consensus_df.is_empty():
         dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if
-
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
+        dataframes_to_save.append(
+            ("consensus_mapping", len(self.consensus_mapping_df)),
+        )
     if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if
+    if (
+        hasattr(self, "lib_df")
+        and self.lib_df is not None
+        and not self.lib_df.is_empty()
+    ):
         dataframes_to_save.append(("lib", len(self.lib_df)))
-    if
+    if (
+        hasattr(self, "id_df")
+        and self.id_df is not None
+        and not self.id_df.is_empty()
+    ):
         dataframes_to_save.append(("id", len(self.id_df)))
 
     total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
     # Show progress for large saves
-    tdqm_disable =
+    tdqm_disable = (
+        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+    )
 
     with tqdm(
         total=total_steps,
@@ -974,8 +1146,14 @@ def _save_study5_compressed(self, filename)
 
             # Store metadata
             metadata_group.attrs["format"] = "master-study-1"
-            metadata_group.attrs["folder"] =
-
+            metadata_group.attrs["folder"] = (
+                str(self.folder) if self.folder is not None else ""
+            )
+            metadata_group.attrs["label"] = (
+                str(self.label)
+                if hasattr(self, "label") and self.label is not None
+                else ""
+            )
 
             # Store parameters as JSON
             if hasattr(self, "parameters") and self.history is not None:
@@ -996,8 +1174,16 @@ def _save_study5_compressed(self, filename)
             # Store samples_df - use optimized batch processing
             if self.samples_df is not None and not self.samples_df.is_empty():
                 samples_group = f.create_group("samples")
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.samples_df,
+                    samples_group,
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
             pbar.update(1)
 
             # Store features_df - use fast method that skips chrom and ms2_specs columns
@@ -1005,50 +1191,115 @@ def _save_study5_compressed(self, filename)
                 self.logger.debug(
                     f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
                 )
-                _save_dataframe_optimized_fast(
+                _save_dataframe_optimized_fast(
+                    self.features_df,
+                    features_group,
+                    schema,
+                    "features_df",
+                    self.logger,
+                )
             pbar.update(1)
 
             # Store consensus_df - use optimized batch processing
             if self.consensus_df is not None and not self.consensus_df.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_df,
+                    consensus_group,
+                    schema,
+                    "consensus_df",
+                    self.logger,
+                )
            pbar.update(1)
 
             # Store consensus_mapping_df - keep existing fast method
-            if
+            if (
+                self.consensus_mapping_df is not None
+                and not self.consensus_mapping_df.is_empty()
+            ):
                consensus_mapping = self.consensus_mapping_df.clone()
-                self.logger.debug(
+                self.logger.debug(
+                    f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
+                )
                for col in consensus_mapping.columns:
                    try:
                        data = consensus_mapping[col].to_numpy()
                        # Use LZF compression for consensus mapping data
-                        consensus_mapping_group.create_dataset(
+                        consensus_mapping_group.create_dataset(
+                            col,
+                            data=data,
+                            compression="lzf",
+                            shuffle=True,
+                        )
                    except Exception as e:
-                        self.logger.warning(
+                        self.logger.warning(
+                            f"Failed to save column '{col}' in consensus_mapping_df: {e}",
+                        )
            pbar.update(1)
 
            # Store consensus_ms2 - use optimized batch processing
            if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_ms2,
+                    consensus_ms2_group,
+                    schema,
+                    "consensus_ms2",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store lib_df - library data
-            if
-
-
+            if (
+                hasattr(self, "lib_df")
+                and self.lib_df is not None
+                and not self.lib_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.lib_df,
+                    lib_group,
+                    schema,
+                    "lib_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store id_df - identification results
-            if
-
-
+            if (
+                hasattr(self, "id_df")
+                and self.id_df is not None
+                and not self.id_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving id_df with {len(self.id_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.id_df,
+                    id_group,
+                    schema,
+                    "id_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            self.logger.debug(f"Fast save completed for {filename}")
 
 
-def _save_dataframe_optimized_fast(
+def _save_dataframe_optimized_fast(
+    df,
+    group,
+    schema,
+    df_name,
+    logger,
+    chunk_size=10000,
+):
    """
    Save DataFrame with optimized batch processing, but skip chrom and ms2_specs columns for features_df.
 
@@ -1073,7 +1324,9 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
    # Skip chrom and ms2_specs columns for features_df
    if df_name == "features_df":
        skip_columns = ["chrom", "ms2_specs"]
-        df_ordered = df_ordered.select(
+        df_ordered = df_ordered.select(
+            [col for col in df_ordered.columns if col not in skip_columns],
+        )
        logger.debug(f"Fast save: skipping columns {skip_columns} for {df_name}")
 
    total_rows = len(df_ordered)
@@ -1108,7 +1361,13 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
 
        # Process object columns with optimized serialization
        if object_cols:
-            _save_object_columns_optimized(
+            _save_object_columns_optimized(
+                group,
+                df_ordered,
+                object_cols,
+                logger,
+                chunk_size,
+            )
 
    except Exception as e:
        logger.error(f"Failed to save DataFrame {df_name}: {e}")
@@ -1173,19 +1432,34 @@ def _save_study5(self, filename)
        dataframes_to_save.append(("features", len(self.features_df)))
    if self.consensus_df is not None and not self.consensus_df.is_empty():
        dataframes_to_save.append(("consensus", len(self.consensus_df)))
-    if
-
+    if (
+        self.consensus_mapping_df is not None
+        and not self.consensus_mapping_df.is_empty()
+    ):
+        dataframes_to_save.append(
+            ("consensus_mapping", len(self.consensus_mapping_df)),
+        )
    if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
        dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
-    if
+    if (
+        hasattr(self, "lib_df")
+        and self.lib_df is not None
+        and not self.lib_df.is_empty()
+    ):
        dataframes_to_save.append(("lib", len(self.lib_df)))
-    if
+    if (
+        hasattr(self, "id_df")
+        and self.id_df is not None
+        and not self.id_df.is_empty()
+    ):
        dataframes_to_save.append(("id", len(self.id_df)))
 
    total_steps = len(dataframes_to_save) + 1  # +1 for metadata
 
    # Show progress for large saves
-    tdqm_disable =
+    tdqm_disable = (
+        self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
+    )
 
    with tqdm(
        total=total_steps,
@@ -1203,8 +1477,14 @@ def _save_study5(self, filename)
 
            # Store metadata
            metadata_group.attrs["format"] = "master-study-1"
-            metadata_group.attrs["folder"] =
-
+            metadata_group.attrs["folder"] = (
+                str(self.folder) if self.folder is not None else ""
+            )
+            metadata_group.attrs["label"] = (
+                str(self.label)
+                if hasattr(self, "label") and self.label is not None
+                else ""
+            )
 
            # Store parameters as JSON
            if hasattr(self, "parameters") and self.history is not None:
@@ -1225,51 +1505,119 @@ def _save_study5(self, filename)
            # Store samples_df - use optimized batch processing
            if self.samples_df is not None and not self.samples_df.is_empty():
                samples_group = f.create_group("samples")
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving samples_df with {len(self.samples_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.samples_df,
+                    samples_group,
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store features_df - use optimized batch processing
            if self.features_df is not None and not self.features_df.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving features_df with {len(self.features_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.features_df,
+                    features_group,
+                    schema,
+                    "features_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store consensus_df - use optimized batch processing
            if self.consensus_df is not None and not self.consensus_df.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_df with {len(self.consensus_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_df,
+                    consensus_group,
+                    schema,
+                    "consensus_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store consensus_mapping_df - keep existing fast method
-            if
+            if (
+                self.consensus_mapping_df is not None
+                and not self.consensus_mapping_df.is_empty()
+            ):
                consensus_mapping = self.consensus_mapping_df.clone()
-                self.logger.debug(
+                self.logger.debug(
+                    f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
+                )
                for col in consensus_mapping.columns:
                    try:
                        data = consensus_mapping[col].to_numpy()
                        # Use LZF compression for consensus mapping data
-                        consensus_mapping_group.create_dataset(
+                        consensus_mapping_group.create_dataset(
+                            col,
+                            data=data,
+                            compression="lzf",
+                            shuffle=True,
+                        )
                    except Exception as e:
-                        self.logger.warning(
+                        self.logger.warning(
+                            f"Failed to save column '{col}' in consensus_mapping_df: {e}",
+                        )
            pbar.update(1)
 
            # Store consensus_ms2 - use optimized batch processing
            if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
-                self.logger.debug(
-
+                self.logger.debug(
+                    f"Saving consensus_ms2 with {len(self.consensus_ms2)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.consensus_ms2,
+                    consensus_ms2_group,
+                    schema,
+                    "consensus_ms2",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store lib_df - library data
-            if
-
-
+            if (
+                hasattr(self, "lib_df")
+                and self.lib_df is not None
+                and not self.lib_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.lib_df,
+                    lib_group,
+                    schema,
+                    "lib_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            # Store id_df - identification results
-            if
-
-
+            if (
+                hasattr(self, "id_df")
+                and self.id_df is not None
+                and not self.id_df.is_empty()
+            ):
+                self.logger.debug(
+                    f"Saving id_df with {len(self.id_df)} rows using optimized method",
+                )
+                _save_dataframe_optimized(
+                    self.id_df,
+                    id_group,
+                    schema,
+                    "id_df",
+                    self.logger,
+                )
            pbar.update(1)
 
            self.logger.info(f"Study saved successfully to {filename}")
@@ -1319,7 +1667,9 @@ def _load_study5(self, filename=None)
    schema_path = os.path.join(os.path.dirname(__file__), "study5_schema.json")
    schema = _load_schema(schema_path)
    if not schema:
-        self.logger.warning(
+        self.logger.warning(
+            f"Schema file {schema_path} not found. Using default types.",
+        )
 
    # Define loading steps for progress tracking
    loading_steps = [
@@ -1393,27 +1743,48 @@ def _load_study5(self, filename=None)
            if self.history and "study" in self.history:
                study_params = self.history["study"]
                if isinstance(study_params, dict):
-                    failed_params = self.parameters.set_from_dict(
+                    failed_params = self.parameters.set_from_dict(
+                        study_params,
+                        validate=False,
+                    )
                    if failed_params:
-                        self.logger.debug(
+                        self.logger.debug(
+                            f"Could not set study parameters: {failed_params}",
+                        )
                    else:
-                        self.logger.debug(
+                        self.logger.debug(
+                            "Successfully updated parameters from loaded history",
+                        )
                else:
-                    self.logger.debug(
+                    self.logger.debug(
+                        "Study parameters in history are not a valid dictionary",
+                    )
            else:
-                self.logger.debug(
+                self.logger.debug(
+                    "No study parameters found in history, using defaults",
+                )
 
            # Synchronize instance attributes with parameters (similar to __init__)
            # Note: folder and label are already loaded from metadata attributes above
            # but we ensure they match the parameters for consistency
-            if
+            if (
+                hasattr(self.parameters, "folder")
+                and self.parameters.folder is not None
+            ):
                self.folder = self.parameters.folder
-            if
+            if (
+                hasattr(self.parameters, "label")
+                and self.parameters.label is not None
+            ):
                self.label = self.parameters.label
            if hasattr(self.parameters, "log_level"):
                self.log_level = self.parameters.log_level
            if hasattr(self.parameters, "log_label"):
-                self.log_label =
+                self.log_label = (
+                    self.parameters.log_label
+                    if self.parameters.log_label is not None
+                    else ""
+                )
            if hasattr(self.parameters, "log_sink"):
                self.log_sink = self.parameters.log_sink
            pbar.update(1)
@@ -1423,10 +1794,17 @@ def _load_study5(self, filename=None)
                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
            )
            if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
+                self.samples_df = _load_dataframe_from_group(
+                    f["samples"],
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
            else:
                # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
+                self.logger.debug(
+                    "No samples data found in study5 file. Initializing empty samples_df.",
+                )
                self.samples_df = pl.DataFrame(
                    {
                        "sample_uid": [],
@@ -1463,10 +1841,17 @@ def _load_study5(self, filename=None)
                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
            )
            if "samples" in f and len(f["samples"].keys()) > 0:
-                self.samples_df = _load_dataframe_from_group(
+                self.samples_df = _load_dataframe_from_group(
+                    f["samples"],
+                    schema,
+                    "samples_df",
+                    self.logger,
+                )
            else:
                # Initialize empty samples_df with the correct schema if no data exists
-                self.logger.debug(
+                self.logger.debug(
+                    "No samples data found in study5 file. Initializing empty samples_df.",
+                )
                self.samples_df = pl.DataFrame(
                    {
                        "sample_uid": [],
@@ -1536,28 +1921,39 @@ def _load_study5(self, filename=None)
 
            # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
            if self.consensus_df is not None:
-                if
-
-
+                if (
+                    "adducts" not in self.consensus_df.columns
+                    or self.consensus_df["adducts"].dtype == pl.Null
+                ):
+                    self.logger.info(
+                        "Adding missing 'adducts' column for backward compatibility",
+                    )
+                    empty_adducts: list[list] = [
+                        [] for _ in range(len(self.consensus_df))
+                    ]
 
                    # If column exists but is Null, drop it first
                    if "adducts" in self.consensus_df.columns:
                        self.consensus_df = self.consensus_df.drop("adducts")
 
-                    self.consensus_df = self.consensus_df.with_columns(
-
-
-
-
-                        pl.
-                        pl.
-
-
-
-
+                    self.consensus_df = self.consensus_df.with_columns(
+                        [
+                            pl.Series(
+                                "adducts",
+                                empty_adducts,
+                                dtype=pl.List(
+                                    pl.Struct(
+                                        [
+                                            pl.Field("adduct", pl.Utf8),
+                                            pl.Field("count", pl.Int64),
+                                            pl.Field("percentage", pl.Float64),
+                                            pl.Field("mass", pl.Float64),
+                                        ],
+                                    ),
+                                ),
+                            ),
                            ),
-
-
+                        ],
+                    )
            else:
                self.consensus_df = None
            pbar.update(1)
@@ -1641,8 +2037,14 @@ def _load_study5(self, filename=None)
            pbar.update(1)
 
            # Check and migrate old string-based map_id to integer indices
-            if
-                self.
+            if (
+                self.samples_df is not None
+                and not self.samples_df.is_empty()
+                and self.samples_df["map_id"].dtype == pl.Utf8
+            ):
+                self.logger.info(
+                    "Detected old string-based map_id format, migrating to integer indices",
+                )
 
                # Convert string-based map_id to integer indices
                sample_count = len(self.samples_df)
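
For orientation, the adducts column that recurs throughout these hunks is stored as a Polars List(Struct) with the fields adduct (Utf8), count (Int64), percentage (Float64), and mass (Float64). The following is a minimal, self-contained sketch of constructing such a column; the row values and the consensus_id column are hypothetical and purely illustrative, not code from masster:

    import polars as pl

    # Struct layout matching the fields used in the diff above.
    adduct_struct = pl.Struct(
        [
            pl.Field("adduct", pl.Utf8),
            pl.Field("count", pl.Int64),
            pl.Field("percentage", pl.Float64),
            pl.Field("mass", pl.Float64),
        ]
    )

    # Hypothetical example rows: one consensus feature with two adduct entries, one with none.
    adducts = pl.Series(
        "adducts",
        [
            [
                {"adduct": "[M+H]+", "count": 12, "percentage": 80.0, "mass": 1.00728},
                {"adduct": "[M+Na]+", "count": 3, "percentage": 20.0, "mass": 22.98922},
            ],
            [],
        ],
        dtype=pl.List(adduct_struct),
    )

    df = pl.DataFrame({"consensus_id": [0, 1]}).with_columns(adducts)
    print(df.schema)

Empty lists are valid values for this dtype, which is consistent with the backward-compatibility path in the diff that initializes a missing adducts column with one empty list per consensus row.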