PyPI - masster - Versions diffs - 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl - Mend

masster 0.4.4-py3-none-any.whl → 0.4.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (34) hide show

masster/chromatogram.py +2 -2
masster/data/libs/urine.csv +3 -3
masster/logger.py +8 -8
masster/sample/adducts.py +337 -263
masster/sample/defaults/find_adducts_def.py +21 -8
masster/sample/h5.py +557 -278
masster/sample/helpers.py +131 -75
masster/sample/lib.py +2 -2
masster/sample/load.py +25 -11
masster/sample/plot.py +5 -5
masster/sample/processing.py +115 -85
masster/sample/sample.py +28 -15
masster/sample/sample5_schema.json +44 -44
masster/sample/save.py +34 -11
masster/spectrum.py +2 -2
masster/study/defaults/align_def.py +5 -1
masster/study/defaults/identify_def.py +3 -1
masster/study/defaults/study_def.py +58 -25
masster/study/export.py +354 -204
masster/study/h5.py +557 -155
masster/study/helpers.py +487 -194
masster/study/id.py +536 -347
masster/study/load.py +228 -138
masster/study/plot.py +68 -68
masster/study/processing.py +455 -253
masster/study/save.py +14 -4
masster/study/study.py +122 -40
masster/study/study5_schema.json +149 -149
{masster-0.4.4.dist-info → masster-0.4.6.dist-info}/METADATA +5 -3
{masster-0.4.4.dist-info → masster-0.4.6.dist-info}/RECORD +34 -34
{masster-0.4.4.dist-info → masster-0.4.6.dist-info}/WHEEL +0 -0
{masster-0.4.4.dist-info → masster-0.4.6.dist-info}/entry_points.txt +0 -0
{masster-0.4.4.dist-info → masster-0.4.6.dist-info}/licenses/LICENSE +0 -0
{masster-0.4.4.dist-info → masster-0.4.6.dist-info}/top_level.txt +0 -0

masster/sample/h5.py CHANGED Viewed

@@ -11,7 +11,13 @@ from masster.chromatogram import Chromatogram
 from masster.spectrum import Spectrum
-def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, save_featurexml=False):
+def _save_sample5(
+    self,
+    filename=None,
+    include_ms1=True,
+    include_scans=True,
+    save_featurexml=False,
+):
     """
     Save the instance data to a sample5 HDF5 file with optimized compression.
@@ -56,14 +62,16 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
             return
     # synchronize feature_map if it exists
-    if hasattr(self, '_feature_map') and self._feature_map is not None:
+    if hasattr(self, "_feature_map") and self._feature_map is not None:
         self._features_sync()
     # if no extension is given, add .sample5
     if not filename.endswith(".sample5"):
         filename += ".sample5"
-    self.logger.debug(f"Saving sample to {filename} with optimized LZF+shuffle compression")
+    self.logger.debug(
+        f"Saving sample to {filename} with optimized LZF+shuffle compression",
+    )
     # delete existing file if it exists
     if os.path.exists(filename):
@@ -116,12 +124,18 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
                     except Exception:
                         try:
                             # Try to convert to numeric using numpy
-                            numeric_data = np.array([
-                                float(x)
-                                if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
-                                else np.nan
-                                for x in data
-                            ])
+                            numeric_data = np.array(
+                                [
+                                    float(x)
+                                    if x is not None
+                                    and str(x)
+                                    .replace(".", "")
+                                    .replace("-", "")
+                                    .isdigit()
+                                    else np.nan
+                                    for x in data
+                                ],
+                            )
                             if not np.isnan(numeric_data).all():
                                 scans_group.create_dataset(
                                     col,
@@ -149,7 +163,12 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
                             )
                             scans_group[col].attrs["dtype"] = "string_repr"
                 else:
-                    scans_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
+                    scans_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="lzf",
+                        shuffle=True,
+                    )
                     scans_group[col].attrs["dtype"] = "native"
             scans_group.attrs["columns"] = list(scans_df.columns)
@@ -226,7 +245,12 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
                     data = features[col].to_list()
                     # convert None to 'None' strings
                     data = ["None" if x is None else x for x in data]
-                    features_group.create_dataset(col, data=data, compression="lzf", shuffle=True)
+                    features_group.create_dataset(
+                        col,
+                        data=data,
+                        compression="lzf",
+                        shuffle=True,
+                    )
                 else:
                     try:
                         data = features[col].to_numpy()
@@ -261,16 +285,18 @@ def _save_sample5(self, filename=None, include_ms1=True, include_scans=True, sav
         feature_map = self._get_feature_map()
         if feature_map is not None:
             # Temporarily set features for save operation
-            old_features = getattr(self, '_oms_features_map', None)
+            old_features = getattr(self, "_oms_features_map", None)
             self._oms_features_map = feature_map
             try:
-                self._save_featureXML(filename=filename.replace(".sample5", ".featureXML"))
+                self._save_featureXML(
+                    filename=filename.replace(".sample5", ".featureXML"),
+                )
             finally:
                 # Restore original features value
                 if old_features is not None:
                     self._oms_features_map = old_features
                 else:
-                    delattr(self, '_oms_features_map')
+                    delattr(self, "_oms_features_map")
         else:
             self.logger.warning("Cannot save featureXML: no feature data available")
@@ -309,15 +335,21 @@ def _load_sample5(self, filename: str, map: bool = False):
         # Load metadata
         if "metadata" in f:
             metadata_group = f["metadata"]
-            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
+            self.file_path = decode_metadata_attr(
+                metadata_group.attrs.get("file_path", ""),
+            )
             # Load file_source if it exists, otherwise set it equal to file_path
             if "file_source" in metadata_group.attrs:
-                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
+                self.file_source = decode_metadata_attr(
+                    metadata_group.attrs.get("file_source", ""),
+                )
             else:
                 self.file_source = self.file_path
-            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
+            self.file_type = decode_metadata_attr(
+                metadata_group.attrs.get("file_type", ""),
+            )
             self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
             # Load parameters from JSON in metadata
@@ -368,19 +400,23 @@ def _load_sample5(self, filename: str, map: bool = False):
                 # Convert "None" strings and NaN values to proper null values
                 for col in self.scans_df.columns:
                     if self.scans_df[col].dtype == pl.Utf8:  # String columns
-                        self.scans_df = self.scans_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.scans_df = self.scans_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                     elif self.scans_df[col].dtype in [
                         pl.Float64,
                         pl.Float32,
                     ]:  # Float columns
-                        self.scans_df = self.scans_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.scans_df = self.scans_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
                 # update all columns with schema types
                 for col in self.scans_df.columns:
@@ -398,7 +434,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     if self.scans_df[col].dtype == pl.Utf8:
                                         # String data - convert to integer
                                         self.scans_df = self.scans_df.with_columns(
-                                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                                            pl.col(col)
+                                            .str.to_integer()
+                                            .cast(eval(dtype_str)),
                                         )
                                     elif self.scans_df[col].dtype in [
                                         pl.Float64,
@@ -418,7 +456,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     if self.scans_df[col].dtype == pl.Utf8:
                                         # String data - convert to float
                                         self.scans_df = self.scans_df.with_columns(
-                                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                                            pl.col(col)
+                                            .str.to_decimal()
+                                            .cast(eval(dtype_str)),
                                         )
                                     else:
                                         # Try direct casting
@@ -442,7 +482,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                                             self.scans_df = self.scans_df.with_columns(
                                                 pl.col(col)
                                                 .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                                    lambda x: x.decode("utf-8")
+                                                    if isinstance(x, bytes)
+                                                    else str(x),
                                                     return_dtype=pl.Utf8,
                                                 )
                                                 .cast(target_dtype),
@@ -451,7 +493,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                                             self.scans_df = self.scans_df.with_columns(
                                                 pl.col(col)
                                                 .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                                    lambda x: x.decode("utf-8")
+                                                    if isinstance(x, bytes)
+                                                    else str(x),
                                                     return_dtype=pl.Utf8,
                                                 )
                                                 .str.to_integer()
@@ -461,7 +505,9 @@ def _load_sample5(self, filename: str, map: bool = False):
                                             self.scans_df = self.scans_df.with_columns(
                                                 pl.col(col)
                                                 .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                                    lambda x: x.decode("utf-8")
+                                                    if isinstance(x, bytes)
+                                                    else str(x),
                                                     return_dtype=pl.Utf8,
                                                 )
                                                 .str.to_decimal()
@@ -490,7 +536,9 @@ def _load_sample5(self, filename: str, map: bool = False):
             if "scans_df" in schema and "columns" in schema["scans_df"]:
                 schema_column_order = list(schema["scans_df"]["columns"].keys())
                 # Only reorder columns that exist in both schema and DataFrame
-                existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
+                existing_columns = [
+                    col for col in schema_column_order if col in self.scans_df.columns
+                ]
                 if existing_columns:
                     self.scans_df = self.scans_df.select(existing_columns)
@@ -617,23 +665,29 @@ def _load_sample5(self, filename: str, map: bool = False):
                         if k in schema.get("features_df", {}).get("columns", {})
                         and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
                     }
-                    regular_columns = {k: v for k, v in data.items() if k not in object_columns}
+                    regular_columns = {
+                        k: v for k, v in data.items() if k not in object_columns
+                    }
                     # Create DataFrame with regular columns first
                     if regular_columns:
                         self.features_df = pl.DataFrame(regular_columns)
                         # Add Object columns one by one
                         for col, values in object_columns.items():
-                            self.features_df = self.features_df.with_columns([
-                                pl.Series(col, values, dtype=pl.Object),
-                            ])
+                            self.features_df = self.features_df.with_columns(
+                                [
+                                    pl.Series(col, values, dtype=pl.Object),
+                                ],
+                            )
                     else:
                         # Only Object columns
                         self.features_df = pl.DataFrame()
                         for col, values in object_columns.items():
-                            self.features_df = self.features_df.with_columns([
-                                pl.Series(col, values, dtype=pl.Object),
-                            ])
+                            self.features_df = self.features_df.with_columns(
+                                [
+                                    pl.Series(col, values, dtype=pl.Object),
+                                ],
+                            )
                 # update all columns with schema types (skip Object columns)
                 for col in self.features_df.columns:
@@ -650,16 +704,25 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     # Convert to numeric first, handling different input types
                                     if self.features_df[col].dtype == pl.Utf8:
                                         # String data - convert to integer
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col)
+                                                .str.to_integer()
+                                                .cast(eval(dtype_str)),
+                                            )
                                         )
                                     elif self.features_df[col].dtype in [
                                         pl.Float64,
                                         pl.Float32,
                                     ]:
                                         # Float data - cast to integer with null handling for NaN values
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).cast(eval(dtype_str), strict=False),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col).cast(
+                                                    eval(dtype_str),
+                                                    strict=False,
+                                                ),
+                                            )
                                         )
                                     else:
                                         # Handle special cases and try direct casting for other types
@@ -670,50 +733,70 @@ def _load_sample5(self, filename: str, map: bool = False):
                                         if "Binary" in str(current_dtype):
                                             # Convert binary to string first, then to target type
                                             if target_dtype == pl.Utf8:
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .cast(target_dtype),
                                                     )
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Int" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_integer()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_integer()
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Float" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_decimal()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_decimal()
-                                                    .cast(target_dtype),
                                                 )
                                             else:
                                                 # Try direct casting
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col).cast(target_dtype),
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col).cast(target_dtype),
+                                                    )
                                                 )
                                         else:
                                             # Try direct casting for non-binary types
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col).cast(target_dtype),
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col).cast(target_dtype),
+                                                )
                                             )
                                 elif "Float" in dtype_str:
                                     # Convert to float, handling different input types
                                     if self.features_df[col].dtype == pl.Utf8:
                                         # String data - convert to float
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col)
+                                                .str.to_decimal()
+                                                .cast(eval(dtype_str)),
+                                            )
                                         )
                                     else:
                                         # Handle special cases and try direct casting for other types
@@ -724,43 +807,59 @@ def _load_sample5(self, filename: str, map: bool = False):
                                         if "Binary" in str(current_dtype):
                                             # Convert binary to string first, then to target type
                                             if target_dtype == pl.Utf8:
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .cast(target_dtype),
                                                     )
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Int" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_integer()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_integer()
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Float" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_decimal()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_decimal()
-                                                    .cast(target_dtype),
                                                 )
                                             else:
                                                 # Try direct casting
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col).cast(target_dtype),
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col).cast(target_dtype),
+                                                    )
                                                 )
                                         else:
                                             # Try direct casting for non-binary types
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col).cast(target_dtype),
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col).cast(target_dtype),
+                                                )
                                             )
                                 elif "Utf8" in dtype_str:
                                     # Ensure it's string type
@@ -776,43 +875,59 @@ def _load_sample5(self, filename: str, map: bool = False):
                                     if "Binary" in str(current_dtype):
                                         # Convert binary to string first, then to target type
                                         if target_dtype == pl.Utf8:
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col)
-                                                .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                    return_dtype=pl.Utf8,
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col)
+                                                    .map_elements(
+                                                        lambda x: x.decode("utf-8")
+                                                        if isinstance(x, bytes)
+                                                        else str(x),
+                                                        return_dtype=pl.Utf8,
+                                                    )
+                                                    .cast(target_dtype),
                                                 )
-                                                .cast(target_dtype),
                                             )
                                         elif "Int" in str(target_dtype):
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col)
-                                                .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                    return_dtype=pl.Utf8,
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col)
+                                                    .map_elements(
+                                                        lambda x: x.decode("utf-8")
+                                                        if isinstance(x, bytes)
+                                                        else str(x),
+                                                        return_dtype=pl.Utf8,
+                                                    )
+                                                    .str.to_integer()
+                                                    .cast(target_dtype),
                                                 )
-                                                .str.to_integer()
-                                                .cast(target_dtype),
                                             )
                                         elif "Float" in str(target_dtype):
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col)
-                                                .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                    return_dtype=pl.Utf8,
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col)
+                                                    .map_elements(
+                                                        lambda x: x.decode("utf-8")
+                                                        if isinstance(x, bytes)
+                                                        else str(x),
+                                                        return_dtype=pl.Utf8,
+                                                    )
+                                                    .str.to_decimal()
+                                                    .cast(target_dtype),
                                                 )
-                                                .str.to_decimal()
-                                                .cast(target_dtype),
                                             )
                                         else:
                                             # Try direct casting
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col).cast(target_dtype),
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col).cast(target_dtype),
+                                                )
                                             )
                                     else:
                                         # Try direct casting for non-binary types
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col).cast(target_dtype),
+                                            )
                                         )
                         except Exception as e:
                             self.logger.warning(
@@ -827,23 +942,31 @@ def _load_sample5(self, filename: str, map: bool = False):
                 # This ensures "None" strings introduced by failed conversions are properly handled
                 for col in self.features_df.columns:
                     if self.features_df[col].dtype == pl.Utf8:  # String columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                     # Float columns
                     elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
-                        self.features_df = self.features_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
                 # Ensure column order matches schema order
                 if "features_df" in schema and "columns" in schema["features_df"]:
                     schema_column_order = list(schema["features_df"]["columns"].keys())
                     # Only reorder columns that exist in both schema and DataFrame
-                    existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
+                    existing_columns = [
+                        col
+                        for col in schema_column_order
+                        if col in self.features_df.columns
+                    ]
                     if existing_columns:
                         self.features_df = self.features_df.select(existing_columns)
@@ -873,13 +996,17 @@ def _load_sample5(self, filename: str, map: bool = False):
                             dtype_str = schema_columns[col]["dtype"]
                             try:
                                 if "Int" in dtype_str:
-                                    self.ms1_df = self.ms1_df.with_columns([
-                                        pl.col(col).cast(pl.Int64, strict=False),
-                                    ])
+                                    self.ms1_df = self.ms1_df.with_columns(
+                                        [
+                                            pl.col(col).cast(pl.Int64, strict=False),
+                                        ],
+                                    )
                                 elif "Float" in dtype_str:
-                                    self.ms1_df = self.ms1_df.with_columns([
-                                        pl.col(col).cast(pl.Float64, strict=False),
-                                    ])
+                                    self.ms1_df = self.ms1_df.with_columns(
+                                        [
+                                            pl.col(col).cast(pl.Float64, strict=False),
+                                        ],
+                                    )
                             except Exception as e:
                                 self.logger.warning(
                                     f"Failed to apply schema type {dtype_str} to column {col}: {e}",
@@ -948,15 +1075,21 @@ def _load_sample5_study(self, filename: str, map: bool = False):
         # Load metadata
         if "metadata" in f:
             metadata_group = f["metadata"]
-            self.file_path = decode_metadata_attr(metadata_group.attrs.get("file_path", ""))
+            self.file_path = decode_metadata_attr(
+                metadata_group.attrs.get("file_path", ""),
+            )
             # Load file_source if it exists, otherwise set it equal to file_path
             if "file_source" in metadata_group.attrs:
-                self.file_source = decode_metadata_attr(metadata_group.attrs.get("file_source", ""))
+                self.file_source = decode_metadata_attr(
+                    metadata_group.attrs.get("file_source", ""),
+                )
             else:
                 self.file_source = self.file_path
-            self.file_type = decode_metadata_attr(metadata_group.attrs.get("file_type", ""))
+            self.file_type = decode_metadata_attr(
+                metadata_group.attrs.get("file_type", ""),
+            )
             self.label = decode_metadata_attr(metadata_group.attrs.get("label", ""))
             # Load parameters from JSON in metadata
@@ -1007,19 +1140,23 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                 # Convert "None" strings and NaN values to proper null values
                 for col in self.scans_df.columns:
                     if self.scans_df[col].dtype == pl.Utf8:  # String columns
-                        self.scans_df = self.scans_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.scans_df = self.scans_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                     elif self.scans_df[col].dtype in [
                         pl.Float64,
                         pl.Float32,
                     ]:  # Float columns
-                        self.scans_df = self.scans_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.scans_df = self.scans_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
                 # update all columns with schema types
                 for col in self.scans_df.columns:
@@ -1037,7 +1174,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                     if self.scans_df[col].dtype == pl.Utf8:
                                         # String data - convert to integer
                                         self.scans_df = self.scans_df.with_columns(
-                                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                                            pl.col(col)
+                                            .str.to_integer()
+                                            .cast(eval(dtype_str)),
                                         )
                                     elif self.scans_df[col].dtype in [
                                         pl.Float64,
@@ -1057,7 +1196,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                     if self.scans_df[col].dtype == pl.Utf8:
                                         # String data - convert to float
                                         self.scans_df = self.scans_df.with_columns(
-                                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                                            pl.col(col)
+                                            .str.to_decimal()
+                                            .cast(eval(dtype_str)),
                                         )
                                     else:
                                         # Try direct casting
@@ -1081,7 +1222,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                             self.scans_df = self.scans_df.with_columns(
                                                 pl.col(col)
                                                 .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                                    lambda x: x.decode("utf-8")
+                                                    if isinstance(x, bytes)
+                                                    else str(x),
                                                     return_dtype=pl.Utf8,
                                                 )
                                                 .cast(target_dtype),
@@ -1090,7 +1233,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                             self.scans_df = self.scans_df.with_columns(
                                                 pl.col(col)
                                                 .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                                    lambda x: x.decode("utf-8")
+                                                    if isinstance(x, bytes)
+                                                    else str(x),
                                                     return_dtype=pl.Utf8,
                                                 )
                                                 .str.to_integer()
@@ -1100,7 +1245,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                             self.scans_df = self.scans_df.with_columns(
                                                 pl.col(col)
                                                 .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                                                    lambda x: x.decode("utf-8")
+                                                    if isinstance(x, bytes)
+                                                    else str(x),
                                                     return_dtype=pl.Utf8,
                                                 )
                                                 .str.to_decimal()
@@ -1129,7 +1276,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
             if "scans_df" in schema and "columns" in schema["scans_df"]:
                 schema_column_order = list(schema["scans_df"]["columns"].keys())
                 # Only reorder columns that exist in both schema and DataFrame
-                existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
+                existing_columns = [
+                    col for col in schema_column_order if col in self.scans_df.columns
+                ]
                 if existing_columns:
                     self.scans_df = self.scans_df.select(existing_columns)
@@ -1208,12 +1357,18 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                             spectrum_list = []
                                             for spec_data in json.loads(item):
                                                 if spec_data is not None:
-                                                    spectrum = Spectrum.from_json(spec_data)
+                                                    spectrum = Spectrum.from_json(
+                                                        spec_data,
+                                                    )
                                                     spectrum_list.append(spectrum)
                                                 else:
                                                     spectrum_list.append(None)
                                             reconstructed_data.append(spectrum_list)
-                                        except (json.JSONDecodeError, ValueError, TypeError):
+                                        except (
+                                            json.JSONDecodeError,
+                                            ValueError,
+                                            TypeError,
+                                        ):
                                             reconstructed_data.append(None)
                                 data[col] = reconstructed_data
@@ -1229,10 +1384,13 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                 # Separate Object columns from regular columns to avoid astuple issues
                 object_columns = {}
                 regular_columns = {}
                 for col, values in data.items():
                     if col in schema.get("features_df", {}).get("columns", {}):
-                        if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
+                        if "Object" in schema["features_df"]["columns"][col].get(
+                            "dtype",
+                            "",
+                        ):
                             object_columns[col] = values
                         else:
                             regular_columns[col] = values
@@ -1245,38 +1403,48 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                 else:
                     # If no regular columns, create empty DataFrame
                     self.features_df = pl.DataFrame()
                 # Add Object columns one by one
                 for col, values in object_columns.items():
                     if not self.features_df.is_empty():
                         self.features_df = self.features_df.with_columns(
-                            pl.Series(col, values, dtype=pl.Object).alias(col)
+                            pl.Series(col, values, dtype=pl.Object).alias(col),
                         )
                     else:
                         # Create DataFrame with just this Object column
-                        self.features_df = pl.DataFrame({col: values}, schema={col: pl.Object})
+                        self.features_df = pl.DataFrame(
+                            {col: values},
+                            schema={col: pl.Object},
+                        )
                 # Convert "None" strings and NaN values to proper null values for regular columns first
                 for col in self.features_df.columns:
                     # Skip Object columns - they're already properly reconstructed
                     if col in schema.get("features_df", {}).get("columns", {}):
-                        if "Object" in schema["features_df"]["columns"][col].get("dtype", ""):
+                        if "Object" in schema["features_df"]["columns"][col].get(
+                            "dtype",
+                            "",
+                        ):
                             continue
                     if self.features_df[col].dtype == pl.Utf8:  # String columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                     elif self.features_df[col].dtype in [
                         pl.Float64,
                         pl.Float32,
                     ]:  # Float columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
                 # update all columns with schema types
                 for col in self.features_df.columns:
@@ -1293,16 +1461,25 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                     # Convert to numeric first, handling different input types
                                     if self.features_df[col].dtype == pl.Utf8:
                                         # String data - convert to integer
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).str.to_integer().cast(eval(dtype_str)),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col)
+                                                .str.to_integer()
+                                                .cast(eval(dtype_str)),
+                                            )
                                         )
                                     elif self.features_df[col].dtype in [
                                         pl.Float64,
                                         pl.Float32,
                                     ]:
                                         # Float data - cast to integer with null handling for NaN values
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).cast(eval(dtype_str), strict=False),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col).cast(
+                                                    eval(dtype_str),
+                                                    strict=False,
+                                                ),
+                                            )
                                         )
                                     else:
                                         # Handle special cases and try direct casting for other types
@@ -1313,50 +1490,70 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                         if "Binary" in str(current_dtype):
                                             # Convert binary to string first, then to target type
                                             if target_dtype == pl.Utf8:
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .cast(target_dtype),
                                                     )
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Int" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_integer()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_integer()
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Float" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_decimal()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_decimal()
-                                                    .cast(target_dtype),
                                                 )
                                             else:
                                                 # Try direct casting
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col).cast(target_dtype),
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col).cast(target_dtype),
+                                                    )
                                                 )
                                         else:
                                             # Try direct casting for non-binary types
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col).cast(target_dtype),
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col).cast(target_dtype),
+                                                )
                                             )
                                 elif "Float" in dtype_str:
                                     # Convert to float, handling different input types
                                     if self.features_df[col].dtype == pl.Utf8:
                                         # String data - convert to float
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).str.to_decimal().cast(eval(dtype_str)),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col)
+                                                .str.to_decimal()
+                                                .cast(eval(dtype_str)),
+                                            )
                                         )
                                     else:
                                         # Handle special cases and try direct casting for other types
@@ -1367,43 +1564,59 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                         if "Binary" in str(current_dtype):
                                             # Convert binary to string first, then to target type
                                             if target_dtype == pl.Utf8:
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .cast(target_dtype),
                                                     )
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Int" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_integer()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_integer()
-                                                    .cast(target_dtype),
                                                 )
                                             elif "Float" in str(target_dtype):
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col)
-                                                    .map_elements(
-                                                        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                        return_dtype=pl.Utf8,
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col)
+                                                        .map_elements(
+                                                            lambda x: x.decode("utf-8")
+                                                            if isinstance(x, bytes)
+                                                            else str(x),
+                                                            return_dtype=pl.Utf8,
+                                                        )
+                                                        .str.to_decimal()
+                                                        .cast(target_dtype),
                                                     )
-                                                    .str.to_decimal()
-                                                    .cast(target_dtype),
                                                 )
                                             else:
                                                 # Try direct casting
-                                                self.features_df = self.features_df.with_columns(
-                                                    pl.col(col).cast(target_dtype),
+                                                self.features_df = (
+                                                    self.features_df.with_columns(
+                                                        pl.col(col).cast(target_dtype),
+                                                    )
                                                 )
                                         else:
                                             # Try direct casting for non-binary types
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col).cast(target_dtype),
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col).cast(target_dtype),
+                                                )
                                             )
                                 elif "Utf8" in dtype_str:
                                     # Ensure it's string type
@@ -1419,43 +1632,59 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                                     if "Binary" in str(current_dtype):
                                         # Convert binary to string first, then to target type
                                         if target_dtype == pl.Utf8:
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col)
-                                                .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                    return_dtype=pl.Utf8,
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col)
+                                                    .map_elements(
+                                                        lambda x: x.decode("utf-8")
+                                                        if isinstance(x, bytes)
+                                                        else str(x),
+                                                        return_dtype=pl.Utf8,
+                                                    )
+                                                    .cast(target_dtype),
                                                 )
-                                                .cast(target_dtype),
                                             )
                                         elif "Int" in str(target_dtype):
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col)
-                                                .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                    return_dtype=pl.Utf8,
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col)
+                                                    .map_elements(
+                                                        lambda x: x.decode("utf-8")
+                                                        if isinstance(x, bytes)
+                                                        else str(x),
+                                                        return_dtype=pl.Utf8,
+                                                    )
+                                                    .str.to_integer()
+                                                    .cast(target_dtype),
                                                 )
-                                                .str.to_integer()
-                                                .cast(target_dtype),
                                             )
                                         elif "Float" in str(target_dtype):
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col)
-                                                .map_elements(
-                                                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
-                                                    return_dtype=pl.Utf8,
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col)
+                                                    .map_elements(
+                                                        lambda x: x.decode("utf-8")
+                                                        if isinstance(x, bytes)
+                                                        else str(x),
+                                                        return_dtype=pl.Utf8,
+                                                    )
+                                                    .str.to_decimal()
+                                                    .cast(target_dtype),
                                                 )
-                                                .str.to_decimal()
-                                                .cast(target_dtype),
                                             )
                                         else:
                                             # Try direct casting
-                                            self.features_df = self.features_df.with_columns(
-                                                pl.col(col).cast(target_dtype),
+                                            self.features_df = (
+                                                self.features_df.with_columns(
+                                                    pl.col(col).cast(target_dtype),
+                                                )
                                             )
                                     else:
                                         # Try direct casting for non-binary types
-                                        self.features_df = self.features_df.with_columns(
-                                            pl.col(col).cast(target_dtype),
+                                        self.features_df = (
+                                            self.features_df.with_columns(
+                                                pl.col(col).cast(target_dtype),
+                                            )
                                         )
                         except Exception as e:
                             self.logger.warning(
@@ -1470,23 +1699,31 @@ def _load_sample5_study(self, filename: str, map: bool = False):
                 # This ensures "None" strings introduced by failed conversions are properly handled
                 for col in self.features_df.columns:
                     if self.features_df[col].dtype == pl.Utf8:  # String columns
-                        self.features_df = self.features_df.with_columns([
-                            pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
-                            .then(None)
-                            .otherwise(pl.col(col))
-                            .alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                                .then(None)
+                                .otherwise(pl.col(col))
+                                .alias(col),
+                            ],
+                        )
                     # Float columns
                     elif self.features_df[col].dtype in [pl.Float64, pl.Float32]:
-                        self.features_df = self.features_df.with_columns([
-                            pl.col(col).fill_nan(None).alias(col),
-                        ])
+                        self.features_df = self.features_df.with_columns(
+                            [
+                                pl.col(col).fill_nan(None).alias(col),
+                            ],
+                        )
                 # Ensure column order matches schema order
                 if "features_df" in schema and "columns" in schema["features_df"]:
                     schema_column_order = list(schema["features_df"]["columns"].keys())
                     # Only reorder columns that exist in both schema and DataFrame
-                    existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
+                    existing_columns = [
+                        col
+                        for col in schema_column_order
+                        if col in self.features_df.columns
+                    ]
                     if existing_columns:
                         self.features_df = self.features_df.select(existing_columns)
@@ -1516,7 +1753,9 @@ def _load_sample5_study(self, filename: str, map: bool = False):
     # set self.label to basename without extension
     if self.label is None or self.label == "":
         self.label = os.path.splitext(os.path.basename(filename))[0]
-    self.logger.info(f"Sample loaded successfully from {filename} (optimized for study)")
+    self.logger.info(
+        f"Sample loaded successfully from {filename} (optimized for study)",
+    )
 def load_schema(schema_path: str) -> Dict[str, Any]:
@@ -1564,13 +1803,20 @@ def clean_null_values_polars(df: pl.DataFrame) -> pl.DataFrame:
     cleaned_df = df
     for col in df.columns:
         if df[col].dtype == pl.Utf8:  # String columns
-            cleaned_df = cleaned_df.with_columns([
-                pl.when(pl.col(col).is_in(["None", "", "null", "NULL"])).then(None).otherwise(pl.col(col)).alias(col),
-            ])
+            cleaned_df = cleaned_df.with_columns(
+                [
+                    pl.when(pl.col(col).is_in(["None", "", "null", "NULL"]))
+                    .then(None)
+                    .otherwise(pl.col(col))
+                    .alias(col),
+                ],
+            )
         elif df[col].dtype in [pl.Float64, pl.Float32]:  # Float columns
-            cleaned_df = cleaned_df.with_columns([
-                pl.col(col).fill_nan(None).alias(col),
-            ])
+            cleaned_df = cleaned_df.with_columns(
+                [
+                    pl.col(col).fill_nan(None).alias(col),
+                ],
+            )
     return cleaned_df
@@ -1606,7 +1852,12 @@ def cast_column_by_dtype(df: pl.DataFrame, col: str, dtype_str: str) -> pl.DataF
         return df
-def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
+def _cast_to_int(
+    df: pl.DataFrame,
+    col: str,
+    current_dtype: pl.DataType,
+    target_dtype: pl.DataType,
+) -> pl.DataFrame:
     """Helper function to cast column to integer type."""
     if current_dtype == pl.Utf8:
         return df.with_columns(
@@ -1618,7 +1869,12 @@ def _cast_to_int(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_
         return _cast_with_binary_handling(df, col, current_dtype, target_dtype)
-def _cast_to_float(df: pl.DataFrame, col: str, current_dtype: pl.DataType, target_dtype: pl.DataType) -> pl.DataFrame:
+def _cast_to_float(
+    df: pl.DataFrame,
+    col: str,
+    current_dtype: pl.DataType,
+    target_dtype: pl.DataType,
+) -> pl.DataFrame:
     """Helper function to cast column to float type."""
     if current_dtype == pl.Utf8:
         return df.with_columns(
@@ -1639,20 +1895,29 @@ def _cast_with_binary_handling(
         if target_dtype == pl.Utf8:
             return df.with_columns(
                 pl.col(col)
-                .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
+                .map_elements(
+                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                    return_dtype=pl.Utf8,
+                )
                 .cast(target_dtype),
             )
         elif "Int" in str(target_dtype):
             return df.with_columns(
                 pl.col(col)
-                .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
+                .map_elements(
+                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                    return_dtype=pl.Utf8,
+                )
                 .str.to_integer()
                 .cast(target_dtype),
             )
         elif "Float" in str(target_dtype):
             return df.with_columns(
                 pl.col(col)
-                .map_elements(lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x), return_dtype=pl.Utf8)
+                .map_elements(
+                    lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+                    return_dtype=pl.Utf8,
+                )
                 .str.to_decimal()
                 .cast(target_dtype),
             )
@@ -1661,7 +1926,11 @@ def _cast_with_binary_handling(
     return df.with_columns(pl.col(col).cast(target_dtype))
-def apply_schema_to_dataframe(df: pl.DataFrame, schema: Dict[str, Any], df_name: str) -> pl.DataFrame:
+def apply_schema_to_dataframe(
+    df: pl.DataFrame,
+    schema: Dict[str, Any],
+    df_name: str,
+) -> pl.DataFrame:
     """
     Apply schema type casting to a Polars DataFrame.
@@ -1819,7 +2088,9 @@ def _create_dataframe_with_object_columns(
     schema_columns = schema.get(df_name, {}).get("columns", {})
     object_columns = {
-        k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
+        k: v
+        for k, v in data.items()
+        if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
     }
     regular_columns = {k: v for k, v in data.items() if k not in object_columns}
@@ -1874,13 +2145,17 @@ def load_ms1_dataframe_from_h5_group(
                 dtype_str = schema_columns[col]["dtype"]
                 try:
                     if "Int" in dtype_str:
-                        ms1_df = ms1_df.with_columns([
-                            pl.col(col).cast(pl.Int64, strict=False),
-                        ])
+                        ms1_df = ms1_df.with_columns(
+                            [
+                                pl.col(col).cast(pl.Int64, strict=False),
+                            ],
+                        )
                     elif "Float" in dtype_str:
-                        ms1_df = ms1_df.with_columns([
-                            pl.col(col).cast(pl.Float64, strict=False),
-                        ])
+                        ms1_df = ms1_df.with_columns(
+                            [
+                                pl.col(col).cast(pl.Float64, strict=False),
+                            ],
+                        )
                 except Exception as e:
                     if logger:
                         logger.warning(
@@ -1891,7 +2166,9 @@ def load_ms1_dataframe_from_h5_group(
     return clean_null_values_polars(ms1_df)
-def load_parameters_from_metadata(metadata_group: h5py.Group) -> Optional[Dict[str, Any]]:
+def load_parameters_from_metadata(
+    metadata_group: h5py.Group,
+) -> Optional[Dict[str, Any]]:
     """
     Load parameters from HDF5 metadata group.
@@ -1938,6 +2215,8 @@ def create_h5_metadata_group(
     metadata_group = f.create_group("metadata")
     metadata_group.attrs["format"] = "master-sample5-1"
     metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
-    metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
+    metadata_group.attrs["file_source"] = (
+        str(file_source) if file_source is not None else ""
+    )
     metadata_group.attrs["file_type"] = str(file_type) if file_type is not None else ""
     metadata_group.attrs["label"] = str(label) if label is not None else ""

masster 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

Potentially problematic release.

masster 0.4.4py3-none-any.whl → 0.4.6py3-none-any.whl