masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/sample/h5.py CHANGED
@@ -1,4 +1,4 @@
- import json
+ import json
  import os

  import h5py
@@ -62,7 +62,7 @@ def _save_sample5(
  return

  # synchronize feature_map if it exists
- #if hasattr(self, "_feature_map") and self._feature_map is not None:
+ # if hasattr(self, "_feature_map") and self._feature_map is not None:
  # self._features_sync()

  # if no extension is given, add .sample5
@@ -94,7 +94,7 @@ def _save_sample5(
  metadata_group.attrs["file_source"] = str(self.file_source)
  else:
  metadata_group.attrs["file_source"] = ""
- if hasattr(self, 'type') and self.type is not None:
+ if hasattr(self, "type") and self.type is not None:
  metadata_group.attrs["file_type"] = str(self.type)
  else:
  metadata_group.attrs["file_type"] = ""
@@ -127,11 +127,7 @@ def _save_sample5(
  numeric_data = np.array(
  [
  float(x)
- if x is not None
- and str(x)
- .replace(".", "")
- .replace("-", "")
- .isdigit()
+ if x is not None and str(x).replace(".", "").replace("-", "").isdigit()
  else np.nan
  for x in data
  ],
@@ -289,21 +285,21 @@ def _save_sample5(

  # Store parameters/history as JSON
  # Always ensure we sync instance attributes to parameters before saving
- if hasattr(self, 'parameters') and self.parameters is not None:
- if hasattr(self, 'polarity') and self.polarity is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
+ if hasattr(self, "polarity") and self.polarity is not None:
  self.parameters.polarity = self.polarity
- if hasattr(self, 'type') and self.type is not None:
+ if hasattr(self, "type") and self.type is not None:
  self.parameters.type = self.type
-
+
  # Prepare save data
  save_data = {}
-
+
  # Add parameters as a dictionary
- if hasattr(self, 'parameters') and self.parameters is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
  save_data["sample"] = self.parameters.to_dict()
-
+
  # Add history data (but ensure it's JSON serializable)
- if hasattr(self, 'history') and self.history is not None:
+ if hasattr(self, "history") and self.history is not None:
  # Convert any non-JSON-serializable objects to strings/dicts
  serializable_history = {}
  for key, value in self.history.items():
@@ -318,7 +314,7 @@ def _save_sample5(
  # Convert to string if not serializable
  serializable_history[key] = str(value)
  save_data.update(serializable_history)
-
+
  # Save as JSON
  params_json = json.dumps(save_data, indent=2)
  metadata_group.attrs["parameters"] = params_json
@@ -480,9 +476,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to integer
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.scans_df[col].dtype in [
  pl.Float64,
@@ -502,9 +496,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to float
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Try direct casting
@@ -528,9 +520,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .cast(target_dtype),
@@ -539,9 +529,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_integer()
@@ -551,9 +539,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_decimal()
@@ -582,9 +568,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "scans_df" in schema and "columns" in schema["scans_df"]:
  schema_column_order = list(schema["scans_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col for col in schema_column_order if col in self.scans_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
  if existing_columns:
  self.scans_df = self.scans_df.select(existing_columns)

@@ -730,9 +714,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if k in schema.get("features_df", {}).get("columns", {})
  and schema["features_df"]["columns"][k]["dtype"] == "pl.Object"
  }
- regular_columns = {
- k: v for k, v in data.items() if k not in object_columns
- }
+ regular_columns = {k: v for k, v in data.items() if k not in object_columns}

  # Create DataFrame with regular columns first
  if regular_columns:
@@ -769,25 +751,19 @@ def _load_sample5(self, filename: str, map: bool = False):
  # Convert to numeric first, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to integer
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.features_df[col].dtype in [
  pl.Float64,
  pl.Float32,
  ]:
  # Float data - cast to integer with null handling for NaN values
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(
- eval(dtype_str),
- strict=False,
- ),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(
+ eval(dtype_str),
+ strict=False,
+ ),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -798,70 +774,50 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Float" in dtype_str:
  # Convert to float, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to float
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -872,59 +828,43 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Utf8" in dtype_str:
  # Ensure it's string type
@@ -940,59 +880,43 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  except Exception as e:
  self.logger.warning(
@@ -1027,11 +951,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  if "features_df" in schema and "columns" in schema["features_df"]:
  schema_column_order = list(schema["features_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col
- for col in schema_column_order
- if col in self.features_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
  if existing_columns:
  self.features_df = self.features_df.select(existing_columns)

@@ -1087,7 +1007,7 @@ def _load_sample5(self, filename: str, map: bool = False):
  # Parameters are now loaded from metadata JSON (see above)
  # Lib and lib_match are no longer saved/loaded

- #if map:
+ # if map:
  # featureXML = filename.replace(".sample5", ".featureXML")
  # if os.path.exists(featureXML):
  # self._load_featureXML(featureXML)
@@ -1102,14 +1022,14 @@ def _load_sample5(self, filename: str, map: bool = False):
  # set self.label to basename without extension
  if self.label is None or self.label == "":
  self.label = os.path.splitext(os.path.basename(filename))[0]
-
+
  # Sync instance attributes from loaded parameters
- if hasattr(self, 'parameters') and self.parameters is not None:
- if hasattr(self.parameters, 'polarity') and self.parameters.polarity is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
+ if hasattr(self.parameters, "polarity") and self.parameters.polarity is not None:
  self.polarity = self.parameters.polarity
- if hasattr(self.parameters, 'type') and self.parameters.type is not None:
+ if hasattr(self.parameters, "type") and self.parameters.type is not None:
  self.type = self.parameters.type
-
+
  self.logger.info(f"Sample loaded from {filename}")


@@ -1247,9 +1167,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to integer
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.scans_df[col].dtype in [
  pl.Float64,
@@ -1269,9 +1187,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if self.scans_df[col].dtype == pl.Utf8:
  # String data - convert to float
  self.scans_df = self.scans_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Try direct casting
@@ -1295,9 +1211,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .cast(target_dtype),
@@ -1306,9 +1220,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_integer()
@@ -1318,9 +1230,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  self.scans_df = self.scans_df.with_columns(
  pl.col(col)
  .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
  return_dtype=pl.Utf8,
  )
  .str.to_decimal()
@@ -1349,9 +1259,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "scans_df" in schema and "columns" in schema["scans_df"]:
  schema_column_order = list(schema["scans_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col for col in schema_column_order if col in self.scans_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.scans_df.columns]
  if existing_columns:
  self.scans_df = self.scans_df.select(existing_columns)

@@ -1556,25 +1464,19 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  # Convert to numeric first, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to integer
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_integer()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_integer().cast(eval(dtype_str)),
  )
  elif self.features_df[col].dtype in [
  pl.Float64,
  pl.Float32,
  ]:
  # Float data - cast to integer with null handling for NaN values
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(
- eval(dtype_str),
- strict=False,
- ),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(
+ eval(dtype_str),
+ strict=False,
+ ),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -1585,70 +1487,50 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Float" in dtype_str:
  # Convert to float, handling different input types
  if self.features_df[col].dtype == pl.Utf8:
  # String data - convert to float
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .str.to_decimal()
- .cast(eval(dtype_str)),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).str.to_decimal().cast(eval(dtype_str)),
  )
  else:
  # Handle special cases and try direct casting for other types
@@ -1659,59 +1541,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  elif "Utf8" in dtype_str:
  # Ensure it's string type
@@ -1727,59 +1593,43 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "Binary" in str(current_dtype):
  # Convert binary to string first, then to target type
  if target_dtype == pl.Utf8:
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .cast(target_dtype),
  )
  elif "Int" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_integer()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_integer()
+ .cast(target_dtype),
  )
  elif "Float" in str(target_dtype):
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col)
- .map_elements(
- lambda x: x.decode("utf-8")
- if isinstance(x, bytes)
- else str(x),
- return_dtype=pl.Utf8,
- )
- .str.to_decimal()
- .cast(target_dtype),
+ self.features_df = self.features_df.with_columns(
+ pl.col(col)
+ .map_elements(
+ lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
+ return_dtype=pl.Utf8,
  )
+ .str.to_decimal()
+ .cast(target_dtype),
  )
  else:
  # Try direct casting
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  else:
  # Try direct casting for non-binary types
- self.features_df = (
- self.features_df.with_columns(
- pl.col(col).cast(target_dtype),
- )
+ self.features_df = self.features_df.with_columns(
+ pl.col(col).cast(target_dtype),
  )
  except Exception as e:
  self.logger.warning(
@@ -1814,11 +1664,7 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  if "features_df" in schema and "columns" in schema["features_df"]:
  schema_column_order = list(schema["features_df"]["columns"].keys())
  # Only reorder columns that exist in both schema and DataFrame
- existing_columns = [
- col
- for col in schema_column_order
- if col in self.features_df.columns
- ]
+ existing_columns = [col for col in schema_column_order if col in self.features_df.columns]
  if existing_columns:
  self.features_df = self.features_df.select(existing_columns)

@@ -1848,14 +1694,14 @@ def _load_sample5_study(self, filename: str, map: bool = False):
  # set self.label to basename without extension
  if self.label is None or self.label == "":
  self.label = os.path.splitext(os.path.basename(filename))[0]
-
+
  # Sync instance attributes from loaded parameters
- if hasattr(self, 'parameters') and self.parameters is not None:
- if hasattr(self.parameters, 'polarity') and self.parameters.polarity is not None:
+ if hasattr(self, "parameters") and self.parameters is not None:
+ if hasattr(self.parameters, "polarity") and self.parameters.polarity is not None:
  self.polarity = self.parameters.polarity
- if hasattr(self.parameters, 'type') and self.parameters.type is not None:
+ if hasattr(self.parameters, "type") and self.parameters.type is not None:
  self.type = self.parameters.type
-
+
  self.logger.info(
  f"Sample loaded successfully from {filename} (optimized for study)",
  )
@@ -2191,9 +2037,7 @@ def _create_dataframe_with_object_columns(
  schema_columns = schema.get(df_name, {}).get("columns", {})

  object_columns = {
- k: v
- for k, v in data.items()
- if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
+ k: v for k, v in data.items() if k in schema_columns and schema_columns[k]["dtype"] == "pl.Object"
  }
  regular_columns = {k: v for k, v in data.items() if k not in object_columns}

@@ -2318,8 +2162,6 @@ def create_h5_metadata_group(
  metadata_group = f.create_group("metadata")
  metadata_group.attrs["format"] = "masster-sample5-1"
  metadata_group.attrs["file_path"] = str(file_path) if file_path is not None else ""
- metadata_group.attrs["file_source"] = (
- str(file_source) if file_source is not None else ""
- )
+ metadata_group.attrs["file_source"] = str(file_source) if file_source is not None else ""
  metadata_group.attrs["file_type"] = str(type) if type is not None else ""
  metadata_group.attrs["label"] = str(label) if label is not None else ""
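
For readers unfamiliar with the Polars idiom that most of these hunks reformat, the following is a minimal, self-contained sketch of the binary-to-string conversion pattern the loader applies when restoring HDF5-backed columns. The column name, sample values, and target dtype below are illustrative assumptions, not code taken from the masster package, and it targets recent Polars versions that provide str.to_decimal.

import polars as pl

# Hypothetical data: HDF5 string attributes often round-trip as a Binary column,
# so values are decoded to UTF-8 before casting to the dtype recorded in a schema.
df = pl.DataFrame({"mz": [b"100.5", b"200.25", None]})
target_dtype = pl.Float64  # assumed schema dtype for this column

df = df.with_columns(
    pl.col("mz")
    .map_elements(
        # Decode bytes to str; fall back to str() for any other value.
        lambda x: x.decode("utf-8") if isinstance(x, bytes) else str(x),
        return_dtype=pl.Utf8,
    )
    .str.to_decimal()
    .cast(target_dtype),
)
print(df)  # "mz" is now Float64, with nulls preserved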