masster 0.5.21__py3-none-any.whl → 0.5.23__py3-none-any.whl

This diff shows the content changes between two package versions that have been publicly released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registry.

Potentially problematic release.


This version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -61,18 +61,18 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
   if df_name not in schema:
   # Fallback to basic empty DataFrame if schema not found
   return pl.DataFrame()
-
+
   df_schema = schema[df_name]["columns"]
   empty_data = {}
   polars_schema = {}
-
+
   for col_name, col_info in df_schema.items():
   dtype_str = col_info["dtype"]
   # Convert string representation to actual Polars dtype
   if dtype_str == "pl.Int64":
   polars_dtype = pl.Int64
   elif dtype_str == "pl.Int32":
- polars_dtype = pl.Int32
+ polars_dtype = pl.Int32
   elif dtype_str == "pl.Float64":
   polars_dtype = pl.Float64
   elif dtype_str == "pl.Utf8":
@@ -88,10 +88,10 @@ def _create_empty_dataframe_from_schema(df_name: str, schema: dict) -> pl.DataFr
   else:
   # Fallback to string if unknown type
   polars_dtype = pl.String
-
+
   empty_data[col_name] = []
   polars_schema[col_name] = polars_dtype
-
+
   return pl.DataFrame(empty_data, schema=polars_schema)
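Both hunks above only strip trailing whitespace in `_create_empty_dataframe_from_schema`. For readers skimming the diff, the function's job is to map the stored dtype strings onto Polars dtypes and build an empty, correctly typed DataFrame. A minimal, self-contained sketch of that pattern (the schema layout and function name here are illustrative, not masster's exact code):

```python
import polars as pl

# Assumed schema shape, mirroring what the diff suggests:
# {"features": {"columns": {"mz": {"dtype": "pl.Float64"}, ...}}}
_DTYPE_MAP = {
    "pl.Int64": pl.Int64,
    "pl.Int32": pl.Int32,
    "pl.Float64": pl.Float64,
    "pl.Utf8": pl.Utf8,
}


def empty_df_from_schema(df_name: str, schema: dict) -> pl.DataFrame:
    """Build an empty DataFrame whose columns follow the stored schema."""
    if df_name not in schema:
        return pl.DataFrame()
    columns = schema[df_name]["columns"]
    polars_schema = {
        # Unknown dtype strings fall back to a plain string column.
        name: _DTYPE_MAP.get(info["dtype"], pl.String)
        for name, info in columns.items()
    }
    return pl.DataFrame({name: [] for name in polars_schema}, schema=polars_schema)
```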
@@ -313,7 +313,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
   serialized_chunk.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   serialized_chunk.append("None")
   elif col_name == "ms1_spec":
@@ -325,7 +325,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
   serialized_chunk.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ serialized_chunk.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   serialized_chunk.append("None")
   else:
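These two hunks (and the matching ones in `_save_dataframe_column_legacy` further down) only change quote style inside the serialization fallback. What that fallback does: each object cell is stored as JSON text, using `tolist()` for numpy arrays, a plain list for other iterables, an empty list otherwise, and the literal string `"None"` for missing values. A hedged, standalone sketch of that behavior (the function name is illustrative):

```python
import json

import numpy as np


def serialize_cell(item) -> str:
    """JSON-encode a single object cell the way the fallback path above does."""
    if item is None:
        return "None"
    try:
        return json.dumps(item.tolist())  # numpy arrays and numpy scalars
    except (AttributeError, TypeError):
        # Fallback for non-numpy data: any iterable becomes a list, else [].
        return json.dumps(list(item) if hasattr(item, "__iter__") else [])


print(serialize_cell(np.array([[100.05, 1200.0]])))  # [[100.05, 1200.0]]
print(serialize_cell((3, 4)))                        # [3, 4]
print(serialize_cell(None))                          # None
```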
@@ -392,10 +392,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
   )
   # Fallback to simple string conversion for this chunk
   chunk = data_list[chunk_start : chunk_start + chunk_size]
- results[chunk_start] = [
- str(item) if item is not None else "None"
- for item in chunk
- ]
+ results[chunk_start] = [str(item) if item is not None else "None" for item in chunk]

   # Reassemble in correct order
   for i in range(0, total_items, chunk_size):
@@ -598,7 +595,7 @@ def _save_dataframe_column_legacy(
   data_as_json_strings.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   data_as_json_strings.append("None")
   group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -612,7 +609,7 @@ def _save_dataframe_column_legacy(
   data_as_json_strings.append(json.dumps(item.tolist()))
   except (AttributeError, TypeError):
   # Fallback for non-numpy data
- data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+ data_as_json_strings.append(json.dumps(list(item) if hasattr(item, "__iter__") else []))
   else:
   data_as_json_strings.append("None")
   group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
@@ -712,9 +709,7 @@ def _reconstruct_object_column(data_col, col_name: str):
   "adduct": str(adduct_row[0]),
   "count": int(float(adduct_row[1])),
   "percentage": float(adduct_row[2]),
- "mass": float(adduct_row[3])
- if len(adduct_row) > 3
- else 0.0,
+ "mass": float(adduct_row[3]) if len(adduct_row) > 3 else 0.0,
   },
   )
   reconstructed_data.append(converted_adducts)
@@ -722,6 +717,7 @@ def _reconstruct_object_column(data_col, col_name: str):
   # Handle isotope patterns (numpy arrays with [mz, intensity] data)
   try:
   import numpy as np
+
   iso_data = json.loads(item)
   # Convert back to numpy array
   reconstructed_data.append(np.array(iso_data) if iso_data else None)
@@ -731,6 +727,7 @@ def _reconstruct_object_column(data_col, col_name: str):
   # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
   try:
   import numpy as np
+
   ms1_spec_data = json.loads(item)
   # Convert back to numpy array
   reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
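The added blank lines above land in the reconstruction path, which reverses the serializer: JSON strings read back from HDF5 are decoded and, for spectrum-like columns such as `iso` and `ms1_spec`, turned back into numpy arrays. A minimal round-trip sketch (the helper name is illustrative):

```python
import json

import numpy as np


def reconstruct_spectrum(serialized: str):
    """Inverse of the fallback serializer: JSON text -> numpy array (or None)."""
    if serialized == "None":
        return None
    data = json.loads(serialized)
    return np.array(data) if data else None


spec = reconstruct_spectrum(json.dumps([[100.05, 1200.0], [101.05, 300.0]]))
print(spec.shape)  # (2, 2): rows of [mz, intensity]
```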
@@ -821,25 +818,25 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   # First check all data for numpy object arrays and move them to object columns
   additional_object_cols = []
   for k, v in data.items():
- if k not in object_columns and hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if k not in object_columns and hasattr(v, "dtype") and str(v.dtype) == "object":
   # This is a numpy object array that should be treated as object
   additional_object_cols.append(k)
   object_columns.append(k)
-
+
   if additional_object_cols:
   # Re-run reconstruction for these columns
   for col in additional_object_cols:
   data[col] = _reconstruct_object_column(data[col], col)
-
+
   object_data = {k: v for k, v in data.items() if k in object_columns}
   regular_data = {k: v for k, v in data.items() if k not in object_columns}

   # Final check: ensure no numpy object arrays in regular_data
   problematic_cols = []
   for k, v in regular_data.items():
- if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if hasattr(v, "dtype") and str(v.dtype) == "object":
   problematic_cols.append(k)
-
+
   if problematic_cols:
   # Move these to object_data
   for col in problematic_cols:
@@ -878,7 +875,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   # and handle numpy scalars within lists
   safe_regular_data = {}
   import numpy as np
-
+
   def convert_numpy_scalars(value):
   """Convert numpy scalars to Python native types recursively."""
   if isinstance(value, np.generic):
@@ -887,17 +884,19 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   return [convert_numpy_scalars(item) for item in value]
   else:
   return value
-
+
   for k, v in regular_data.items():
- if hasattr(v, 'dtype') and str(v.dtype) == 'object':
+ if hasattr(v, "dtype") and str(v.dtype) == "object":
   # Convert numpy object array to Python list
- safe_regular_data[k] = [convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, 'tolist') else list(v))]
+ safe_regular_data[k] = [
+ convert_numpy_scalars(item) for item in (v.tolist() if hasattr(v, "tolist") else list(v))
+ ]
   elif isinstance(v, list):
   # Handle lists that might contain numpy scalars
   safe_regular_data[k] = [convert_numpy_scalars(item) for item in v]
   else:
   safe_regular_data[k] = convert_numpy_scalars(v)
-
+
   # Create DataFrame with proper error handling
   try:
   df = pl.DataFrame(safe_regular_data)
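The `convert_numpy_scalars` helper visible in these hunks unwraps numpy scalar types before the data is handed to Polars. A self-contained sketch of the idea, assuming `.item()` for scalars and recursion over lists (the real helper may differ in detail):

```python
import numpy as np


def convert_numpy_scalars(value):
    """Recursively turn numpy scalars (np.generic) into native Python values."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, (list, tuple)):
        return [convert_numpy_scalars(item) for item in value]
    return value


print(convert_numpy_scalars([np.int64(3), [np.float32(1.5)], "x"]))  # [3, [1.5], 'x']
```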
@@ -914,7 +913,7 @@ def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataF
   except Exception:
   # Last resort: skip the column entirely
   continue
-
+
   # Add Object columns one by one
   for col, values in object_data.items():
   # print(f"DEBUG: Adding object column '{col}', type: {type(values)}, length: {len(values) if values is not None else 'None'}")
@@ -993,9 +992,7 @@ def _load_dataframe_from_group(
   )
   schema_section = schema.get(df_name, {}) if isinstance(schema, dict) else {}
   logger.debug(f"Schema section for {df_name}: {schema_section}")
- schema_columns = (
- schema_section.get("columns", []) if isinstance(schema_section, dict) else []
- )
+ schema_columns = schema_section.get("columns", []) if isinstance(schema_section, dict) else []
   logger.debug(f"Schema columns for {df_name}: {schema_columns}")
   if schema_columns is None:
   schema_columns = []
@@ -1158,11 +1155,7 @@ def _load_dataframe_from_group(
   }
   migrated_old_names = set(column_migrations.keys())

- extra_columns = [
- col
- for col in hdf5_columns
- if col not in (schema_columns or []) and col not in migrated_old_names
- ]
+ extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

   for col in extra_columns:
   logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
@@ -1188,10 +1181,7 @@ def _load_dataframe_from_group(
   object_columns.append(col)
   else:
   # Regular string data
- data[col] = [
- item.decode("utf-8") if isinstance(item, bytes) else item
- for item in column_data
- ]
+ data[col] = [item.decode("utf-8") if isinstance(item, bytes) else item for item in column_data]
   except Exception:
   # If decoding fails, treat as regular data
   data[col] = column_data
@@ -1204,19 +1194,10 @@ def _load_dataframe_from_group(
   # Handle byte string conversion for non-object columns
   # Only convert to strings for columns that should actually be strings
   for col, values in data.items():
- if (
- col not in object_columns
- and values is not None
- and len(values) > 0
- and isinstance(values[0], bytes)
- ):
+ if col not in object_columns and values is not None and len(values) > 0 and isinstance(values[0], bytes):
   # Check schema to see if this should be a string column
   should_be_string = False
- if (
- df_name in schema
- and "columns" in schema[df_name]
- and col in schema[df_name]["columns"]
- ):
+ if df_name in schema and "columns" in schema[df_name] and col in schema[df_name]["columns"]:
   dtype_str = schema[df_name]["columns"][col]["dtype"]
   should_be_string = dtype_str == "pl.Utf8"
@@ -1237,25 +1218,25 @@ def _load_dataframe_from_group(
   logger.debug(
   f"Object column '{col}': length={len(data[col]) if data[col] is not None else 'None'}",
   )
-
+
   # Debug: check for problematic data types in all columns before DataFrame creation
   for col, values in data.items():
- if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+ if hasattr(values, "dtype") and str(values.dtype) == "object":
   logger.warning(f"Column '{col}' has numpy object dtype but is not in object_columns: {object_columns}")
   if col not in object_columns:
   object_columns.append(col)
-
+
   df = _create_dataframe_with_objects(data, object_columns)
   else:
   # Debug: check for problematic data types when no object columns are expected
   for col, values in data.items():
- if hasattr(values, 'dtype') and str(values.dtype) == 'object':
+ if hasattr(values, "dtype") and str(values.dtype) == "object":
   logger.warning(f"Column '{col}' has numpy object dtype but no object_columns specified!")
   # Treat as object column
   if object_columns is None:
   object_columns = []
   object_columns.append(col)
-
+
   if object_columns:
   df = _create_dataframe_with_objects(data, object_columns)
   else:
@@ -1302,34 +1283,21 @@ def _save_study5_compressed(self, filename):
   dataframes_to_save.append(("features", len(self.features_df)))
   if self.consensus_df is not None and not self.consensus_df.is_empty():
   dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   dataframes_to_save.append(
   ("consensus_mapping", len(self.consensus_mapping_df)),
   )
   if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
   dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   dataframes_to_save.append(("lib", len(self.lib_df)))
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   dataframes_to_save.append(("id", len(self.id_df)))

   total_steps = len(dataframes_to_save) + 1 # +1 for metadata

   # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

   with tqdm(
   total=total_steps,
@@ -1347,14 +1315,8 @@ def _save_study5_compressed(self, filename):

   # Store metadata
   metadata_group.attrs["format"] = "masster-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

   # Store parameters as JSON
   if hasattr(self, "parameters") and self.history is not None:
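The metadata hunk above collapses multi-line conditional expressions into single lines; the surrounding save path stores metadata as HDF5 attributes and each DataFrame column as a compressed dataset. A rough, illustrative sketch of that kind of layout with h5py and Polars (the file name, group names, and compression settings are assumptions, not masster's exact choices):

```python
import json

import h5py
import polars as pl

# Toy stand-in for one of the study DataFrames.
df = pl.DataFrame({"mz": [100.05, 101.05], "inty": [1200.0, 300.0]})

with h5py.File("example_study.h5", "w") as f:
    meta = f.create_group("metadata")
    meta.attrs["format"] = "masster-study-1"
    meta.attrs["label"] = "demo"
    meta.create_dataset("parameters", data=json.dumps({"log_level": "INFO"}))

    grp = f.create_group("features")
    for col in df.columns:
        # One compressed dataset per column, mirroring the column-wise save.
        grp.create_dataset(col, data=df[col].to_numpy(), compression="gzip")
```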
@@ -1419,10 +1381,7 @@ def _save_study5_compressed(self, filename):
   pbar.update(1)

   # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   consensus_mapping = self.consensus_mapping_df.clone()
   self.logger.debug(
   f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1458,11 +1417,7 @@ def _save_study5_compressed(self, filename):
   pbar.update(1)

   # Store lib_df - library data
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   self.logger.debug(
   f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
   )
@@ -1476,11 +1431,7 @@ def _save_study5_compressed(self, filename):
   pbar.update(1)

   # Store id_df - identification results
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   self.logger.debug(
   f"Saving id_df with {len(self.id_df)} rows using optimized method",
   )
@@ -1636,34 +1587,21 @@ def _save_study5(self, filename):
   dataframes_to_save.append(("features", len(self.features_df)))
   if self.consensus_df is not None and not self.consensus_df.is_empty():
   dataframes_to_save.append(("consensus", len(self.consensus_df)))
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   dataframes_to_save.append(
   ("consensus_mapping", len(self.consensus_mapping_df)),
   )
   if self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
   dataframes_to_save.append(("consensus_ms2", len(self.consensus_ms2)))
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   dataframes_to_save.append(("lib", len(self.lib_df)))
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   dataframes_to_save.append(("id", len(self.id_df)))

   total_steps = len(dataframes_to_save) + 1 # +1 for metadata

   # Show progress for large saves
- tdqm_disable = (
- self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2
- )
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"] or total_steps < 2

   with tqdm(
   total=total_steps,
@@ -1681,14 +1619,8 @@ def _save_study5(self, filename):

   # Store metadata
   metadata_group.attrs["format"] = "masster-study-1"
- metadata_group.attrs["folder"] = (
- str(self.folder) if self.folder is not None else ""
- )
- metadata_group.attrs["label"] = (
- str(self.label)
- if hasattr(self, "label") and self.label is not None
- else ""
- )
+ metadata_group.attrs["folder"] = str(self.folder) if self.folder is not None else ""
+ metadata_group.attrs["label"] = str(self.label) if hasattr(self, "label") and self.label is not None else ""

   # Store parameters as JSON
   if hasattr(self, "parameters") and self.history is not None:
@@ -1756,10 +1688,7 @@ def _save_study5(self, filename):
   pbar.update(1)

   # Store consensus_mapping_df - keep existing fast method
- if (
- self.consensus_mapping_df is not None
- and not self.consensus_mapping_df.is_empty()
- ):
+ if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
   consensus_mapping = self.consensus_mapping_df.clone()
   self.logger.debug(
   f"Saving consensus_mapping_df with {len(consensus_mapping)} rows",
@@ -1795,11 +1724,7 @@ def _save_study5(self, filename):
   pbar.update(1)

   # Store lib_df - library data
- if (
- hasattr(self, "lib_df")
- and self.lib_df is not None
- and not self.lib_df.is_empty()
- ):
+ if hasattr(self, "lib_df") and self.lib_df is not None and not self.lib_df.is_empty():
   self.logger.debug(
   f"Saving lib_df with {len(self.lib_df)} rows using optimized method",
   )
@@ -1813,11 +1738,7 @@ def _save_study5(self, filename):
   pbar.update(1)

   # Store id_df - identification results
- if (
- hasattr(self, "id_df")
- and self.id_df is not None
- and not self.id_df.is_empty()
- ):
+ if hasattr(self, "id_df") and self.id_df is not None and not self.id_df.is_empty():
   self.logger.debug(
   f"Saving id_df with {len(self.id_df)} rows using optimized method",
   )
@@ -1896,12 +1817,7 @@ def _load_study5(self, filename=None):

   with h5py.File(filename, "r") as f:
   # Use progress bar to show loading progress
- with tqdm(
- total=len(loading_steps),
- desc="Loading study",
- disable=tdqm_disable,
- unit="step"
- ) as pbar:
+ with tqdm(total=len(loading_steps), desc="Loading study", disable=tdqm_disable, unit="step") as pbar:
   # Load metadata
   pbar.set_description(
   f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
@@ -1963,24 +1879,14 @@ def _load_study5(self, filename=None):
   # Synchronize instance attributes with parameters (similar to __init__)
   # Note: folder and label are already loaded from metadata attributes above
   # but we ensure they match the parameters for consistency
- if (
- hasattr(self.parameters, "folder")
- and self.parameters.folder is not None
- ):
+ if hasattr(self.parameters, "folder") and self.parameters.folder is not None:
   self.folder = self.parameters.folder
- if (
- hasattr(self.parameters, "label")
- and self.parameters.label is not None
- ):
+ if hasattr(self.parameters, "label") and self.parameters.label is not None:
   self.label = self.parameters.label
   if hasattr(self.parameters, "log_level"):
   self.log_level = self.parameters.log_level
   if hasattr(self.parameters, "log_label"):
- self.log_label = (
- self.parameters.log_label
- if self.parameters.log_label is not None
- else ""
- )
+ self.log_label = self.parameters.log_label if self.parameters.log_label is not None else ""
   if hasattr(self.parameters, "log_sink"):
   self.log_sink = self.parameters.log_sink
   pbar.update(1)
@@ -2017,7 +1923,7 @@ def _load_study5(self, filename=None):
   self.logger,
   object_columns,
   )
-
+
   # Sanity check: replace any missing rt_original with rt values
   if self.features_df is not None and not self.features_df.is_empty():
   if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
@@ -2061,16 +1967,11 @@ def _load_study5(self, filename=None):

   # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
   if self.consensus_df is not None:
- if (
- "adducts" not in self.consensus_df.columns
- or self.consensus_df["adducts"].dtype == pl.Null
- ):
+ if "adducts" not in self.consensus_df.columns or self.consensus_df["adducts"].dtype == pl.Null:
   self.logger.info(
   "Adding missing 'adducts' column for backward compatibility",
   )
- empty_adducts: list[list] = [
- [] for _ in range(len(self.consensus_df))
- ]
+ empty_adducts: list[list] = [[] for _ in range(len(self.consensus_df))]

   # If column exists but is Null, drop it first
   if "adducts" in self.consensus_df.columns:
@@ -2163,11 +2064,7 @@ def _load_study5(self, filename=None):
   pbar.update(1)

   # Check and migrate old string-based map_id to integer indices
- if (
- self.samples_df is not None
- and not self.samples_df.is_empty()
- and self.samples_df["map_id"].dtype == pl.Utf8
- ):
+ if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
   self.logger.info(
   "Detected old string-based map_id format, migrating to integer indices",
   )
@@ -2191,26 +2088,26 @@ def _load_study5(self, filename=None):
   _sanitize_nulls(self)

   self.logger.debug("Study loaded")
-
+

   def _load_ms1(self, filename: str) -> pl.DataFrame:
   """
   Optimized method to load only MS1 data from a sample5 file for isotope detection.
-
+
   This method efficiently loads only the ms1_df from a sample5 HDF5 file without
   loading other potentially large datasets like features_df, scans_df, etc.
-
+
   Args:
   sample_path (str): Path to the sample5 HDF5 file
-
+
   Returns:
- pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
+ pl.DataFrame: MS1 data with columns [cycle, scan_uid, rt, mz, inty]
   Returns empty DataFrame if no MS1 data found or file cannot be read
-
+
   Note:
   Used by find_iso() for efficient isotope pattern detection without full sample loading
   """
- #try:
+ # try:
   # add .sample5 extension if not provided
   if not filename.endswith(".sample5"):
   filename += ".sample5"
@@ -2219,45 +2116,46 @@ def _load_ms1(self, filename: str) -> pl.DataFrame:
   if "ms1" not in f:
   self.logger.debug(f"No MS1 data found in {filename}")
   return pl.DataFrame()
-
+
   ms1_group = f["ms1"]
-
+
   # Load MS1 data efficiently
   ms1_data = {}
   for col in ms1_group.keys():
   ms1_data[col] = ms1_group[col][:]
-
+
   if not ms1_data:
   self.logger.debug(f"Empty MS1 data in {filename}")
   return pl.DataFrame()
-
+
   # Create DataFrame with proper schema
   ms1_df = pl.DataFrame(ms1_data)
-
+
   # Apply expected schema for MS1 data
   expected_schema = {
   "cycle": pl.Int64,
- "scan_uid": pl.Int64,
+ "scan_uid": pl.Int64,
   "rt": pl.Float64,
   "mz": pl.Float64,
- "inty": pl.Float64
+ "inty": pl.Float64,
   }
-
+
   # Cast columns to expected types if they exist
   cast_expressions = []
   for col, dtype in expected_schema.items():
   if col in ms1_df.columns:
   cast_expressions.append(pl.col(col).cast(dtype))
-
+
   if cast_expressions:
   ms1_df = ms1_df.with_columns(cast_expressions)
-
+
   self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
   return ms1_df
-
- #except Exception as e:
- # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
- # return pl.DataFrame()
+
+
+ # except Exception as e:
+ # self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+ # return pl.DataFrame()


   def _sanitize_nulls(self):
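`_load_ms1` reads only the `ms1` group from a sample5 file and casts the columns to the expected schema. A condensed, illustrative sketch of the same pattern as a standalone function (not masster's API):

```python
import h5py
import polars as pl

EXPECTED = {
    "cycle": pl.Int64,
    "scan_uid": pl.Int64,
    "rt": pl.Float64,
    "mz": pl.Float64,
    "inty": pl.Float64,
}


def load_ms1(path: str) -> pl.DataFrame:
    """Load only the 'ms1' group from an HDF5 file into a typed Polars DataFrame."""
    with h5py.File(path, "r") as f:
        if "ms1" not in f:
            return pl.DataFrame()
        data = {col: f["ms1"][col][:] for col in f["ms1"].keys()}
    if not data:
        return pl.DataFrame()
    df = pl.DataFrame(data)
    casts = [pl.col(c).cast(t) for c, t in EXPECTED.items() if c in df.columns]
    return df.with_columns(casts) if casts else df
```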
@@ -2269,14 +2167,14 @@ def _sanitize_nulls(self):
   import uuid
   import polars as pl
   import time
-
+
   # Sanitize features_df feature_id column
- if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+ if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
   # Check for null feature_ids
   null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
   if null_feature_ids > 0:
   self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
-
+
   # Find the maximum existing feature_id (convert strings to int if possible)
   max_existing_id = 0
   existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
@@ -2287,13 +2185,13 @@ def _sanitize_nulls(self):
   except (ValueError, TypeError):
   # Skip non-integer IDs
   pass
-
+
   # Generate new sequential integer IDs starting from max + timestamp offset
   # Use timestamp to ensure uniqueness across different sanitization runs
   base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
   new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
   uid_index = 0
-
+
   # Create a list to store all feature_ids
   feature_ids = []
   for feature_id in self.features_df["feature_id"].to_list():
@@ -2302,25 +2200,23 @@ def _sanitize_nulls(self):
   uid_index += 1
   else:
   feature_ids.append(feature_id)
-
+
   # Update the DataFrame with sanitized feature_ids
- self.features_df = self.features_df.with_columns(
- pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
- )
-
+ self.features_df = self.features_df.with_columns(pl.Series("feature_id", feature_ids, dtype=pl.Utf8))
+
   self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
-
+
   # Sanitize consensus_df consensus_id column
- if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+ if hasattr(self, "consensus_df") and self.consensus_df is not None and not self.consensus_df.is_empty():
   if "consensus_id" in self.consensus_df.columns:
   null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
   if null_consensus_ids > 0:
   self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
-
+
   # Generate new UIDs for null values using the same method as merge()
- new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+ new_uids = [str(uuid.uuid4()).replace("-", "")[:16] for _ in range(null_consensus_ids)]
   uid_index = 0
-
+
   # Create a list to store all consensus_ids
   consensus_ids = []
   for consensus_id in self.consensus_df["consensus_id"].to_list():
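The consensus branch of `_sanitize_nulls` backfills null IDs with fresh 16-character UIDs, generated exactly as the changed line shows. A small, hedged example of that backfill on a toy DataFrame:

```python
import uuid

import polars as pl

df = pl.DataFrame({"consensus_id": ["abc123", None, None]})

# One new 16-character UID per null entry; existing IDs are left untouched.
new_uids = iter(
    str(uuid.uuid4()).replace("-", "")[:16]
    for _ in range(df["consensus_id"].null_count())
)
ids = [cid if cid is not None else next(new_uids) for cid in df["consensus_id"].to_list()]
df = df.with_columns(pl.Series("consensus_id", ids, dtype=pl.Utf8))
print(df)
```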
@@ -2329,7 +2225,7 @@ def _sanitize_nulls(self):
   uid_index += 1
   else:
   consensus_ids.append(consensus_id)
-
+
   # Update the DataFrame with sanitized consensus_ids
   self.consensus_df = self.consensus_df.with_columns(
   pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
@@ -2338,7 +2234,7 @@ def _sanitize_nulls(self):
   self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")

   # Sanitize rt_original in features_df by replacing null or NaN values with rt values
- if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+ if hasattr(self, "features_df") and self.features_df is not None and not self.features_df.is_empty():
   if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
   # Check for null or NaN values in rt_original
   null_or_nan_rt_original = self.features_df.filter(
@@ -2352,4 +2248,4 @@ def _sanitize_nulls(self):
   .otherwise(pl.col("rt_original"))
   .alias("rt_original")
   )
- self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")
+ self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")