masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of masster might be problematic.

masster/study/h5.py CHANGED
@@ -304,6 +304,30 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(item.to_json())
                 else:
                     serialized_chunk.append("None")
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
         else:
             logger.warning(
                 f"Unknown object column '{col_name}', using default serialization",
@@ -564,6 +588,34 @@ def _save_dataframe_column_legacy(
             else:
                 data_as_str.append("None")
         group.create_dataset(col, data=data_as_str, compression=compression)
+    elif col == "iso":
+        # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
+    elif col == "ms1_spec":
+        # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
     else:
         logger.warning(
             f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
@@ -666,6 +718,24 @@ def _reconstruct_object_column(data_col, col_name: str):
                     },
                 )
                 reconstructed_data.append(converted_adducts)
+            elif col_name == "iso":
+                # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+                try:
+                    import numpy as np
+                    iso_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(iso_data) if iso_data else None)
+                except (json.JSONDecodeError, ValueError, ImportError):
+                    reconstructed_data.append(None)
+            elif col_name == "ms1_spec":
+                # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+                try:
+                    import numpy as np
+                    ms1_spec_data = json.loads(item)
+                    # Convert back to numpy array
+                    reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
+                except (json.JSONDecodeError, ValueError, ImportError):
+                    reconstructed_data.append(None)
             else:
                 # Unknown object column
                 reconstructed_data.append(None)
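This hunk is the read-side inverse of the serialization above: the stored JSON string is parsed and rebuilt as a numpy array, and anything unparseable (including the "None" sentinel, which is not valid JSON) collapses to None. A hedged round-trip sketch (json_to_iso is an illustrative name, not a masster function):

import json
import numpy as np

def json_to_iso(s):
    # JSON string -> numpy array; None for the "None" sentinel or bad data
    try:
        data = json.loads(s)
        return np.array(data) if data else None
    except (json.JSONDecodeError, ValueError):
        return None

restored = json_to_iso('[[200.05, 100000.0], [201.05, 32000.0]]')
print(restored.shape)      # (2, 2): rows of [mz, intensity]
print(json_to_iso("None")) # None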
@@ -974,7 +1044,7 @@ def _load_dataframe_from_group(
 
     # Second pass: handle missing columns
     for col in missing_columns:
-        logger.info(f"Column '{col}' not found in {df_name}.")
+        logger.debug(f"Column '{col}' not found in {df_name}.")
         # For missing columns, create appropriately sized array with appropriate defaults
         if col in object_columns:
             data[col] = [None] * expected_length
@@ -1857,6 +1927,26 @@ def _load_study5(self, filename=None):
                     self.logger,
                     object_columns,
                 )
+
+                # Sanity check: replace any missing rt_original with rt values
+                if self.features_df is not None and not self.features_df.is_empty():
+                    if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+                        null_rt_original_count = self.features_df.filter(pl.col("rt_original").is_null()).height
+                        if null_rt_original_count > 0:
+                            self.logger.info(f"Replacing {null_rt_original_count} missing rt_original values with rt")
+                            self.features_df = self.features_df.with_columns(
+                                pl.when(pl.col("rt_original").is_null())
+                                .then(pl.col("rt"))
+                                .otherwise(pl.col("rt_original"))
+                                .alias("rt_original")
+                            )
+                        else:
+                            self.logger.debug("All rt_original values are present")
+                    else:
+                        if "rt_original" not in self.features_df.columns:
+                            self.logger.debug("rt_original column not found in features_df")
+                        if "rt" not in self.features_df.columns:
+                            self.logger.debug("rt column not found in features_df")
             else:
                 self.features_df = _create_empty_dataframe_from_schema("features_df", schema)
             pbar.update(1)
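The backfill added here is the standard polars conditional-replace idiom. A self-contained sketch of the same pattern on toy data (column names match the diff; the DataFrame content is invented for illustration):

import polars as pl

features_df = pl.DataFrame({
    "rt": [10.2, 11.5, 12.9],
    "rt_original": [10.0, None, 12.7],
})

# where rt_original is null, fall back to rt; otherwise keep the original value
features_df = features_df.with_columns(
    pl.when(pl.col("rt_original").is_null())
    .then(pl.col("rt"))
    .otherwise(pl.col("rt_original"))
    .alias("rt_original")
)
print(features_df["rt_original"].to_list())  # [10.0, 11.5, 12.7]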
@@ -2008,12 +2098,12 @@ def _load_study5(self, filename=None):
         )
 
         # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
-        self._sanitize_null_ids()
+        _sanitize_nulls(self)
 
         self.logger.debug("Study loaded")
+
 
-
-    def _load_ms1(self, sample_path: str) -> pl.DataFrame:
+    def _load_ms1(self, filename: str) -> pl.DataFrame:
         """
         Optimized method to load only MS1 data from a sample5 file for isotope detection.
 
@@ -2030,48 +2120,146 @@ def _load_ms1(self, sample_path: str) -> pl.DataFrame:
         Note:
             Used by find_iso() for efficient isotope pattern detection without full sample loading
         """
-        try:
-            with h5py.File(sample_path, "r") as f:
-                # Check if ms1 group exists
-                if "ms1" not in f:
-                    self.logger.debug(f"No MS1 data found in {sample_path}")
-                    return pl.DataFrame()
-
-                ms1_group = f["ms1"]
+        #try:
+        # add .sample5 extension if not provided
+        if not filename.endswith(".sample5"):
+            filename += ".sample5"
+        with h5py.File(filename, "r") as f:
+            # Check if ms1 group exists
+            if "ms1" not in f:
+                self.logger.debug(f"No MS1 data found in {filename}")
+                return pl.DataFrame()
+
+            ms1_group = f["ms1"]
+
+            # Load MS1 data efficiently
+            ms1_data = {}
+            for col in ms1_group.keys():
+                ms1_data[col] = ms1_group[col][:]
+
+            if not ms1_data:
+                self.logger.debug(f"Empty MS1 data in {filename}")
+                return pl.DataFrame()
 
-                # Load MS1 data efficiently
-                ms1_data = {}
-                for col in ms1_group.keys():
-                    ms1_data[col] = ms1_group[col][:]
+            # Create DataFrame with proper schema
+            ms1_df = pl.DataFrame(ms1_data)
+
+            # Apply expected schema for MS1 data
+            expected_schema = {
+                "cycle": pl.Int64,
+                "scan_uid": pl.Int64,
+                "rt": pl.Float64,
+                "mz": pl.Float64,
+                "inty": pl.Float64
+            }
+
+            # Cast columns to expected types if they exist
+            cast_expressions = []
+            for col, dtype in expected_schema.items():
+                if col in ms1_df.columns:
+                    cast_expressions.append(pl.col(col).cast(dtype))
+
+            if cast_expressions:
+                ms1_df = ms1_df.with_columns(cast_expressions)
+
+            self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {filename}")
+            return ms1_df
 
-                if not ms1_data:
-                    self.logger.debug(f"Empty MS1 data in {sample_path}")
-                    return pl.DataFrame()
-
-                # Create DataFrame with proper schema
-                ms1_df = pl.DataFrame(ms1_data)
+        #except Exception as e:
+        #    self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+        #    return pl.DataFrame()
+
+
+    def _sanitize_nulls(self):
+        """
+        Sanitize null feature_id and consensus_id values by replacing them with new integer IDs.
+        For feature_id: generates large sequential integers that can be converted by merge/align functions.
+        For consensus_id: uses 16-character UUID strings (as expected by merge function).
+        """
+        import uuid
+        import polars as pl
+        import time
+
+        # Sanitize features_df feature_id column
+        if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+            # Check for null feature_ids
+            null_feature_ids = self.features_df.filter(pl.col("feature_id").is_null()).shape[0]
+            if null_feature_ids > 0:
+                self.logger.debug(f"Sanitizing {null_feature_ids} null feature_id values with new integer IDs")
 
-                # Apply expected schema for MS1 data
-                expected_schema = {
-                    "cycle": pl.Int64,
-                    "scan_uid": pl.Int64,
-                    "rt": pl.Float64,
-                    "mz": pl.Float64,
-                    "inty": pl.Float64
-                }
+                # Find the maximum existing feature_id (convert strings to int if possible)
+                max_existing_id = 0
+                existing_ids = self.features_df.filter(pl.col("feature_id").is_not_null())["feature_id"].to_list()
+                for fid in existing_ids:
+                    try:
+                        int_id = int(fid)
+                        max_existing_id = max(max_existing_id, int_id)
+                    except (ValueError, TypeError):
+                        # Skip non-integer IDs
+                        pass
 
-                # Cast columns to expected types if they exist
-                cast_expressions = []
-                for col, dtype in expected_schema.items():
-                    if col in ms1_df.columns:
-                        cast_expressions.append(pl.col(col).cast(dtype))
+                # Generate new sequential integer IDs starting from max + timestamp offset
+                # Use timestamp to ensure uniqueness across different sanitization runs
+                base_id = max(max_existing_id + 1, int(time.time() * 1000000)) # Microsecond timestamp
+                new_int_ids = [str(base_id + i) for i in range(null_feature_ids)]
+                uid_index = 0
 
-                if cast_expressions:
-                    ms1_df = ms1_df.with_columns(cast_expressions)
+                # Create a list to store all feature_ids
+                feature_ids = []
+                for feature_id in self.features_df["feature_id"].to_list():
+                    if feature_id is None:
+                        feature_ids.append(new_int_ids[uid_index])
+                        uid_index += 1
+                    else:
+                        feature_ids.append(feature_id)
 
-                self.logger.debug(f"Loaded {len(ms1_df)} MS1 peaks from {sample_path}")
-                return ms1_df
+                # Update the DataFrame with sanitized feature_ids
+                self.features_df = self.features_df.with_columns(
+                    pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
+                )
 
-        except Exception as e:
-            self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
-            return pl.DataFrame()
+                self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
+
+        # Sanitize consensus_df consensus_id column
+        if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
+            if "consensus_id" in self.consensus_df.columns:
+                null_consensus_ids = self.consensus_df.filter(pl.col("consensus_id").is_null()).shape[0]
+                if null_consensus_ids > 0:
+                    self.logger.debug(f"Sanitizing {null_consensus_ids} null consensus_id values with new UIDs")
+
+                    # Generate new UIDs for null values using the same method as merge()
+                    new_uids = [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(null_consensus_ids)]
+                    uid_index = 0
+
+                    # Create a list to store all consensus_ids
+                    consensus_ids = []
+                    for consensus_id in self.consensus_df["consensus_id"].to_list():
+                        if consensus_id is None:
+                            consensus_ids.append(new_uids[uid_index])
+                            uid_index += 1
+                        else:
+                            consensus_ids.append(consensus_id)
+
+                    # Update the DataFrame with sanitized consensus_ids
+                    self.consensus_df = self.consensus_df.with_columns(
+                        pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
+                    )
+
+                    self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+
+        # Sanitize rt_original in features_df by replacing null or NaN values with rt values
+        if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
+            if "rt_original" in self.features_df.columns and "rt" in self.features_df.columns:
+                # Check for null or NaN values in rt_original
+                null_or_nan_rt_original = self.features_df.filter(
+                    pl.col("rt_original").is_null() | pl.col("rt_original").is_nan()
+                ).shape[0]
+                if null_or_nan_rt_original > 0:
+                    self.logger.debug(f"Sanitizing {null_or_nan_rt_original} null or NaN rt_original values with rt values")
+                    self.features_df = self.features_df.with_columns(
+                        pl.when(pl.col("rt_original").is_null() | pl.col("rt_original").is_nan())
+                        .then(pl.col("rt"))
+                        .otherwise(pl.col("rt_original"))
+                        .alias("rt_original")
+                    )
+                    self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")