masster-0.3.17-py3-none-any.whl → masster-0.3.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster has been flagged as possibly problematic.

masster/_version.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations


-__version__ = "0.3.17"
+__version__ = "0.3.19"


 def get_version():
masster/sample/h5.py CHANGED
@@ -900,7 +900,7 @@ def _load_sample5(self, filename: str, map: bool = True):
 def _load_sample5_study(self, filename: str, map: bool = True):
     """
     Optimized variant of _load_sample5 for study loading that skips reading ms1_df.
-
+
     This is used when adding samples to studies where ms1_df data is not needed,
     improving loading throughput by skipping the potentially large ms1_df dataset.

masster/sample/helpers.py CHANGED
@@ -176,7 +176,7 @@ def _get_feature_uids(self, features=None, verbose=True):
     if not isinstance(features, pd.DataFrame):
         if verbose:
             self.logger.error(
-                "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame."
+                "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.",
             )
         return []

@@ -298,7 +298,7 @@ def get_eic(self, mz, mz_tol=None):
     """
     # Use default mz_tol from sample parameters if not provided
     if mz_tol is None:
-        if hasattr(self, 'parameters') and hasattr(self.parameters, 'eic_mz_tol'):
+        if hasattr(self, "parameters") and hasattr(self.parameters, "eic_mz_tol"):
            mz_tol = self.parameters.eic_mz_tol
        else:
            mz_tol = 0.01  # fallback default
@@ -323,11 +323,7 @@ def get_eic(self, mz, mz_tol=None):
         return None

     # Aggregate intensities per retention time. Use sum in case multiple points per rt.
-    chrom = (
-        matches.group_by("rt")
-        .agg([pl.col("inty").sum().alias("inty")])
-        .sort("rt")
-    )
+    chrom = matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")

     # Attach to Sample
     self.chrom_df = chrom
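
The collapsed one-liner is the usual Polars group-by idiom; behavior is unchanged. A runnable toy example, with a small frame standing in for `matches`, shows the semantics (sum per retention time, sorted):

    import polars as pl

    # Toy stand-in for the `matches` frame filtered by m/z in get_eic()
    matches = pl.DataFrame({
        "rt": [10.0, 10.0, 12.5, 13.0],
        "inty": [100.0, 50.0, 75.0, 20.0],
    })

    # Identical semantics to the multi-line form it replaces: group, sum, sort.
    chrom = matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")
    print(chrom)  # rt 10.0 -> 150.0, 12.5 -> 75.0, 13.0 -> 20.0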
masster/sample/load.py CHANGED
@@ -119,7 +119,7 @@ def load_study(
 ):
     """
     Optimized load method for study use that skips loading ms1_df for better performance.
-
+
     This method is identical to load() but uses _load_sample5_study() for .sample5 files,
     which skips reading the potentially large ms1_df dataset to improve throughput when
     adding samples to studies.
@@ -983,7 +983,7 @@ def index_file(self):
             self.set_source(self.file_source.replace(".sample5", ".mzml"))
         else:
             raise FileNotFoundError(
-                f"File {self.file_source} not found. Did the path change? Consider running source()."
+                f"File {self.file_source} not found. Did the path change? Consider running source().",
             )
         self.index_file()
     else:
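
Both `load_study()` and `_load_sample5_study()` exist to avoid reading the large `ms1_df` dataset when a sample is only being attached to a study. A minimal sketch of that "skip the expensive dataset" pattern with h5py; the layout (one HDF5 group per dataframe) and names are illustrative, not masster's actual .sample5 schema:

    import h5py

    def load_sample5(path: str, skip_ms1: bool = False) -> dict:
        """Load datasets from a .sample5-style HDF5 file, optionally skipping ms1_df."""
        out = {}
        with h5py.File(path, "r") as f:
            for name in f.keys():
                if skip_ms1 and name == "ms1_df":
                    continue  # the large dataset is never read from disk
                # assumes each top-level member is a group of column datasets
                out[name] = {col: f[name][col][()] for col in f[name].keys()}
        return out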
masster/sample/plot.py CHANGED
@@ -87,9 +87,10 @@ def _is_notebook_environment():
     # Check if marimo is in modules
     if "marimo" in sys.modules:
         return True
-
+
     # Check for marimo in the call stack or environment
     import inspect
+
     frame = inspect.currentframe()
     try:
         while frame:
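
For context, `_is_notebook_environment()` decides whether plots render inline. A simplified, hedged sketch of the detection idea this hunk touches: check `sys.modules` first, then walk the interpreter call stack; masster's real function checks more environments than this:

    import sys
    import inspect

    def _looks_like_marimo() -> bool:
        if "marimo" in sys.modules:
            return True
        frame = inspect.currentframe()
        try:
            while frame:
                # a caller module named marimo.* suggests a marimo notebook
                if "marimo" in frame.f_globals.get("__name__", ""):
                    return True
                frame = frame.f_back
        finally:
            del frame  # break the reference cycle currentframe() can create
        return False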
masster/study/export.py CHANGED
@@ -445,7 +445,7 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     mtd_lines.append("MTD\tsmall_molecule-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append("MTD\tsmall_molecule_feature-quantification_unit\t[MS, MS:1001844, MS1 feature area, ]")
     mtd_lines.append(
-        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]"
+        "MTD\tsmall_molecule-identification_reliability\t[MS, MS:1002955, hr-ms compound identification confidence level, ]",
     )
     mtd_lines.append("MTD\tid_confidence_measure[1]\t[MS, MS:1002888, small molecule confidence measure, ]")
     mtd_lines.append("")
@@ -499,8 +499,16 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
     # Use the matrix as-is since it already has the correct sample columns
     # The matrix columns are sample names, which is what we want for the assay columns

-    # round to int
-    abundance_matrix = abundance_matrix.round(0)
+    # round to int - handle both Polars and Pandas DataFrames
+    if hasattr(abundance_matrix, 'with_columns'):
+        # Polars DataFrame
+        numeric_cols = [col for col in abundance_matrix.columns if abundance_matrix[col].dtype.is_numeric()]
+        abundance_matrix = abundance_matrix.with_columns([
+            abundance_matrix[col].round(0) for col in numeric_cols
+        ])
+    else:
+        # Pandas DataFrame
+        abundance_matrix = abundance_matrix.round(0)

     # Use actual number of samples from the abundance matrix
     n_assays = len(abundance_matrix.columns)
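
The new branch dispatches on the Polars-only `with_columns` attribute so one code path handles either frame type. A self-contained sketch of that dual-path rounding:

    import polars as pl
    import pandas as pd

    def round_numeric(df):
        if hasattr(df, "with_columns"):  # Polars
            numeric = [c for c in df.columns if df[c].dtype.is_numeric()]
            return df.with_columns([df[c].round(0) for c in numeric])
        return df.round(0)  # pandas rounds numeric columns and leaves the rest

    print(round_numeric(pl.DataFrame({"a": [1.4, 2.6], "s": ["x", "y"]})))
    print(round_numeric(pd.DataFrame({"a": [1.4, 2.6], "s": ["x", "y"]})))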
@@ -570,9 +578,14 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
         ]
         # Add abundance values for each assay
         consensus_uid = row["consensus_uid"]
-        if consensus_uid in abundance_matrix.index:
-            abundance_values = abundance_matrix.loc[consensus_uid].tolist()
-            sml_row += [str(val) if pd.notna(val) else "null" for val in abundance_values]
+        # Check if consensus_uid exists in the abundance_matrix (Polars)
+        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        if filtered_matrix.height > 0:
+            # Get the first (and should be only) matching row
+            abundance_row = filtered_matrix.row(0, named=True)
+            # Extract values excluding the consensus_uid column
+            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+            sml_row += [str(val) if val is not None else "null" for val in abundance_values]
         else:
             sml_row += ["null"] * n_assays
         sml_row += ["null", "null"]
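
The pandas idiom (`uid in df.index` plus `.loc`) has no direct Polars equivalent, so the lookup becomes a filter on an explicit `consensus_uid` column. A toy sketch of the replacement, with illustrative column names:

    import polars as pl

    abundance_matrix = pl.DataFrame({
        "consensus_uid": [1, 2],
        "sample_a": [100.0, None],
        "sample_b": [250.0, 80.0],
    })

    uid = 1
    hit = abundance_matrix.filter(pl.col("consensus_uid") == uid)
    if hit.height > 0:
        row = hit.row(0, named=True)  # dict: column name -> value
        values = [row[c] for c in abundance_matrix.columns if c != "consensus_uid"]
        cells = [str(v) if v is not None else "null" for v in values]
    else:
        cells = ["null"] * (len(abundance_matrix.columns) - 1)
    print(cells)  # ['100.0', '250.0']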
@@ -615,11 +628,15 @@ def export_mztab(self, filename: str = None, include_mgf=True, **kwargs) -> None
             str(row.get("retention_time_in_seconds_start", "null")),
             str(row.get("retention_time_in_seconds_end", "null")),
         ]
-        # Add abundance values for each assay - same as SML
+        # Add abundance values for each assay - same as SML (Polars)
         consensus_uid = row["consensus_uid"]
-        if consensus_uid in abundance_matrix.index:
-            abundance_values = abundance_matrix.loc[consensus_uid].tolist()
-            smf_row += [str(val) if pd.notna(val) else "null" for val in abundance_values]
+        filtered_matrix = abundance_matrix.filter(pl.col("consensus_uid") == consensus_uid)
+        if filtered_matrix.height > 0:
+            # Get the first (and should be only) matching row
+            abundance_row = filtered_matrix.row(0, named=True)
+            # Extract values excluding the consensus_uid column
+            abundance_values = [abundance_row[col] for col in abundance_matrix.columns if col != "consensus_uid"]
+            smf_row += [str(val) if val is not None else "null" for val in abundance_values]
         else:
             smf_row += ["null"] * n_assays
         smf_lines.append("\t".join(smf_row))
masster/study/h5.py CHANGED
@@ -94,7 +94,7 @@ def _save_dataframe_optimized(df, group, schema, df_name, logger, chunk_size=100
             numeric_cols.append(col)

     logger.debug(
-        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns",
     )

     # Process numeric columns in batch (most efficient)
@@ -277,7 +277,7 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
             results[chunk_start] = chunk_result
         except Exception as e:
             logger.warning(
-                f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}"
+                f"Failed to serialize chunk starting at {chunk_start} for column '{col}': {e}",
             )
             # Fallback to simple string conversion for this chunk
             chunk = data_list[chunk_start : chunk_start + chunk_size]
@@ -435,7 +435,7 @@ def _save_dataframe_column_legacy(group, col: str, data, dtype: str, logger, com
             group.create_dataset(col, data=data_as_str, compression=compression)
         else:
             logger.warning(
-                f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column."
+                f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
             )
     elif dtype == "string":
         # Handle string columns
@@ -698,17 +698,17 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # Get available columns from HDF5 file
     hdf5_columns = list(group.keys())
     logger.debug(f"HDF5 columns available: {hdf5_columns}")
-
+
     # Handle column name migrations for backward compatibility first
     if df_name == "samples_df":
         # Migrate old column names to new names
         column_migrations = {
             "size": "num_features",
-            "file_source": "sample_source",
+            "file_source": "sample_source",
             "ms1": "num_ms1",
-            "ms2": "num_ms2"
+            "ms2": "num_ms2",
         }
-
+
     # Create a mapping of what's actually available after migrations
     effective_columns = hdf5_columns.copy()
     for old_name, new_name in column_migrations.items():
@@ -720,14 +720,14 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     # First pass: load all existing columns (including migrated ones)
     for col in schema_columns or []:
         source_col = col
-
+
         # Check if we need to load from a migrated column name
         if df_name == "samples_df":
             column_migrations = {
                 "size": "num_features",
-                "file_source": "sample_source",
+                "file_source": "sample_source",
                 "ms1": "num_ms1",
-                "ms2": "num_ms2"
+                "ms2": "num_ms2",
             }
             # Reverse lookup - find old name for new name
             reverse_migrations = {v: k for k, v in column_migrations.items()}
@@ -736,7 +736,7 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
             if old_name in group:
                 source_col = old_name
                 logger.info(f"Loading '{col}' from old column name '{old_name}'")
-
+
         if source_col not in group:
             missing_columns.append(col)
             continue
@@ -829,12 +829,12 @@ def _load_dataframe_from_group(group, schema: dict, df_name: str, logger, object
     if df_name == "samples_df":
         column_migrations = {
             "size": "num_features",
-            "file_source": "sample_source",
+            "file_source": "sample_source",
             "ms1": "num_ms1",
-            "ms2": "num_ms2"
+            "ms2": "num_ms2",
        }
         migrated_old_names = set(column_migrations.keys())
-
+
     extra_columns = [col for col in hdf5_columns if col not in (schema_columns or []) and col not in migrated_old_names]

     for col in extra_columns:
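
The same migration dict appears in three hunks. A compact sketch of the lookup it implements, resolving a schema column to whichever name is actually stored on disk; the helper name and shape are illustrative, not masster's API:

    COLUMN_MIGRATIONS = {
        "size": "num_features",
        "file_source": "sample_source",
        "ms1": "num_ms1",
        "ms2": "num_ms2",
    }
    REVERSE_MIGRATIONS = {new: old for old, new in COLUMN_MIGRATIONS.items()}

    def resolve_source_column(col: str, stored_columns: set) -> str | None:
        """Return the name to read `col` from, preferring the new name."""
        if col in stored_columns:
            return col
        old = REVERSE_MIGRATIONS.get(col)
        if old is not None and old in stored_columns:
            return old  # legacy file: read the old column, store under the new name
        return None  # genuinely missing

    print(resolve_source_column("num_ms1", {"ms1", "ms2", "size"}))  # -> "ms1"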
@@ -974,7 +974,7 @@ def _save_study5_compressed(self, filename=None):

         pbar.update(1)
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
         )

         # Store samples_df - use optimized batch processing
@@ -987,7 +987,7 @@ def _save_study5_compressed(self, filename=None):
         # Store features_df - use fast method that skips chrom and ms2_specs columns
         if self.features_df is not None and not self.features_df.is_empty():
             self.logger.debug(
-                f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)"
+                f"Fast saving features_df with {len(self.features_df)} rows (skipping chrom and ms2_specs)",
             )
             _save_dataframe_optimized_fast(self.features_df, features_group, schema, "features_df", self.logger)
         pbar.update(1)
@@ -1066,7 +1066,7 @@ def _save_dataframe_optimized_fast(df, group, schema, df_name, logger, chunk_siz
             numeric_cols.append(col)

     logger.debug(
-        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns"
+        f"Saving {df_name}: {total_rows} rows, {len(numeric_cols)} numeric, {len(string_cols)} string, {len(object_cols)} object columns",
     )

     # Process numeric columns in batch (most efficient)
@@ -1184,7 +1184,7 @@ def _save_study5(self, filename=None):

         pbar.update(1)
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {getattr(self, 'log_label', '')}Saving dataframes",
         )

         # Store samples_df - use optimized batch processing
@@ -1309,7 +1309,7 @@ def _load_study5(self, filename=None):
     ) as pbar:
         # Load metadata
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading metadata",
         )
         if "metadata" in f:
             metadata = f["metadata"]
@@ -1371,7 +1371,7 @@ def _load_study5(self, filename=None):

         # Load samples_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
         )
         if "samples" in f and len(f["samples"].keys()) > 0:
             self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
@@ -1411,7 +1411,7 @@ def _load_study5(self, filename=None):
         pbar.update(1)
         # Load samples_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading samples",
         )
         if "samples" in f and len(f["samples"].keys()) > 0:
             self.samples_df = _load_dataframe_from_group(f["samples"], schema, "samples_df", self.logger)
@@ -1452,12 +1452,16 @@ def _load_study5(self, filename=None):

         # Load features_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading features",
         )
         if "features" in f and len(f["features"].keys()) > 0:
             object_columns = ["chrom", "ms2_scans", "ms2_specs"]
             self.features_df = _load_dataframe_from_group(
-                f["features"], schema, "features_df", self.logger, object_columns
+                f["features"],
+                schema,
+                "features_df",
+                self.logger,
+                object_columns,
             )
         else:
             self.features_df = None
@@ -1465,7 +1469,7 @@ def _load_study5(self, filename=None):

         # Load consensus_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus",
         )
         if "consensus" in f and len(f["consensus"].keys()) > 0:
             # Only include adducts in object_columns if it actually exists in the file
@@ -1474,7 +1478,11 @@ def _load_study5(self, filename=None):
                 object_columns.append("adducts")

             self.consensus_df = _load_dataframe_from_group(
-                f["consensus"], schema, "consensus_df", self.logger, object_columns
+                f["consensus"],
+                schema,
+                "consensus_df",
+                self.logger,
+                object_columns,
             )

             # Backward compatibility: If adducts column doesn't exist, initialize with empty lists
@@ -1507,22 +1515,28 @@ def _load_study5(self, filename=None):

         # Load consensus_mapping_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
         )
         if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
             self.consensus_mapping_df = _load_dataframe_from_group(
-                f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+                f["consensus_mapping"],
+                schema,
+                "consensus_mapping_df",
+                self.logger,
             )
         else:
             self.consensus_mapping_df = None
         pbar.update(1)
         # Load consensus_mapping_df
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus mapping",
         )
         if "consensus_mapping" in f and len(f["consensus_mapping"].keys()) > 0:
             self.consensus_mapping_df = _load_dataframe_from_group(
-                f["consensus_mapping"], schema, "consensus_mapping_df", self.logger
+                f["consensus_mapping"],
+                schema,
+                "consensus_mapping_df",
+                self.logger,
             )
         else:
             self.consensus_mapping_df = None
@@ -1530,34 +1544,38 @@ def _load_study5(self, filename=None):

         # Load consensus_ms2
         pbar.set_description(
-            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2"
+            f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Loading consensus MS2",
         )
         if "consensus_ms2" in f and len(f["consensus_ms2"].keys()) > 0:
             object_columns = ["spec"]
             self.consensus_ms2 = _load_dataframe_from_group(
-                f["consensus_ms2"], schema, "consensus_ms2", self.logger, object_columns
+                f["consensus_ms2"],
+                schema,
+                "consensus_ms2",
+                self.logger,
+                object_columns,
             )
         else:
             self.consensus_ms2 = None
         pbar.update(1)

         # Check and migrate old string-based map_id to integer indices
-        if (self.samples_df is not None and
-                not self.samples_df.is_empty() and
-                self.samples_df['map_id'].dtype == pl.Utf8):
+        if self.samples_df is not None and not self.samples_df.is_empty() and self.samples_df["map_id"].dtype == pl.Utf8:
             self.logger.info("Detected old string-based map_id format, migrating to integer indices")
-
+
             # Convert string-based map_id to integer indices
             sample_count = len(self.samples_df)
             new_map_ids = list(range(sample_count))
-
+
             self.samples_df = self.samples_df.with_columns(
-                pl.lit(new_map_ids).alias("map_id")
+                pl.lit(new_map_ids).alias("map_id"),
             )
-
+
             # Ensure the column is Int64 type
             self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-
-            self.logger.info(f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})")
+
+            self.logger.info(
+                f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
            )

         self.logger.debug("Study loaded")
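
A runnable sketch of the map_id migration at the end of this hunk: a legacy string column is replaced by positional integer indices and cast to Int64. Toy frame; `pl.Series` stands in here for the `pl.lit(...).alias(...)` used in the actual code:

    import polars as pl

    samples_df = pl.DataFrame({"sample": ["a", "b", "c"], "map_id": ["m0", "m1", "m2"]})

    if samples_df["map_id"].dtype == pl.Utf8:
        new_map_ids = list(range(len(samples_df)))
        samples_df = samples_df.with_columns(
            pl.Series("map_id", new_map_ids),
        ).cast({"map_id": pl.Int64})

    print(samples_df)  # map_id is now 0, 1, 2 as Int64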