PyPI - masster - Versions diffs - 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl - Mend

masster 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (22)

masster/_version.py +1 -1
masster/logger.py +58 -43
masster/sample/adducts.py +2 -2
masster/sample/h5.py +1 -1
masster/sample/helpers.py +47 -15
masster/sample/plot.py +706 -578
masster/sample/processing.py +4 -4
masster/sample/sample.py +91 -48
masster/sample/save.py +5 -5
masster/study/h5.py +32 -14
masster/study/helpers.py +27 -8
masster/study/id.py +3 -3
masster/study/load.py +1 -164
masster/study/merge.py +6 -12
masster/study/plot.py +105 -35
masster/study/processing.py +7 -7
masster/study/study5_schema.json +3 -0
{masster-0.5.8.dist-info → masster-0.5.10.dist-info}/METADATA +3 -1
{masster-0.5.8.dist-info → masster-0.5.10.dist-info}/RECORD +22 -22
{masster-0.5.8.dist-info → masster-0.5.10.dist-info}/WHEEL +0 -0
{masster-0.5.8.dist-info → masster-0.5.10.dist-info}/entry_points.txt +0 -0
{masster-0.5.8.dist-info → masster-0.5.10.dist-info}/licenses/LICENSE +0 -0

masster/sample/processing.py CHANGED Viewed

@@ -796,7 +796,7 @@ def find_features(self, **kwargs):
     self.features_df = df
     #self._features_sync()
-    self.logger.info(f"Feature detection completed. Total features: {len(df)}")
+    self.logger.success(f"Feature detection completed. Total features: {len(df)}")
     # store params
     self.update_history(["find_features"], params.to_dict())
@@ -1263,8 +1263,8 @@ def find_ms2(self, **kwargs):
         )
     # Log completion
-    self.logger.info(
-        f"MS2 linking completed. Total features with MS2 data: {c}",
+    self.logger.success(
+        f"MS2 linking completed. Features with MS2 data: {c}.",
     )
     self.features_df = features_df
@@ -1425,7 +1425,7 @@ def find_iso(self, rt_tolerance: float = 0.1, **kwargs):
     # Log results
     non_null_count = len([spec for spec in ms1_specs if spec is not None])
-    self.logger.info(f"Extracted isotopic distributions for {non_null_count}/{len(ms1_specs)} features.")
+    self.logger.success(f"Extracted isotopic distributions for {non_null_count}/{len(ms1_specs)} features.")
     # Store parameters in history
     params_dict = {"rt_tolerance": rt_tolerance}

masster/sample/sample.py CHANGED Viewed

@@ -1,35 +1,98 @@
 """
-sample.py
+sample.py - Mass Spectrometry Sample Analysis Module
-This module provides tools for processing and analyzing Data-Dependent Acquisition (DDA) mass spectrometry data.
-It defines the `Sample` class, which offers methods to load, process, analyze, and visualize mass spectrometry data
-from various file formats, including mzML, Thermo RAW, and Sciex WIFF formats.
+This module provides comprehensive tools for processing and analyzing Data-Dependent Acquisition (DDA)
+mass spectrometry data. It defines the `Sample` class, which offers methods to load, process, analyze,
+and visualize mass spectrometry data from various file formats.
+Supported File Formats:
+    - mzML (standard XML format for mass spectrometry data)
+    - Thermo RAW (native Thermo Fisher Scientific format)
+    - Sciex WIFF (native Sciex format)
+    - Sample5 (MASSter's native HDF5-based format for optimized storage)
 Key Features:
-- **File Handling**: Load and save data in multiple formats.
-- **Feature Detection**: Detect and process mass spectrometry features.
-- **Spectrum Analysis**: Retrieve and analyze MS1/MS2 spectra.
-- **Visualization**: Generate interactive and static plots for spectra and chromatograms.
-- **Statistics**: Compute and export detailed DDA run statistics.
-Dependencies:
-- `pyopenms`: For file handling and feature detection.
-- `polars` and `pandas`: For data manipulation.
-- `numpy`: For numerical computations.
-- `bokeh`, `panel`, `holoviews`, `datashader`: For interactive visualizations.
+    - **File Handling**: Load and save data in multiple formats with automatic format detection
+    - **Feature Detection**: Detect and process mass spectrometry features using advanced algorithms
+    - **Spectrum Analysis**: Retrieve and analyze MS1/MS2 spectra with comprehensive metadata
+    - **Adduct Detection**: Find and annotate adducts and in-source fragments
+    - **Isotope Analysis**: Detect and process isotopic patterns
+    - **Chromatogram Extraction**: Extract and analyze chromatograms (EIC, BPC, TIC)
+    - **Visualization**: Generate interactive and static plots for spectra, chromatograms, and 2D maps
+    - **Statistics**: Compute and export detailed DDA run statistics and quality metrics
+    - **Data Export**: Export processed data to various formats (XLSX, MGF, etc.)
+    - **Memory Management**: Efficient handling of large datasets with on-disk storage options
+Core Dependencies:
+    - `pyopenms`: OpenMS library for file handling and feature detection algorithms
+    - `polars`: High-performance data manipulation and analysis
+    - `numpy`: Numerical computations and array operations
+    - `bokeh`, `panel`, `holoviews`, `datashader`: Interactive visualizations and dashboards
+    - `h5py`: HDF5 file format support for Sample5 files
 Classes:
-- `Sample`: Main class for handling DDA data, providing methods for data import, processing, and visualization.
-Example Usage:
-```python
-from masster.sample import Sample
+    Sample: Main class for handling DDA mass spectrometry data, providing methods for
+            data import, processing, analysis, and visualization.
-sample = Sample(file="example.mzML")
-sample.find_features()
-sample.plot_2d()
-```
+Typical Workflow:
+    1. Load mass spectrometry data file
+    2. Detect features using find_features()
+    3. Optionally find MS2 spectra with find_ms2()
+    4. Analyze and visualize results
+    5. Export processed data
+Example Usage:
+    Basic analysis workflow:
+    ```python
+    from masster.sample import Sample
+    # Load a mass spectrometry file
+    sample = Sample(filename="experiment.mzML")
+    # Detect features
+    sample.find_features()
+    # Find MS2 spectra for features
+    sample.find_ms2()
+    # Generate 2D visualization
+    sample.plot_2d()
+    # Export results
+    sample.export_features("features.xlsx")
+    ```
+    Advanced usage with custom parameters:
+    ```python
+    from masster.sample import Sample
+    from masster.sample.defaults import sample_defaults, find_features_defaults
+    # Create custom parameters
+    params = sample_defaults(log_level="DEBUG", label="My Experiment")
+    ff_params = find_features_defaults(noise_threshold_int=1000)
+    # Initialize with custom parameters
+    sample = Sample(params=params)
+    sample.load("data.raw")
+    # Feature detection with custom parameters
+    sample.find_features(params=ff_params)
+    # Generate comprehensive statistics
+    stats = sample.get_dda_stats()
+    sample.plot_dda_stats()
+    ```
+Notes:
+    - The Sample class maintains processing history and parameters for reproducibility
+    - Large files can be processed with on-disk storage to manage memory usage
+    - All visualizations are interactive by default and can be exported as static images
+    - The module supports both individual sample analysis and batch processing workflows
+Version: Part of the MASSter mass spectrometry analysis framework
+Author: Zamboni Lab, ETH Zurich
 """
 import importlib
@@ -49,16 +112,12 @@ from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 # Sample-specific imports - keeping these private, only for internal use
 from masster.sample.h5 import _load_sample5
-# from masster.sample.h5 import _load_sample5_study
 from masster.sample.h5 import _save_sample5
-# from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
 from masster.sample.helpers import _get_scan_uids
 from masster.sample.helpers import _get_feature_uids
-# from masster.sample.helpers import _features_sync - made internal only
 from masster.sample.adducts import find_adducts
 from masster.sample.adducts import _get_adducts
-# Removed _get_adducts - only used in study modules
 from masster.sample.helpers import features_delete
 from masster.sample.helpers import features_filter
 from masster.sample.helpers import features_select
@@ -70,23 +129,17 @@ from masster.sample.helpers import get_eic
 from masster.sample.helpers import set_source
 from masster.sample.helpers import _recreate_feature_map
 from masster.sample.helpers import _get_feature_map
-# Load functions - keeping only specific ones needed for external API
-# from masster.sample.load import _load_featureXML - made internal only
-# from masster.sample.load import _load_ms2data - made internal only
-# from masster.sample.load import _load_mzML - made internal only
-# from masster.sample.load import _load_raw - made internal only
-# from masster.sample.load import _load_wiff - made internal only
 from masster.sample.load import chrom_extract
 from masster.sample.load import _index_file
 from masster.sample.load import load
 from masster.sample.load import load_noms1
-from masster.sample.load import _load_ms1  # Renamed from load_study
+from masster.sample.load import _load_ms1
 from masster.sample.load import sanitize
 from masster.sample.plot import plot_2d
 from masster.sample.plot import plot_2d_oracle
 from masster.sample.plot import plot_dda_stats
 from masster.sample.plot import plot_chrom
-from masster.sample.plot import plot_features_stats  # Renamed from plot_feature_stats
+from masster.sample.plot import plot_features_stats
 from masster.sample.plot import plot_ms2_cycle
 from masster.sample.plot import plot_ms2_eic
 from masster.sample.plot import plot_ms2_q1
@@ -113,7 +166,6 @@ from masster.sample.save import export_features
 from masster.sample.save import export_mgf
 from masster.sample.save import export_xlsx
 from masster.sample.save import save
-# Removed internal-only import: _save_featureXML
 class Sample:
@@ -402,6 +454,7 @@ class Sample:
             f"{base_modname}.chromatogram",
             f"{base_modname}.spectrum",
             f"{base_modname}.logger",
+            f"{base_modname}.lib",
         ]
         # Add study submodules
@@ -414,17 +467,9 @@ class Sample:
             ):
                 study_modules.append(module_name)
-        """ # Add parameters submodules
-        parameters_modules = []
-        parameters_module_prefix = f"{base_modname}.parameters."
-        for module_name in sys.modules:
-            if module_name.startswith(parameters_module_prefix) and module_name != current_module:
-                parameters_modules.append(module_name)
-        """
         all_modules_to_reload = (
             core_modules + sample_modules + study_modules
-        )  # + parameters_modules
+        )
         # Reload all discovered modules
         for full_module_name in all_modules_to_reload:
@@ -466,8 +511,6 @@ class Sample:
         else:
             str += "Features: 0\n"
             str += "Features with MS2 spectra: 0\n"
-        # estimate memory usage
         mem_usage = self._estimate_memory_usage()
         str += f"Estimated memory usage: {mem_usage:.2f} MB\n"

masster/sample/save.py CHANGED Viewed

@@ -148,10 +148,10 @@ def export_features(self, filename="features.csv"):
     )
     if filename.lower().endswith((".xls", ".xlsx")):
         clean_df.to_pandas().to_excel(filename, index=False)
-        self.logger.info(f"Features exported to {filename} (Excel format)")
+        self.logger.success(f"Features exported to {filename} (Excel format)")
     else:
         clean_df.write_csv(filename)
-        self.logger.info(f"Features exported to {filename}")
+        self.logger.success(f"Features exported to {filename}")
 def export_mgf(
@@ -649,7 +649,7 @@ def export_mgf(
                             elif result == "empty_ms2":
                                 empty_ms2_count += 1
-    self.logger.info(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
+    self.logger.success(f"Exported {ms1_spec_used_count} MS1 spectra and {c} MS2 spectra to {filename}")
     if empty_ms2_count > 0:
         self.logger.info(f"Skipped {empty_ms2_count} empty MS2 spectra")
     if ms1_fallback_count > 0:
@@ -824,7 +824,7 @@ def export_dda_stats(self, filename="stats.csv"):
         for line in lines:
             f.write(line + "\n")
-    self.logger.info(f"DDA statistics exported to {filename}")
+    self.logger.success(f"DDA statistics exported to {filename}")
 def export_xlsx(self, filename="features.xlsx"):
@@ -877,7 +877,7 @@ def export_xlsx(self, filename="features.xlsx"):
     pandas_df = clean_df.to_pandas()
     pandas_df.to_excel(filename, index=False)
-    self.logger.info(f"Features exported to {filename} (Excel format)")
+    self.logger.success(f"Features exported to {filename} (Excel format)")
     self.logger.debug(f"Exported {len(clean_df)} features with {len(exportable_columns)} columns")

masster/study/h5.py CHANGED Viewed

@@ -818,6 +818,19 @@ def _reorder_columns_by_schema(
 def _create_dataframe_with_objects(data: dict, object_columns: list) -> pl.DataFrame:
     """Create DataFrame handling Object columns properly."""
+    # First check all data for numpy object arrays and move them to object columns
+    additional_object_cols = []
+    for k, v in data.items():
+        if k not in object_columns and hasattr(v, 'dtype') and str(v.dtype) == 'object':
+            # This is a numpy object array that should be treated as object
+            additional_object_cols.append(k)
+            object_columns.append(k)
+    if additional_object_cols:
+        # Re-run reconstruction for these columns
+        for col in additional_object_cols:
+            data[col] = _reconstruct_object_column(data[col], col)
     object_data = {k: v for k, v in data.items() if k in object_columns}
     regular_data = {k: v for k, v in data.items() if k not in object_columns}
@@ -1103,11 +1116,18 @@ def _load_dataframe_from_group(
         logger.info(f"Loading extra column '{col}' not in schema for {df_name}")
         column_data = group[col][:]
-        # Try to determine if this should be treated as an object column
-        # by checking if the data looks like JSON strings
-        if len(column_data) > 0 and isinstance(column_data[0], bytes):
+        # Check if this is a known object column by name
+        known_object_columns = {"ms1_spec", "chrom", "ms2_scans", "ms2_specs", "spec", "adducts", "iso"}
+        is_known_object = col in known_object_columns
+        if is_known_object:
+            # Known object column, always reconstruct
+            data[col] = _reconstruct_object_column(column_data, col)
+            if col not in object_columns:
+                object_columns.append(col)
+        elif len(column_data) > 0 and isinstance(column_data[0], bytes):
             try:
-                # Check if it looks like JSON
+                # Check if it looks like JSON for unknown columns
                 test_decode = column_data[0].decode("utf-8")
                 if test_decode.startswith("[") or test_decode.startswith("{"):
                     # Looks like JSON, treat as object column
@@ -1738,9 +1758,7 @@ def _save_study5(self, filename):
                 )
                 pbar.update(1)
-    self.logger.info(f"Study saved successfully to {filename}")
-    self.logger.debug(f"Save completed for {filename}")
-    self.logger.debug(f"Save completed for {filename}")
+    self.logger.success(f"Study saved to {filename}")
 def _load_study5(self, filename=None):
@@ -1859,7 +1877,7 @@ def _load_study5(self, filename=None):
                             )
                         else:
                             self.logger.debug(
-                                "Successfully updated parameters from loaded history",
+                                "Updated parameters from loaded history",
                             )
                     else:
                         self.logger.debug(
@@ -2093,8 +2111,8 @@ def _load_study5(self, filename=None):
         # Ensure the column is Int64 type
         self.samples_df = self.samples_df.cast({"map_id": pl.Int64})
-        self.logger.info(
-            f"Successfully migrated {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
+        self.logger.debug(
+            f"Sanitized {sample_count} samples to indexed map_id format (0 to {sample_count - 1})",
         )
     # Sanitize null feature_id and consensus_id values with new UIDs (same method as merge)
@@ -2218,7 +2236,7 @@ def _sanitize_nulls(self):
                 pl.Series("feature_id", feature_ids, dtype=pl.Utf8)
             )
-            self.logger.debug(f"Successfully sanitized {null_feature_ids} feature_id values")
+            self.logger.debug(f"Sanitized {null_feature_ids} feature_id values")
     # Sanitize consensus_df consensus_id column
     if hasattr(self, 'consensus_df') and self.consensus_df is not None and not self.consensus_df.is_empty():
@@ -2244,8 +2262,8 @@ def _sanitize_nulls(self):
                 self.consensus_df = self.consensus_df.with_columns(
                     pl.Series("consensus_id", consensus_ids, dtype=pl.Utf8)
                 )
-                self.logger.debug(f"Successfully sanitized {null_consensus_ids} consensus_id values")
+                self.logger.debug(f"Sanitized {null_consensus_ids} consensus_id values")
     # Sanitize rt_original in features_df by replacing null or NaN values with rt values
     if hasattr(self, 'features_df') and self.features_df is not None and not self.features_df.is_empty():
@@ -2262,4 +2280,4 @@ def _sanitize_nulls(self):
                     .otherwise(pl.col("rt_original"))
                     .alias("rt_original")
                 )
-                self.logger.debug(f"Successfully sanitized {null_or_nan_rt_original} rt_original values")
+                self.logger.debug(f"Sanitized {null_or_nan_rt_original} rt_original values")

masster/study/helpers.py CHANGED Viewed

@@ -1440,7 +1440,7 @@ def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
         self.compress_ms2(max_replicates=ms2_max)
     if chrom:
         self.compress_chrom()
-    self.logger.info("Compression completed")
+    self.logger.success("Compression completed")
 def compress_features(self):
@@ -1630,7 +1630,7 @@ def restore_features(self, samples=None, maps=False):
             self.logger.error(f"Failed to load sample {sample_name}: {e}")
             continue
-    self.logger.info(
+    self.logger.success(
         f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples",
     )
@@ -1886,7 +1886,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             self.logger.error(f"Failed to gap-fill sample {sample_name}: {e}")
             continue
-    self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
+    self.logger.success(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
     # Final summary
     final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
@@ -2051,7 +2051,7 @@ def sample_name_replace(self, replace_dict):
         pl.Series("sample_name", new_names).alias("sample_name"),
     )
-    self.logger.info(f"Successfully replaced {replaced_count} sample names")
+    self.logger.success(f"Successfully replaced {replaced_count} sample names")
 def sample_name_reset(self):
@@ -2940,6 +2940,7 @@ def features_delete(self, features):
 def consensus_select(
     self,
+    uid=None,
     mz=None,
     rt=None,
     inty_mean=None,
@@ -2956,14 +2957,12 @@ def consensus_select(
     rt_delta_mean=None,
     id_top_score=None,
     identified=None,
-    # New adduct filter parameters
     adduct_top=None,
     adduct_charge_top=None,
     adduct_mass_neutral_top=None,
     adduct_mass_shift_top=None,
     adduct_group=None,
     adduct_of=None,
-    # New identification filter parameters
     id_top_name=None,
     id_top_class=None,
     id_top_adduct=None,
@@ -2976,6 +2975,11 @@ def consensus_select(
     OPTIMIZED VERSION: Enhanced performance with lazy evaluation, vectorized operations, and efficient filtering.
     Parameters:
+        uid: consensus UID filter with flexible formats:
+            - None: include all consensus features (default)
+            - int: single specific consensus_uid
+            - tuple: range of consensus_uids (consensus_uid_min, consensus_uid_max)
+            - list: specific list of consensus_uid values
         mz: m/z filter with flexible formats:
             - float: m/z value ± default tolerance (uses study.parameters.eic_mz_tol)
             - tuple (mz_min, mz_max): range where mz_max > mz_min
@@ -3023,7 +3027,7 @@ def consensus_select(
         return pl.DataFrame()
     # Early return optimization - check if any filters are provided
-    filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
+    filter_params = [uid, mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
                     number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
                     chrom_prominence_scaled_mean, chrom_height_scaled_mean,
                     rt_delta_mean, id_top_score, identified,
@@ -3044,6 +3048,21 @@ def consensus_select(
     warnings = []
     # Build all filter conditions efficiently
+    # Handle uid parameter first (consensus_uid filter with flexible formats)
+    if uid is not None:
+        if isinstance(uid, int):
+            # Single specific consensus_uid
+            filter_conditions.append(pl.col("consensus_uid") == uid)
+        elif isinstance(uid, tuple) and len(uid) == 2:
+            # Range of consensus_uids (consensus_uid_min, consensus_uid_max)
+            min_uid, max_uid = uid
+            filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+        elif isinstance(uid, list):
+            # Specific list of consensus_uid values
+            filter_conditions.append(pl.col("consensus_uid").is_in(uid))
+        else:
+            self.logger.warning(f"Invalid uid parameter type: {type(uid)}. Expected int, tuple, or list.")
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
             if mz[1] < mz[0]:
@@ -4622,7 +4641,7 @@ def decompress(self, features=True, ms2=True, chrom=True, samples=None, **kwargs
             self.restore_ms2(samples=samples, **ms2_kwargs)
-        self.logger.info("Adaptive decompression completed successfully")
+        self.logger.success("Adaptive decompression completed successfully")
     except Exception as e:
         self.logger.error(f"Decompression failed: {e}")

masster/study/id.py CHANGED Viewed

@@ -1093,7 +1093,7 @@ def id_reset(study):
         del study.history["identify"]
     if logger:
-        logger.info("Identification data reset completed")
+        logger.success("Identification data reset completed")
 def lib_reset(study):
@@ -1198,7 +1198,7 @@ def lib_reset(study):
             del study.history["lib_to_consensus"]
     if logger:
-        logger.info("Library and identification data reset completed")
+        logger.success("Library and identification data reset completed")
 def _get_adducts(study, adducts_list: list | None = None, **kwargs):
@@ -1978,4 +1978,4 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
             logger.warning(f"find_ms2 failed: {e}")
     if logger:
-        logger.info(f"lib_to_consensus completed: {len(consensus_metadata)} features added")
+        logger.success(f"lib_to_consensus completed: {len(consensus_metadata)} features added")

masster 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl

Potentially problematic release.

masster 0.5.8__py3-none-any.whl → 0.5.10__py3-none-any.whl