masster 0.4.20__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +6 -0
- masster/_version.py +1 -1
- masster/sample/h5.py +58 -1
- masster/sample/load.py +7 -1
- masster/sample/plot.py +56 -65
- masster/sample/processing.py +158 -0
- masster/sample/sample.py +2 -0
- masster/sample/sample5_schema.json +3 -0
- masster/sample/save.py +135 -59
- masster/spectrum.py +58 -9
- masster/study/export.py +240 -154
- masster/study/h5.py +65 -1
- masster/study/helpers.py +3 -3
- masster/study/load.py +39 -3
- masster/study/merge.py +25 -10
- masster/study/plot.py +162 -192
- masster/study/processing.py +362 -12
- masster/study/save.py +48 -5
- masster/study/study.py +16 -3
- masster/study/study5_schema.json +3 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +435 -1871
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/METADATA +1 -1
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/RECORD +27 -29
- masster/wizard/test_structure.py +0 -49
- masster/wizard/test_wizard.py +0 -285
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/WHEEL +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/entry_points.txt +0 -0
- {masster-0.4.20.dist-info → masster-0.4.22.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
@@ -15,6 +15,85 @@ from masster.study.defaults import (
 )


+def _generate_feature_maps_on_demand_for_align(study):
+    """
+    Generate feature maps on-demand from study.features_df for alignment operations.
+    Returns temporary feature maps that are not cached in the study.
+
+    Args:
+        study: Study object containing features_df and samples_df
+
+    Returns:
+        list: List of temporary FeatureMap objects
+    """
+    import polars as pl
+    import pyopenms as oms
+
+    if study.features_df is None or len(study.features_df) == 0:
+        study.logger.error("No features_df available for generating feature maps")
+        return []
+
+    temp_feature_maps = []
+
+    # Process each sample in order
+    for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
+        sample_uid = row_dict["sample_uid"]
+        sample_name = row_dict["sample_name"]
+
+        # Get features for this sample from features_df
+        sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+        # Create new FeatureMap
+        feature_map = oms.FeatureMap()
+
+        # Convert DataFrame features to OpenMS Features
+        for feature_row in sample_features.iter_rows(named=True):
+            feature = oms.Feature()
+
+            # Set properties from DataFrame (handle missing values gracefully)
+            try:
+                # Skip features with missing critical data
+                if feature_row["mz"] is None:
+                    study.logger.warning("Skipping feature due to missing mz")
+                    continue
+                if feature_row["rt"] is None:
+                    study.logger.warning("Skipping feature due to missing rt")
+                    continue
+                if feature_row["inty"] is None:
+                    study.logger.warning("Skipping feature due to missing inty")
+                    continue
+
+                # Handle missing feature_id by generating a new one
+                if feature_row["feature_id"] is None:
+                    # Use a simple incremental ID for alignment purposes
+                    feature_id = len(temp_feature_maps) * 100000 + feature_map.size() + 1
+                    study.logger.debug(f"Generated new feature_id {feature_id} for feature with missing ID in sample {sample_name}")
+                else:
+                    feature_id = int(feature_row["feature_id"])
+
+                feature.setUniqueId(feature_id)
+                feature.setMZ(float(feature_row["mz"]))
+                feature.setRT(float(feature_row["rt"]))
+                feature.setIntensity(float(feature_row["inty"]))
+
+                # Handle optional fields that might be None
+                if feature_row.get("quality") is not None:
+                    feature.setOverallQuality(float(feature_row["quality"]))
+                if feature_row.get("charge") is not None:
+                    feature.setCharge(int(feature_row["charge"]))
+
+                # Add to feature map
+                feature_map.push_back(feature)
+            except (ValueError, TypeError) as e:
+                study.logger.warning(f"Skipping feature due to conversion error: {e}")
+                continue
+
+        temp_feature_maps.append(feature_map)
+
+    study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df for alignment")
+    return temp_feature_maps
+
+
 def align(self, **kwargs):
     """Align feature maps using pose clustering or KD algorithm and update feature RTs.
@@ -59,6 +138,17 @@ def align(self, **kwargs):
     """
     # parameters initialization
     params = align_defaults()
+
+    # Handle 'params' keyword argument specifically (like merge does)
+    if 'params' in kwargs:
+        provided_params = kwargs.pop('params')
+        if isinstance(provided_params, align_defaults):
+            params = provided_params
+            self.logger.debug("Using provided align_defaults parameters from 'params' argument")
+        else:
+            self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
+
+    # Process remaining kwargs
     for key, value in kwargs.items():
         if isinstance(value, align_defaults):
             params = value
@@ -72,20 +162,16 @@ def align(self, **kwargs):
                     f"Failed to set parameter {key} = {value} (validation failed)",
                 )
         else:
-            self.logger.
+            self.logger.warning(f"Unknown parameter '{key}' ignored")
     # end of parameter initialization

     # Store parameters in the Study object
     self.store_history(["align"], params.to_dict())
     self.logger.debug("Parameters stored to align")

-
-
-
-
-    # self.logger.debug("Starting alignment")
-
-    fmaps = self.features_maps
+    # Generate temporary feature maps on-demand from features_df instead of using cached data
+    self.logger.debug("Generating feature maps on-demand from features_df for alignment")
+    fmaps = _generate_feature_maps_on_demand_for_align(self)

     # Choose alignment algorithm
     algorithm = params.get("algorithm").lower()
@@ -97,6 +183,9 @@ def align(self, **kwargs):
         _align_kd_algorithm(self, fmaps, params)
     else:
         self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
+        # Clean up temporary feature maps to release memory
+        del fmaps
+        return

     # check if rt_original exists in features_df, if not, add it after rt
     if "rt_original" not in self.features_df.columns:
@@ -245,6 +334,10 @@ def align(self, **kwargs):
     if params.get("save_features"):
         self.save_samples()

+    # Clean up temporary feature maps to release memory
+    del fmaps
+    self.logger.debug("Temporary feature maps deleted to release memory")
+

 def find_ms2(self, **kwargs):
     """
@@ -776,10 +869,22 @@ def _align_pose_clustering(study_obj, fmaps, params):
             and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
         ):
             continue
-
-
-
-
+
+        # Skip feature maps with insufficient data points for alignment
+        if fm.size() < 2:
+            sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
+            study_obj.logger.warning(f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)")
+            continue
+
+        try:
+            trafo = oms.TransformationDescription()
+            aligner.align(fm, trafo)
+            transformer = oms.MapAlignmentTransformer()
+            transformer.transformRetentionTimes(fm, trafo, True)
+        except RuntimeError as e:
+            sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
+            study_obj.logger.warning(f"Failed to align sample '{sample_name}': {e}")
+            continue

     study_obj.alignment_ref_index = ref_index

@@ -825,6 +930,11 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
     )

+    # Check if feature maps are empty before proceeding
+    if not fmaps:
+        study_obj.logger.error("No feature maps available for alignment. Cannot proceed with alignment.")
+        raise ValueError("No feature maps available for alignment. This usually indicates that all samples failed to load properly.")
+
     # Choose reference map (largest number of features)
     ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
     ref_map = fmaps[ref_index]
@@ -1003,3 +1113,243 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
         transformer.transformRetentionTimes(fm, trafo, True)

     study_obj.alignment_ref_index = ref_index
+
+
+def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+    """
+    Find isotope patterns for consensus features by searching raw MS1 data.
+    OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
+
+    For each consensus feature:
+    1. Find the associated feature with highest intensity
+    2. Load the corresponding sample5 file to access raw MS1 data
+    3. Use original_rt (before alignment) to find the correct scan
+    4. Search for isotope patterns in raw MS1 spectra
+    5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
+    6. Store results as numpy arrays with [mz, inty] in the iso column
+
+    Parameters:
+        rt_tol (float): RT tolerance for scan matching in seconds
+        mz_tol (float): Additional m/z tolerance for isotope matching in Da
+    """
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found. Please run merge() first.")
+        return
+
+    if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
+        self.logger.error("No consensus mapping found. Please run merge() first.")
+        return
+
+    if self.features_df is None or self.features_df.is_empty():
+        self.logger.error("No features found.")
+        return
+
+    if self.samples_df is None or self.samples_df.is_empty():
+        self.logger.error("No samples found.")
+        return
+
+    # Add iso column if it doesn't exist
+    if "iso" not in self.consensus_df.columns:
+        self.consensus_df = self.consensus_df.with_columns(
+            pl.lit(None, dtype=pl.Object).alias("iso")
+        )
+
+    self.logger.info("Extracting isotopomers from raw MS1 data...")
+
+    # Isotope mass shifts to search for (up to 7x 13C isotopes)
+    isotope_shifts = [
+        0.33,
+        0.50,
+        0.66,
+        1.00335,
+        1.50502,
+        2.00670,
+        3.01005,
+        4.01340,
+        5.01675,
+        6.02010,
+        7.02345,
+    ]
+
+    consensus_iso_data = {}
+
+    # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
+    self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
+
+    # Step 1: Join consensus_mapping with features to get intensities in one operation
+    consensus_with_features = self.consensus_mapping_df.join(
+        self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
+        on=['feature_uid', 'sample_uid'],
+        how='left'
+    )
+
+    # Step 2: Find the best feature (highest intensity) for each consensus using window functions
+    best_features = consensus_with_features.with_columns(
+        pl.col('inty').fill_null(0)  # Handle null intensities
+    ).with_columns(
+        pl.col('inty').max().over('consensus_uid').alias('max_inty')
+    ).filter(
+        pl.col('inty') == pl.col('max_inty')
+    ).group_by('consensus_uid').first()  # Take first if there are ties
+
+    # Step 3: Join with samples to get sample paths in one operation
+    best_features_with_paths = best_features.join(
+        self.samples_df.select(['sample_uid', 'sample_path']),
+        on='sample_uid',
+        how='left'
+    ).filter(
+        pl.col('sample_path').is_not_null()
+    )
+
+    # Step 4: Group by sample path for batch processing (much faster than nested loops)
+    sample_to_consensus = {}
+    for row in best_features_with_paths.iter_rows(named=True):
+        sample_path = row['sample_path']
+        consensus_uid = row['consensus_uid']
+
+        # Create feature data dictionary for compatibility
+        feature_data = {
+            'mz': row['mz'],
+            'rt': row['rt'],
+            'rt_original': row.get('rt_original', row['rt']),
+            'inty': row['inty']
+        }
+
+        if sample_path not in sample_to_consensus:
+            sample_to_consensus[sample_path] = []
+
+        sample_to_consensus[sample_path].append((consensus_uid, feature_data))
+
+    # Initialize failed consensus features (those not in the mapping)
+    processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_uid = consensus_row["consensus_uid"]
+        if consensus_uid not in processed_consensus_uids:
+            consensus_iso_data[consensus_uid] = None
+
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(self.consensus_df)} consensus features")
+
+    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+    # OPTIMIZATION 2: Process by sample file (load each file only once)
+    for sample_path, consensus_list in tqdm(
+        sample_to_consensus.items(),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Read files",
+        disable=tdqm_disable,
+    ):
+        try:
+            # Load MS1 data once per sample
+            ms1_df = self._load_ms1(sample_path)
+
+            if ms1_df is None or ms1_df.is_empty():
+                # Mark all consensus features from this sample as failed
+                for consensus_uid, _ in consensus_list:
+                    consensus_iso_data[consensus_uid] = None
+                continue
+
+            # Process all consensus features for this sample
+            for consensus_uid, best_feature in consensus_list:
+                # Get the original RT (before alignment correction)
+                base_mz = best_feature["mz"]
+                original_rt = best_feature.get("rt_original", best_feature["rt"])
+
+                # Find MS1 scans near the original RT
+                rt_min = original_rt - rt_tol
+                rt_max = original_rt + rt_tol
+
+                # Filter MS1 data for scans within RT window
+                ms1_window = ms1_df.filter(
+                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
+                )
+
+                if ms1_window.is_empty():
+                    consensus_iso_data[consensus_uid] = None
+                    continue
+
+                isotope_matches = []
+
+                # Search for each isotope shift
+                for shift in isotope_shifts:
+                    target_mz = base_mz + shift
+                    mz_min_iso = target_mz - mz_tol
+                    mz_max_iso = target_mz + mz_tol
+
+                    # Find peaks in MS1 data within m/z tolerance
+                    isotope_peaks = ms1_window.filter(
+                        (pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso)
+                    )
+
+                    if not isotope_peaks.is_empty():
+                        # Get the peak with maximum intensity for this isotope
+                        max_peak = isotope_peaks.filter(
+                            pl.col("inty") == pl.col("inty").max()
+                        ).row(0, named=True)
+
+                        # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
+                        mz_formatted = round(float(max_peak["mz"]), 4)
+                        inty_formatted = float(round(max_peak["inty"]))  # Round to integer, but keep as float
+                        isotope_matches.append([mz_formatted, inty_formatted])
+
+                # Store results as numpy array
+                if isotope_matches:
+                    consensus_iso_data[consensus_uid] = np.array(isotope_matches)
+                else:
+                    consensus_iso_data[consensus_uid] = None
+
+        except Exception as e:
+            self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
+            # Mark all consensus features from this sample as failed
+            for consensus_uid, _ in consensus_list:
+                consensus_iso_data[consensus_uid] = None
+            continue

+    # Update consensus_df with isotope data
+    # Create mapping function for update
+    def get_iso_data(uid):
+        return consensus_iso_data.get(uid, None)
+
+    # Update the iso column
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.col("consensus_uid").map_elements(
+            lambda uid: get_iso_data(uid),
+            return_dtype=pl.Object
+        ).alias("iso")
+    )
+
+    # Count how many consensus features have isotope data
+    iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
+
+    self.logger.info(f"Optimized isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features.")
+
+
+def reset_iso(self):
+    """
+    Reset the iso column in consensus_df to None, clearing all isotope data.
+
+    This function clears any previously computed isotope patterns from the
+    consensus_df, setting the 'iso' column to None for all features. This
+    is useful before re-running isotope detection with different parameters
+    or to clear isotope data entirely.
+
+    Returns:
+        None
+    """
+    if self.consensus_df is None:
+        self.logger.warning("No consensus_df found. Nothing to reset.")
+        return
+
+    if "iso" not in self.consensus_df.columns:
+        self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
+        return
+
+    # Count how many features currently have isotope data
+    iso_count = self.consensus_df.select(
+        pl.col("iso").is_not_null().sum().alias("count")
+    ).item(0, "count")
+
+    # Reset the iso column to None
+    self.consensus_df = self.consensus_df.with_columns(
+        pl.lit(None, dtype=pl.Object).alias("iso")
+    )
+
+    self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")
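For orientation, a minimal usage sketch of the isotope methods added above. This is not part of the release: it assumes an existing masster Study object named `study` that has already run merge(); the tolerance values simply restate find_iso's defaults.

# Sketch only: assumes `study` is a masster Study with consensus features
# already computed (find_iso logs an error and returns otherwise).
study.find_iso(rt_tol=0.1, mz_tol=0.01)   # per-sample MS1 search, one file load each
print(study.consensus_df["iso"])          # numpy [mz, inty] arrays, or None per feature
study.reset_iso()                         # clear the 'iso' column before re-running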
masster/study/save.py
CHANGED
@@ -154,13 +154,56 @@ def save_samples(self, samples=None):


 def _save_consensusXML(self, filename: str):
-    if self.
-        self.logger.error("No consensus
+    if self.consensus_df is None or self.consensus_df.is_empty():
+        self.logger.error("No consensus features found.")
         return
-
+
+    # Build consensus map from consensus_df with proper consensus_id values
+    import pyopenms as oms
+    consensus_map = oms.ConsensusMap()
+
+    # Set up file descriptions for all samples
+    file_descriptions = consensus_map.getColumnHeaders()
+    if hasattr(self, 'samples_df') and not self.samples_df.is_empty():
+        for i, sample_row in enumerate(self.samples_df.iter_rows(named=True)):
+            file_description = file_descriptions.get(i, oms.ColumnHeader())
+            file_description.filename = sample_row.get("sample_name", f"sample_{i}")
+            file_description.size = 0  # Will be updated if needed
+            file_description.unique_id = i + 1
+            file_descriptions[i] = file_description
+        consensus_map.setColumnHeaders(file_descriptions)
+
+    # Add consensus features to the map (simplified version without individual features)
+    for consensus_row in self.consensus_df.iter_rows(named=True):
+        consensus_feature = oms.ConsensusFeature()
+
+        # Set basic properties
+        consensus_feature.setRT(float(consensus_row.get("rt", 0.0)))
+        consensus_feature.setMZ(float(consensus_row.get("mz", 0.0)))
+        consensus_feature.setIntensity(float(consensus_row.get("inty_mean", 0.0)))
+        consensus_feature.setQuality(float(consensus_row.get("quality", 1.0)))
+
+        # Set the unique consensus_id as the unique ID
+        consensus_id_str = consensus_row.get("consensus_id", "")
+        if consensus_id_str and len(consensus_id_str) == 16:
+            try:
+                # Convert 16-character hex string to integer for OpenMS
+                consensus_uid = int(consensus_id_str, 16)
+                consensus_feature.setUniqueId(consensus_uid)
+            except ValueError:
+                # Fallback to hash if not hex
+                consensus_feature.setUniqueId(hash(consensus_id_str) & 0x7FFFFFFFFFFFFFFF)
+        else:
+            # Fallback to consensus_uid
+            consensus_feature.setUniqueId(consensus_row.get("consensus_uid", 0))
+
+        consensus_map.push_back(consensus_feature)
+
+    # Save the consensus map
     fh = oms.ConsensusXMLFile()
-    fh.store(filename,
-    self.logger.debug(f"Saved consensus map to {filename}")
+    fh.store(filename, consensus_map)
+    self.logger.debug(f"Saved consensus map with {len(self.consensus_df)} features to {filename}")
+    self.logger.debug("Features use unique 16-character consensus_id strings")


 def save_consensus(self, **kwargs):
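The ID handling in _save_consensusXML above can be checked in isolation: a 16-character hex consensus_id converts losslessly to the integer that OpenMS setUniqueId() expects, while the hash fallback is one-way. A small sketch (the example ID is made up):

# Round-trip of the hex consensus_id mapping (example value is invented).
consensus_id_str = "3f9a2b7c11d4e580"
uid = int(consensus_id_str, 16)             # integer passed to setUniqueId()
assert f"{uid:016x}" == consensus_id_str    # converts back without loss
# Non-hex IDs fall back to a masked hash, which cannot be inverted:
fallback = hash("not-a-hex-id") & 0x7FFFFFFFFFFFFFFF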
masster/study/study.py
CHANGED
@@ -55,6 +55,7 @@ import polars as pl
 from masster.study.h5 import _load_study5
 from masster.study.h5 import _save_study5
 from masster.study.h5 import _save_study5_compressed
+from masster.study.h5 import _load_ms1
 from masster.study.helpers import _get_consensus_uids
 from masster.study.helpers import _get_feature_uids
 from masster.study.helpers import _get_sample_uids
@@ -126,6 +127,8 @@ from masster.study.merge import _finalize_merge
 from masster.study.merge import _count_tight_clusters
 from masster.study.processing import integrate
 from masster.study.processing import find_ms2
+from masster.study.processing import find_iso
+from masster.study.processing import reset_iso
 from masster.study.parameters import store_history
 from masster.study.parameters import get_parameters
 from masster.study.parameters import update_parameters
@@ -385,6 +388,9 @@ class Study:
     merge = merge
     find_consensus = merge  # Backward compatibility alias
     find_ms2 = find_ms2
+    find_iso = find_iso
+    reset_iso = reset_iso
+    iso_reset = reset_iso
     integrate = integrate
     integrate_chrom = integrate  # Backward compatibility alias
     fill = fill
@@ -421,9 +427,11 @@ class Study:
     set_source = set_source
     sample_color = sample_color
     sample_color_reset = sample_color_reset
+    reset_sample_color = sample_color_reset
     name_replace = sample_name_replace
     name_reset = sample_name_reset
-
+    reset_name = sample_name_reset
+
     # === Data Compression and Storage ===
     compress = compress
     compress_features = compress_features
@@ -436,8 +444,10 @@ class Study:

     # === Reset Operations ===
     fill_reset = fill_reset
+    reset_fill = fill_reset
     align_reset = align_reset
-
+    reset_align = align_reset
+
     # === Plotting and Visualization ===
     plot_alignment = plot_alignment
     plot_chrom = plot_chrom
@@ -461,8 +471,10 @@ class Study:
     identify = identify
     get_id = get_id
     id_reset = id_reset
+    reset_id = id_reset
     lib_reset = lib_reset
-
+    reset_lib = lib_reset
+
     # === Parameter Management ===
     store_history = store_history
     get_parameters = get_parameters
@@ -478,6 +490,7 @@ class Study:
     _load_study5 = _load_study5
     _save_study5 = _save_study5
     _save_study5_compressed = _save_study5_compressed
+    _load_ms1 = _load_ms1
     _get_consensus_uids = _get_consensus_uids
     _get_feature_uids = _get_feature_uids
     _get_sample_uids = _get_sample_uids
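The alias block above gives each reset operation both spellings, `x_reset` and `reset_x`, bound to the same function, so either call is equivalent. A quick sketch (assumes `Study` is imported from masster.study and `study` is an instance):

# Both spellings resolve to the same underlying function object.
assert Study.reset_iso is Study.iso_reset
assert Study.reset_align is Study.align_reset
study.reset_id()   # identical to study.id_reset()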
masster/study/study5_schema.json
CHANGED
masster/wizard/__init__.py
CHANGED
@@ -7,8 +7,11 @@ alignment, merging, plotting, and export.

 The create_script() function allows immediate generation of standalone analysis
 scripts without creating a Wizard instance first.
+
+The execute() function combines create_script() with immediate execution of the
+generated script for fully automated processing.
 """

-from .wizard import Wizard, wizard_def, create_script
+from .wizard import Wizard, wizard_def, create_script, execute

-__all__ = ["Wizard", "wizard_def", "create_script"]
+__all__ = ["Wizard", "wizard_def", "create_script", "execute"]