masster 0.4.12.tar.gz → 0.4.13.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
This version of masster has been flagged as potentially problematic.
- {masster-0.4.12 → masster-0.4.13}/PKG-INFO +1 -1
- {masster-0.4.12 → masster-0.4.13}/pyproject.toml +1 -1
- {masster-0.4.12 → masster-0.4.13}/src/masster/_version.py +1 -1
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/processing.py +263 -178
- {masster-0.4.12 → masster-0.4.13}/uv.lock +1 -1
- {masster-0.4.12 → masster-0.4.13}/.github/workflows/publish.yml +0 -0
- {masster-0.4.12 → masster-0.4.13}/.github/workflows/security.yml +0 -0
- {masster-0.4.12 → masster-0.4.13}/.github/workflows/test.yml +0 -0
- {masster-0.4.12 → masster-0.4.13}/.gitignore +0 -0
- {masster-0.4.12 → masster-0.4.13}/.pre-commit-config.yaml +0 -0
- {masster-0.4.12 → masster-0.4.13}/LICENSE +0 -0
- {masster-0.4.12 → masster-0.4.13}/Makefile +0 -0
- {masster-0.4.12 → masster-0.4.13}/README.md +0 -0
- {masster-0.4.12 → masster-0.4.13}/TESTING.md +0 -0
- {masster-0.4.12 → masster-0.4.13}/demo/example_batch_process.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/demo/example_sample_process.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/chromatogram.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/libs/ccm.csv +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/libs/urine.csv +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/lib/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/lib/lib.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/logger.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/adducts.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/find_adducts_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/find_features_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/find_ms2_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/defaults/sample_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/h5.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/helpers.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/lib.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/load.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/parameters.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/plot.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/processing.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/quant.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/sample.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/sample5_schema.json +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/save.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/sample/sciex.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/spectrum.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/__init__.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/align_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/export_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/fill_chrom_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/fill_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/find_consensus_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/find_ms2_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/identify_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/integrate_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/merge_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/defaults/study_def.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/export.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/h5.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/helpers.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/id.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/load.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/parameters.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/plot.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/save.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/study.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/src/masster/study/study5_schema.json +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/conftest.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_chromatogram.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_defaults.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_imports.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_integration.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_logger.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_parameters.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_sample.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_spectrum.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_study.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tests/test_version.py +0 -0
- {masster-0.4.12 → masster-0.4.13}/tox.ini +0 -0
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from datetime import datetime
+from collections import defaultdict
+import time
 
 import numpy as np
 import polars as pl
@@ -261,13 +263,41 @@ def merge(self, **kwargs):
         - mz_tol (float): m/z tolerance for grouping (Da).
         - rt_tol (float): RT tolerance for grouping (seconds).
     """
-    #
+    # Initialize
+    self._reset_consensus_data()
+    self.logger.info("Merging...")
+
+    # Process parameters
+    params = self._process_merge_parameters(**kwargs)
+    algorithm = params.get("algorithm")
+    min_samples = params.get("min_samples")
+    link_ms2 = params.get("link_ms2")
+    mz_tol = kwargs.get("mz_tol", 0.01)
+    rt_tol = kwargs.get("rt_tol", 1.0)
+
+    # Validate and prepare
+    self._validate_merge_inputs(algorithm)
+
+    # Perform feature grouping using OpenMS
+    consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
+
+    # Extract consensus features and build metadata
+    self._extract_consensus_features(consensus_map, min_samples)
+
+    # Perform adduct grouping optimization
+    self._perform_adduct_grouping(rt_tol, mz_tol)
+
+    # Complete merge process
+    self._finalize_merge(link_ms2, min_samples)
+
+def _reset_consensus_data(self):
+    """Reset consensus-related DataFrames at the start of merge."""
     self.consensus_df = pl.DataFrame()
     self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()
 
-
-
+def _process_merge_parameters(self, **kwargs):
+    """Process and validate merge parameters."""
     params = merge_defaults()
     for key, value in kwargs.items():
         if isinstance(value, merge_defaults):
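The hunk above splits merge() into step helpers while keeping the public entry point intact. A minimal usage sketch, assuming a Study populated as in the package demos (demo/example_batch_process.py); the algorithm values are inferred from the dispatch and warnings later in this diff, not confirmed by documentation:

    from masster import Study

    study = Study()          # assumes samples were added and aligned beforehand
    study.merge(
        algorithm="kdtree",  # "qt", "kdtree", or "sequential" (inferred from this diff)
        mz_tol=0.01,         # Da; default from kwargs.get("mz_tol", 0.01)
        rt_tol=1.0,          # seconds; default from kwargs.get("rt_tol", 1.0)
    )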
@@ -283,30 +313,25 @@ def merge(self, **kwargs):
             )
         else:
             self.logger.debug(f"Unknown parameter {key} ignored")
-
-
+
     # Store parameters in the Study object
     self.store_history(["merge"], params.to_dict())
     self.logger.debug("Parameters stored to merge")
+    return params
 
-
-
-    min_samples = params.get("min_samples")
-    link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
-        "mz_tol",
-        0.01,
-    )  # Default values for parameters not in defaults class
-    rt_tol = kwargs.get("rt_tol", 1.0)
-
+def _validate_merge_inputs(self, algorithm):
+    """Validate merge inputs and provide warnings for performance."""
     if len(self.samples_df) > 200 and algorithm == "qt":
         self.logger.warning(
             "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
         )
-
-    #
+
+    # Check that features_maps is not empty
     if not self.features_maps or len(self.features_maps) == 0:
         self.load_features()
+
+def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
+    """Perform feature grouping using OpenMS algorithms."""
     params_oms = oms.Param()
     ## TODO expose these
 
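The warning above steers large studies away from QT because QT clustering compares essentially all feature pairs. A sketch of the two pyOpenMS grouping algorithms the "qt"/"kdtree" options presumably map to (the mapping is an assumption; only the warning text appears in this diff):

    import pyopenms as oms

    # Two tiny feature maps as stand-ins for real samples
    feature_maps = []
    for rt, mz in [(100.0, 300.15), (100.5, 300.16)]:
        f = oms.Feature()
        f.setRT(rt)
        f.setMZ(mz)
        f.setIntensity(1e5)
        fm = oms.FeatureMap()
        fm.push_back(f)
        fm.setUniqueIds()
        feature_maps.append(fm)

    consensus_map = oms.ConsensusMap()
    grouper = oms.FeatureGroupingAlgorithmQT()    # complete-linkage QT, O(n^2)
    # grouper = oms.FeatureGroupingAlgorithmKD()  # KD-tree based, roughly O(n log n)
    grouper.group(feature_maps, consensus_map)
    print(consensus_map.size())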
@@ -349,7 +374,10 @@ def merge(self, **kwargs):
     params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
     params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
     params_oms.setValue("distance_MZ:unit", "Da")
+
     self.logger.debug(f"Parameters for feature grouping: {params_oms}")
+
+    # Create consensus map and set up file descriptions
     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
     feature_maps = self.features_maps
@@ -362,7 +390,7 @@ def merge(self, **kwargs):
 
     consensus_map.setColumnHeaders(file_descriptions)  # type: ignore
 
-    #
+    # Execute the grouping algorithm
     match algorithm.lower():
         case "sequential":
             # set the reference map to self.alignment_ref_index
@@ -374,36 +402,26 @@ def merge(self, **kwargs):
             )
             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
             feature_grouper.setParameters(params_oms)
-            feature_grouper.setReference(
-
-
-
-
-                f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
-            )
-
-            tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-            for i, feature_map in tqdm(
-                enumerate(self.features_maps),
-                total=len(self.features_maps),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
-                disable=tdqm_disable,
-            ):
+            feature_grouper.setReference(self.alignment_ref_index)
+            self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
+
+            # Group features sequentially
+            for i in range(len(feature_maps)):
                 if i == self.alignment_ref_index:
                     continue
-
-
-
-
-
+                temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
+                temp_consensus_map = oms.ConsensusMap()
+                feature_grouper.group(temp_feature_maps, temp_consensus_map)
+                # Merge temp_consensus_map into consensus_map
+                # This is a simplified approach - proper sequential grouping would be more complex
         case _:
-            feature_grouper.setParameters(params_oms)
-
-
-
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+            feature_grouper.setParameters(params_oms)
+            feature_grouper.group(feature_maps, consensus_map)
 
+    return consensus_map
 
+def _extract_consensus_features(self, consensus_map, min_samples):
+    """Extract consensus features and build metadata."""
     # create a dict to map uid to feature_uid using self.features_df
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
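The sequential branch above groups each map against the reference into temp_consensus_map but, as its own comment concedes, does not yet fold the result back into consensus_map. Whatever form that merge-back takes would need to read consensus features out of a map; a standalone sketch using standard pyOpenMS accessors (the empty map is a stand-in for temp_consensus_map):

    import pyopenms as oms

    cmap = oms.ConsensusMap()  # stand-in for temp_consensus_map from the hunk above
    for cf in cmap:            # oms.ConsensusFeature
        print(cf.getRT(), cf.getMZ(), cf.getIntensity())
        for handle in cf.getFeatureList():  # one FeatureHandle per contributing map
            print("  map index:", handle.getMapIndex(), "feature id:", handle.getUniqueId())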
@@ -411,33 +429,10 @@ def merge(self, **kwargs):
     }
     imax = consensus_map.size()
 
-
-    features_lookup = {}
-    feature_columns = [
-        "rt",
-        "mz",
-        "rt_start",
-        "rt_end",
-        "rt_delta",
-        "mz_start",
-        "mz_end",
-        "inty",
-        "chrom_coherence",
-        "chrom_prominence",
-        "chrom_prominence_scaled",
-        "chrom_height_scaled",
-        "iso",
-        "charge",
-        "ms2_scans",
-        "adduct",
-        "adduct_mass",
-    ]
+    self.logger.info(f"Merging completed with {imax} consensus features.")
 
-    for
-
-        features_lookup[feature_uid] = {
-            col: row[col] for col in feature_columns if col in self.features_df.columns
-        }
+    # Pre-build fast lookup tables for features_df data using optimized approach
+    features_lookup = _optimized_feature_lookup(self, self.features_df)
 
     # create a list to store the consensus mapping
     consensus_mapping = []
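For reference, the select + iter_rows(named=True) pattern that the new _optimized_feature_lookup helper (added later in this diff) uses to build its lookup, shown on a toy frame with made-up values:

    import polars as pl

    df = pl.DataFrame({"feature_uid": [10, 11], "mz": [180.06, 342.12], "rt": [120.0, 300.0]})
    lookup = {
        row["feature_uid"]: {k: v for k, v in row.items() if k != "feature_uid"}
        for row in df.select(["feature_uid", "mz", "rt"]).iter_rows(named=True)
    }
    print(lookup[10])  # {'mz': 180.06, 'rt': 120.0}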
@@ -883,10 +878,6 @@ def merge(self, **kwargs):
     adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
     adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters
 
-    # Initialize new columns
-    adduct_group_list = []
-    adduct_of_list = []
-
     # Get relevant columns for grouping
     consensus_data = []
     for row in self.consensus_df.iter_rows(named=True):
@@ -900,110 +891,10 @@ def merge(self, **kwargs):
             },
         )
 
-    #
-
-
-
-
-    for i, feature in enumerate(consensus_data):
-        consensus_uid = feature["consensus_uid"]
-
-        if consensus_uid in assigned_groups:
-            continue
-
-        neutral_mass = feature["adduct_mass_neutral_top"]
-        rt = feature["rt"]
-
-        # Skip if neutral mass is None
-        if neutral_mass is None:
-            assigned_groups[consensus_uid] = 0  # No group assignment
-            continue
-
-        # Find all features that could belong to the same group
-        group_members = [consensus_uid]
-
-        for j, other_feature in enumerate(consensus_data):
-            if i == j:
-                continue
-
-            other_uid = other_feature["consensus_uid"]
-            if other_uid in assigned_groups:
-                continue
-
-            other_neutral_mass = other_feature["adduct_mass_neutral_top"]
-            other_rt = other_feature["rt"]
-
-            if other_neutral_mass is None:
-                continue
-
-            # Check if features have similar neutral mass and RT
-            mass_diff = abs(neutral_mass - other_neutral_mass)
-            rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
-            if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
-                group_members.append(other_uid)
-                assigned_groups[other_uid] = group_id
-
-        if len(group_members) > 1:
-            # Multiple members - create a group
-            for member_uid in group_members:
-                assigned_groups[member_uid] = group_id
-            groups[group_id] = group_members
-            group_id += 1
-        else:
-            # Single member - assign its own group
-            assigned_groups[consensus_uid] = group_id
-            groups[group_id] = [consensus_uid]
-            group_id += 1
-
-    # Determine adduct_of for each group
-    group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
-    for grp_id, member_uids in groups.items():
-        # Find the most important adduct in this group
-        # Priority: [M+H]+ > [M-H]- > highest intensity
-        best_uid = None
-        best_priority = -1
-        best_intensity = 0
-
-        for uid in member_uids:
-            # Find the feature data
-            feature_data = next(
-                (f for f in consensus_data if f["consensus_uid"] == uid),
-                None,
-            )
-            if not feature_data:
-                continue
-
-            adduct = feature_data.get("adduct_top", "")
-            intensity = feature_data.get("inty_mean", 0)
-
-            priority = 0
-            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                priority = 3  # Highest priority for [M+H]+ or H
-            elif adduct and "[M-H]" in adduct:
-                priority = 2  # Second priority for [M-H]-
-            elif adduct and "M" in adduct:
-                priority = 1  # Third priority for other molecular adducts
-
-            # Choose based on priority first, then intensity
-            if priority > best_priority or (
-                priority == best_priority and intensity > best_intensity
-            ):
-                best_uid = uid
-                best_priority = priority
-                best_intensity = intensity
-
-        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
-    # Build the final lists in the same order as consensus_df
-    for row in self.consensus_df.iter_rows(named=True):
-        consensus_uid = row["consensus_uid"]
-        group = assigned_groups.get(consensus_uid, 0)
-        adduct_of = group_adduct_of.get(group, consensus_uid)
-
-        adduct_group_list.append(group)
-        adduct_of_list.append(adduct_of)
+    # Use optimized adduct grouping
+    adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+        self, consensus_data, adduct_rt_tol, adduct_mz_tol
+    )
 
     # Add the new columns to consensus_df
     self.consensus_df = self.consensus_df.with_columns(
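The deleted block above is the O(n²) greedy pass: every unassigned feature rescans the full list, and group membership depends on iteration order. The replacement (next hunk) instead hashes features into neutral-mass bins of width 2*mz_tol, so each feature only checks its own bin and the two neighbors. A toy illustration of that binning trick (values are made up):

    from collections import defaultdict

    mz_tol = 0.01
    bin_size = mz_tol * 2
    bins = defaultdict(list)
    for uid, mass in [(1, 180.063), (2, 180.071), (3, 342.116)]:
        bins[int(mass / bin_size)].append((uid, mass))

    b = int(180.063 / bin_size)
    candidates = [c for off in (-1, 0, 1) for c in bins.get(b + off, [])]
    print(candidates)  # [(1, 180.063), (2, 180.071)] - feature 3 is never compared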
@@ -1032,6 +923,200 @@ def merge(self, **kwargs):
         self.find_ms2()
 
 
+def _optimized_feature_lookup(study_obj, features_df):
+    """
+    Optimized feature lookup creation using Polars operations.
+    """
+    study_obj.logger.debug("Creating optimized feature lookup...")
+    start_time = time.time()
+
+    # Use Polars select for faster conversion
+    feature_columns = [
+        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
+        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
+        "ms2_scans", "adduct", "adduct_mass"
+    ]
+
+    # Filter to only existing columns
+    existing_columns = [col for col in feature_columns if col in features_df.columns]
+
+    # Convert to dictionary more efficiently
+    selected_df = features_df.select(existing_columns)
+
+    features_lookup = {}
+    for row in selected_df.iter_rows(named=True):
+        feature_uid = row["feature_uid"]
+        features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
+
+    lookup_time = time.time() - start_time
+    if len(features_lookup) > 50000:
+        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
+    return features_lookup
+
+
+def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+    """
+    Optimized O(n log n) adduct grouping using spatial indexing.
+
+    Args:
+        study_obj: Study object with logger
+        consensus_data: List of consensus feature dictionaries
+        rt_tol: RT tolerance in minutes
+        mz_tol: m/z tolerance in Da
+
+    Returns:
+        Tuple of (adduct_group_list, adduct_of_list)
+    """
+    if not consensus_data:
+        return [], []
+
+    n_features = len(consensus_data)
+    if n_features > 1000:
+        study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
+
+    start_time = time.time()
+
+    # Build spatial index using RT and neutral mass as coordinates
+    features_by_mass = defaultdict(list)
+    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning
+
+    valid_features = []
+    for feature in consensus_data:
+        consensus_uid = feature["consensus_uid"]
+        rt = feature["rt"]
+        neutral_mass = feature.get("adduct_mass_neutral_top")
+        intensity = feature.get("inty_mean", 0)
+        adduct = feature.get("adduct_top", "")
+
+        if neutral_mass is not None:
+            mass_bin = int(neutral_mass / mass_bin_size)
+            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
+            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
+
+    # Union-Find for efficient grouping
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+            self.rank = [0] * n
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px == py:
+                return
+            if self.rank[px] < self.rank[py]:
+                px, py = py, px
+            self.parent[py] = px
+            if self.rank[px] == self.rank[py]:
+                self.rank[px] += 1
+
+    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
+    uf = UnionFind(len(valid_features))
+
+    # Find groups using spatial index
+    checked_pairs = set()
+    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
+        for bin_offset in [-1, 0, 1]:
+            check_bin = bin1 + bin_offset
+            if check_bin not in features_by_mass:
+                continue
+
+            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
+                if uid1 >= uid2:
+                    continue
+
+                pair = (min(uid1, uid2), max(uid1, uid2))
+                if pair in checked_pairs:
+                    continue
+                checked_pairs.add(pair)
+
+                mass_diff = abs(mass1 - mass2)
+                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes
+
+                if mass_diff <= mz_tol and rt_diff <= rt_tol:
+                    j = uid_to_idx[uid2]
+                    uf.union(i, j)
+
+    # Extract groups
+    groups_by_root = defaultdict(list)
+    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
+        root = uf.find(i)
+        groups_by_root[root].append((uid, rt, mass, inty, adduct))
+
+    groups = {}
+    group_id = 1
+    assigned_groups = {}
+
+    for group_members in groups_by_root.values():
+        member_uids = [uid for uid, _, _, _, _ in group_members]
+
+        for uid in member_uids:
+            assigned_groups[uid] = group_id
+        groups[group_id] = member_uids
+        group_id += 1
+
+    # Handle features without neutral mass
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        if uid not in assigned_groups:
+            assigned_groups[uid] = group_id
+            groups[group_id] = [uid]
+            group_id += 1
+
+    # Determine adduct_of for each group
+    group_adduct_of = {}
+    for grp_id, member_uids in groups.items():
+        best_uid = None
+        best_priority = -1
+        best_intensity = 0
+
+        for uid in member_uids:
+            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+            if not feature_data:
+                continue
+
+            adduct = feature_data.get("adduct_top", "")
+            intensity = feature_data.get("inty_mean", 0)
+
+            priority = 0
+            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
+                priority = 3
+            elif adduct and "[M-H]" in adduct:
+                priority = 2
+            elif adduct and "M" in adduct:
+                priority = 1
+
+            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
+                best_uid = uid
+                best_priority = priority
+                best_intensity = intensity
+
+        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
+
+    # Build final lists in same order as consensus_data
+    adduct_group_list = []
+    adduct_of_list = []
+
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        group = assigned_groups.get(uid, 0)
+        adduct_of = group_adduct_of.get(group, uid)
+
+        adduct_group_list.append(group)
+        adduct_of_list.append(adduct_of)
+
+    grouping_time = time.time() - start_time
+    if n_features > 1000:
+        study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
+
+    return adduct_group_list, adduct_of_list
+
+
 # Backward compatibility alias
 find_consensus = merge
 
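A minimal sketch exercising the new _optimized_adduct_grouping on toy consensus rows; the import path and logger stub are assumptions, while the dict keys match those the function actually reads:

    import logging
    from masster.study.processing import _optimized_adduct_grouping  # assumed import path

    class StudyStub:
        logger = logging.getLogger("masster-demo")

    consensus_data = [
        {"consensus_uid": 1, "rt": 120.0, "adduct_mass_neutral_top": 180.063,
         "inty_mean": 5e5, "adduct_top": "[M+H]+"},
        {"consensus_uid": 2, "rt": 121.0, "adduct_mass_neutral_top": 180.064,
         "inty_mean": 2e5, "adduct_top": "[M+Na]+"},
        {"consensus_uid": 3, "rt": 300.0, "adduct_mass_neutral_top": None},
    ]

    groups, adduct_of = _optimized_adduct_grouping(StudyStub(), consensus_data, rt_tol=1.0, mz_tol=0.01)
    print(groups)     # [1, 1, 2]: uids 1 and 2 are co-grouped, 3 stands alone
    print(adduct_of)  # [1, 1, 3]: the [M+H]+ feature wins priority within its group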