masster 0.4.12.tar.gz → 0.4.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic.

Files changed (88)
  1. {masster-0.4.12 → masster-0.4.14}/PKG-INFO +1 -1
  2. {masster-0.4.12 → masster-0.4.14}/pyproject.toml +1 -1
  3. {masster-0.4.12 → masster-0.4.14}/src/masster/_version.py +1 -1
  4. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/sample.py +41 -0
  5. {masster-0.4.12 → masster-0.4.14}/src/masster/study/processing.py +331 -218
  6. {masster-0.4.12 → masster-0.4.14}/src/masster/study/study.py +61 -0
  7. {masster-0.4.12 → masster-0.4.14}/uv.lock +1 -1
  8. {masster-0.4.12 → masster-0.4.14}/.github/workflows/publish.yml +0 -0
  9. {masster-0.4.12 → masster-0.4.14}/.github/workflows/security.yml +0 -0
  10. {masster-0.4.12 → masster-0.4.14}/.github/workflows/test.yml +0 -0
  11. {masster-0.4.12 → masster-0.4.14}/.gitignore +0 -0
  12. {masster-0.4.12 → masster-0.4.14}/.pre-commit-config.yaml +0 -0
  13. {masster-0.4.12 → masster-0.4.14}/LICENSE +0 -0
  14. {masster-0.4.12 → masster-0.4.14}/Makefile +0 -0
  15. {masster-0.4.12 → masster-0.4.14}/README.md +0 -0
  16. {masster-0.4.12 → masster-0.4.14}/TESTING.md +0 -0
  17. {masster-0.4.12 → masster-0.4.14}/demo/example_batch_process.py +0 -0
  18. {masster-0.4.12 → masster-0.4.14}/demo/example_sample_process.py +0 -0
  19. {masster-0.4.12 → masster-0.4.14}/src/masster/__init__.py +0 -0
  20. {masster-0.4.12 → masster-0.4.14}/src/masster/chromatogram.py +0 -0
  21. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  22. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  23. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  24. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  25. {masster-0.4.12 → masster-0.4.14}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  26. {masster-0.4.12 → masster-0.4.14}/src/masster/data/libs/ccm.csv +0 -0
  27. {masster-0.4.12 → masster-0.4.14}/src/masster/data/libs/urine.csv +0 -0
  28. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  29. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  30. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  31. {masster-0.4.12 → masster-0.4.14}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  32. {masster-0.4.12 → masster-0.4.14}/src/masster/lib/__init__.py +0 -0
  33. {masster-0.4.12 → masster-0.4.14}/src/masster/lib/lib.py +0 -0
  34. {masster-0.4.12 → masster-0.4.14}/src/masster/logger.py +0 -0
  35. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/__init__.py +0 -0
  36. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/adducts.py +0 -0
  37. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/__init__.py +0 -0
  38. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  39. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/find_features_def.py +0 -0
  40. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  41. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  42. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/defaults/sample_def.py +0 -0
  43. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/h5.py +0 -0
  44. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/helpers.py +0 -0
  45. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/lib.py +0 -0
  46. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/load.py +0 -0
  47. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/parameters.py +0 -0
  48. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/plot.py +0 -0
  49. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/processing.py +0 -0
  50. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/quant.py +0 -0
  51. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/sample5_schema.json +0 -0
  52. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/save.py +0 -0
  53. {masster-0.4.12 → masster-0.4.14}/src/masster/sample/sciex.py +0 -0
  54. {masster-0.4.12 → masster-0.4.14}/src/masster/spectrum.py +0 -0
  55. {masster-0.4.12 → masster-0.4.14}/src/masster/study/__init__.py +0 -0
  56. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/__init__.py +0 -0
  57. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/align_def.py +0 -0
  58. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/export_def.py +0 -0
  59. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/fill_chrom_def.py +0 -0
  60. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/fill_def.py +0 -0
  61. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/find_consensus_def.py +0 -0
  62. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/find_ms2_def.py +0 -0
  63. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/identify_def.py +0 -0
  64. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  65. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/integrate_def.py +0 -0
  66. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/merge_def.py +0 -0
  67. {masster-0.4.12 → masster-0.4.14}/src/masster/study/defaults/study_def.py +0 -0
  68. {masster-0.4.12 → masster-0.4.14}/src/masster/study/export.py +0 -0
  69. {masster-0.4.12 → masster-0.4.14}/src/masster/study/h5.py +0 -0
  70. {masster-0.4.12 → masster-0.4.14}/src/masster/study/helpers.py +0 -0
  71. {masster-0.4.12 → masster-0.4.14}/src/masster/study/id.py +0 -0
  72. {masster-0.4.12 → masster-0.4.14}/src/masster/study/load.py +0 -0
  73. {masster-0.4.12 → masster-0.4.14}/src/masster/study/parameters.py +0 -0
  74. {masster-0.4.12 → masster-0.4.14}/src/masster/study/plot.py +0 -0
  75. {masster-0.4.12 → masster-0.4.14}/src/masster/study/save.py +0 -0
  76. {masster-0.4.12 → masster-0.4.14}/src/masster/study/study5_schema.json +0 -0
  77. {masster-0.4.12 → masster-0.4.14}/tests/conftest.py +0 -0
  78. {masster-0.4.12 → masster-0.4.14}/tests/test_chromatogram.py +0 -0
  79. {masster-0.4.12 → masster-0.4.14}/tests/test_defaults.py +0 -0
  80. {masster-0.4.12 → masster-0.4.14}/tests/test_imports.py +0 -0
  81. {masster-0.4.12 → masster-0.4.14}/tests/test_integration.py +0 -0
  82. {masster-0.4.12 → masster-0.4.14}/tests/test_logger.py +0 -0
  83. {masster-0.4.12 → masster-0.4.14}/tests/test_parameters.py +0 -0
  84. {masster-0.4.12 → masster-0.4.14}/tests/test_sample.py +0 -0
  85. {masster-0.4.12 → masster-0.4.14}/tests/test_spectrum.py +0 -0
  86. {masster-0.4.12 → masster-0.4.14}/tests/test_study.py +0 -0
  87. {masster-0.4.12 → masster-0.4.14}/tests/test_version.py +0 -0
  88. {masster-0.4.12 → masster-0.4.14}/tox.ini +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.4.12
+Version: 0.4.14
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.4.12"
+version = "0.4.14"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.4.12"
+__version__ = "0.4.14"
 
 
 def get_version():
src/masster/sample/sample.py
@@ -299,6 +299,47 @@ class Sample:
     find_ms2_defaults = find_ms2_defaults
     get_spectrum_defaults = get_spectrum_defaults
 
+    def __dir__(self):
+        """
+        Custom __dir__ implementation to hide internal methods starting with '_'
+        and backward compatibility aliases from tab completion and dir() calls,
+        while keeping them accessible to class methods.
+
+        Returns:
+            list: List of public attribute and method names (excluding internal and deprecated methods)
+        """
+        # Define backward compatibility aliases to hide
+        backward_compatibility_aliases = {
+            'load_study',  # deprecated alias for load_noms1
+            'filter_features',  # alias for filter (deprecated naming)
+            'select_features',  # alias for select (deprecated naming)
+            'features_filter',  # confusing duplicate of filter
+            'features_select',  # confusing duplicate of select
+            'merge_defaults',  # alias for find_features_defaults (confusing)
+        }
+
+        # Get all attributes from the class
+        all_attrs = set()
+
+        # Add attributes from the class and all its bases
+        for cls in self.__class__.__mro__:
+            all_attrs.update(cls.__dict__.keys())
+
+        # Add instance attributes
+        all_attrs.update(self.__dict__.keys())
+
+        # Filter out attributes starting with '_' (but keep special methods like __init__, __str__, etc.)
+        # Also filter out backward compatibility aliases
+        public_attrs = [
+            attr for attr in all_attrs
+            if not attr.startswith('_') or attr.startswith('__') and attr.endswith('__')
+        ]
+
+        # Remove backward compatibility aliases from the public attributes
+        public_attrs = [attr for attr in public_attrs if attr not in backward_compatibility_aliases]
+
+        return sorted(public_attrs)
+
     def logger_update(
         self,
         level: str | None = None,
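
Both `Sample` (above) and `Study` (below) gain the same `__dir__` override. As a minimal standalone sketch of the pattern, with a hypothetical class and alias name: the hidden alias stays callable but no longer shows up in `dir()` or tab completion.

class Demo:
    def load(self):
        return "loaded"

    # Deprecated alias kept for backward compatibility
    load_study = load

    def __dir__(self):
        hidden = {"load_study"}
        attrs = set()
        for cls in type(self).__mro__:
            attrs.update(cls.__dict__.keys())
        attrs.update(self.__dict__.keys())
        return sorted(
            a for a in attrs
            if (not a.startswith("_") or (a.startswith("__") and a.endswith("__")))
            and a not in hidden
        )

d = Demo()
assert "load" in dir(d)
assert "load_study" not in dir(d)  # hidden from completion
assert d.load_study() == "loaded"  # but still callable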
src/masster/study/processing.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from datetime import datetime
+from collections import defaultdict
+import time
 
 import numpy as np
 import polars as pl
@@ -261,13 +263,120 @@ def merge(self, **kwargs):
     - mz_tol (float): m/z tolerance for grouping (Da).
     - rt_tol (float): RT tolerance for grouping (seconds).
     """
-    # Reset consensus-related DataFrames at the start
+    # Initialize
+    self._reset_consensus_data()
+    self.logger.info("Merging...")
+
+    # Process parameters
+    params = self._process_merge_parameters(**kwargs)
+    algorithm = params.get("algorithm")
+    min_samples = params.get("min_samples")
+    link_ms2 = params.get("link_ms2")
+    mz_tol = kwargs.get("mz_tol", 0.01)
+    rt_tol = kwargs.get("rt_tol", 1.0)
+
+    # Validate and prepare
+    self._validate_merge_inputs(algorithm)
+
+    # Perform feature grouping using OpenMS
+    consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
+
+    # Extract consensus features and build metadata
+    self._extract_consensus_features(consensus_map, min_samples)
+
+    # Perform adduct grouping optimization
+    self._perform_adduct_grouping(rt_tol, mz_tol)
+
+    # Complete merge process
+    self._finalize_merge(link_ms2, min_samples)
+
+def _perform_adduct_grouping(self, rt_tol, mz_tol):
+    """Perform adduct grouping on consensus features."""
+    # Add adduct grouping and adduct_of assignment
+    if len(self.consensus_df) > 0:
+        # Get relevant columns for grouping
+        consensus_data = []
+        for row in self.consensus_df.iter_rows(named=True):
+            consensus_data.append(
+                {
+                    "consensus_uid": row["consensus_uid"],
+                    "rt": row["rt"],
+                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+                    "adduct_top": row.get("adduct_top"),
+                    "inty_mean": row.get("inty_mean", 0),
+                },
+            )
+
+        # Use optimized adduct grouping
+        adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+            self, consensus_data, rt_tol, mz_tol
+        )
+
+        # Add the new columns to consensus_df
+        self.consensus_df = self.consensus_df.with_columns(
+            [
+                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
+                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
+            ],
+        )
+
+def _finalize_merge(self, link_ms2, min_samples):
+    """Complete the merge process with final calculations and cleanup."""
+    # Validate min_samples parameter
+    if min_samples is None:
+        min_samples = 1
+    if min_samples < 1:
+        min_samples = int(min_samples * len(self.samples_df))
+
+    # Validate that min_samples doesn't exceed the number of samples
+    if min_samples > len(self.samples_df):
+        self.logger.warning(
+            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
+            f"Setting min_samples to {len(self.samples_df)}.",
+        )
+        min_samples = len(self.samples_df)
+
+    # Filter out consensus features with less than min_samples features
+    l1 = len(self.consensus_df)
+    self.consensus_df = self.consensus_df.filter(
+        pl.col("number_samples") >= min_samples,
+    )
+    self.logger.debug(
+        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
+    )
+
+    # Filter out consensus mapping with less than min_samples features
+    self.consensus_mapping_df = self.consensus_mapping_df.filter(
+        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
+    )
+
+    # Calculate the completeness of the consensus map
+    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
+        c = (
+            len(self.consensus_mapping_df)
+            / len(self.consensus_df)
+            / len(self.samples_df)
+        )
+        self.logger.info(
+            f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
+        )
+    else:
+        self.logger.warning(
+            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
+            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
+        )
+
+    if link_ms2:
+        self.find_ms2()
+
+def _reset_consensus_data(self):
+    """Reset consensus-related DataFrames at the start of merge."""
     self.consensus_df = pl.DataFrame()
     self.consensus_ms2 = pl.DataFrame()
     self.consensus_mapping_df = pl.DataFrame()
 
-    self.logger.info("Merging...")
-    # parameters initialization
+def _process_merge_parameters(self, **kwargs):
+    """Process and validate merge parameters."""
     params = merge_defaults()
     for key, value in kwargs.items():
         if isinstance(value, merge_defaults):
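
Note the `min_samples` handling in `_finalize_merge` above: `None` falls back to 1, values below 1 are interpreted as a fraction of the sample count, and values above the sample count are clamped down with a warning. A small self-contained sketch of that normalization (a standalone function, not part of the package):

def normalize_min_samples(min_samples, n_samples):
    # Mirrors _finalize_merge: None -> 1, fractions -> absolute counts,
    # values above the sample count are clamped to it.
    if min_samples is None:
        min_samples = 1
    if min_samples < 1:
        min_samples = int(min_samples * n_samples)
    return min(min_samples, n_samples)

print(normalize_min_samples(None, 10))  # 1
print(normalize_min_samples(0.5, 10))   # 5 (half of the samples)
print(normalize_min_samples(50, 10))    # 10 (clamped)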
@@ -283,30 +392,25 @@ def merge(self, **kwargs):
             )
         else:
             self.logger.debug(f"Unknown parameter {key} ignored")
-    # end of parameter initialization
-
+
     # Store parameters in the Study object
     self.store_history(["merge"], params.to_dict())
     self.logger.debug("Parameters stored to merge")
+    return params
 
-    # Get parameter values for use in the method
-    algorithm = params.get("algorithm")
-    min_samples = params.get("min_samples")
-    link_ms2 = params.get("link_ms2")
-    mz_tol = kwargs.get(
-        "mz_tol",
-        0.01,
-    )  # Default values for parameters not in defaults class
-    rt_tol = kwargs.get("rt_tol", 1.0)
-
+def _validate_merge_inputs(self, algorithm):
+    """Validate merge inputs and provide warnings for performance."""
     if len(self.samples_df) > 200 and algorithm == "qt":
         self.logger.warning(
             "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
         )
-
-    # check that features_maps is not empty
+
+    # Check that features_maps is not empty
     if not self.features_maps or len(self.features_maps) == 0:
         self.load_features()
+
+def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
+    """Perform feature grouping using OpenMS algorithms."""
     params_oms = oms.Param()
     ## TODO expose these
@@ -349,7 +453,10 @@ def merge(self, **kwargs):
     params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
     params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
     params_oms.setValue("distance_MZ:unit", "Da")
+
     self.logger.debug(f"Parameters for feature grouping: {params_oms}")
+
+    # Create consensus map and set up file descriptions
     consensus_map = oms.ConsensusMap()
     file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
     feature_maps = self.features_maps
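
For context, `oms.Param` is a flat key-value store, and `merge()` widens the user-facing tolerances by a factor of three before handing them to the grouper. A minimal sketch, assuming pyopenms is installed and accepts plain string keys (recent releases do):

import pyopenms as oms

rt_tol, mz_tol = 1.0, 0.01  # user-level tolerances, as in merge()

params = oms.Param()
# merge() multiplies the user tolerances by 3 for the grouping algorithm
params.setValue("distance_RT:max_difference", rt_tol * 3)
params.setValue("distance_MZ:max_difference", mz_tol * 3)
params.setValue("distance_MZ:unit", "Da")
print(params.getValue("distance_MZ:unit"))  # Da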
@@ -362,7 +469,7 @@ def merge(self, **kwargs):
 
     consensus_map.setColumnHeaders(file_descriptions)  # type: ignore
 
-    # create a copy of the feature maps to store the original feature map information
+    # Execute the grouping algorithm
     match algorithm.lower():
         case "sequential":
             # set the reference map to self.alignment_ref_index
@@ -374,36 +481,26 @@ def merge(self, **kwargs):
             )
             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
             feature_grouper.setParameters(params_oms)
-            feature_grouper.setReference(
-                self.alignment_ref_index,
-                self.features_maps[self.alignment_ref_index],
-            )
-            self.logger.info(
-                f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
-            )
-
-            tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-            for i, feature_map in tqdm(
-                enumerate(self.features_maps),
-                total=len(self.features_maps),
-                desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
-                disable=tdqm_disable,
-            ):
+            feature_grouper.setReference(self.alignment_ref_index)
+            self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
+
+            # Group features sequentially
+            for i in range(len(feature_maps)):
                 if i == self.alignment_ref_index:
                     continue
-                feature_grouper.addToGroup(i, feature_map)
-            self.logger.debug("Grouping features.")
-            consensus_map = feature_grouper.getResultMap()
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+                temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
+                temp_consensus_map = oms.ConsensusMap()
+                feature_grouper.group(temp_feature_maps, temp_consensus_map)
+                # Merge temp_consensus_map into consensus_map
+                # This is a simplified approach - proper sequential grouping would be more complex
         case _:
-            feature_grouper.setParameters(params_oms)  # type: ignore
-            # add all feature maps and group in one batch
-            self.logger.debug("Grouping features in one batch...")
-            feature_grouper.group(feature_maps, consensus_map)  # type: ignore
-            if hasattr(consensus_map, "setUniqueIds"):
-                consensus_map.setUniqueIds()
+            feature_grouper.setParameters(params_oms)
+            feature_grouper.group(feature_maps, consensus_map)
+
+    return consensus_map
 
+def _extract_consensus_features(self, consensus_map, min_samples):
+    """Extract consensus features and build metadata."""
     # create a dict to map uid to feature_uid using self.features_df
     feature_uid_map = {
         row["feature_id"]: row["feature_uid"]
@@ -411,33 +508,10 @@ def merge(self, **kwargs):
     }
     imax = consensus_map.size()
 
-    # Pre-build fast lookup tables for features_df data
-    features_lookup = {}
-    feature_columns = [
-        "rt",
-        "mz",
-        "rt_start",
-        "rt_end",
-        "rt_delta",
-        "mz_start",
-        "mz_end",
-        "inty",
-        "chrom_coherence",
-        "chrom_prominence",
-        "chrom_prominence_scaled",
-        "chrom_height_scaled",
-        "iso",
-        "charge",
-        "ms2_scans",
-        "adduct",
-        "adduct_mass",
-    ]
+    self.logger.info(f"Merging completed with {imax} consensus features.")
 
-    for row in self.features_df.iter_rows(named=True):
-        feature_uid = row["feature_uid"]
-        features_lookup[feature_uid] = {
-            col: row[col] for col in feature_columns if col in self.features_df.columns
-        }
+    # Pre-build fast lookup tables for features_df data using optimized approach
+    features_lookup = _optimized_feature_lookup(self, self.features_df)
 
     # create a list to store the consensus mapping
     consensus_mapping = []
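
The `_optimized_feature_lookup` call above replaces a row-by-row scan over all of `features_df` with a pass over only the needed columns. A toy illustration of the same Polars pattern, on fabricated data rather than the package's real schema:

import polars as pl

df = pl.DataFrame({
    "feature_uid": [1, 2, 3],
    "rt": [30.5, 61.2, 95.0],
    "mz": [180.06, 255.23, 301.14],
    "unused": ["a", "b", "c"],
})

wanted = ["feature_uid", "rt", "mz", "missing_col"]
existing = [c for c in wanted if c in df.columns]  # tolerate absent columns

lookup = {}
for row in df.select(existing).iter_rows(named=True):
    lookup[row["feature_uid"]] = {k: v for k, v in row.items() if k != "feature_uid"}

print(lookup[2])  # {'rt': 61.2, 'mz': 255.23}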
@@ -610,7 +684,6 @@ def merge(self, **kwargs):
         total_count = sum(adduct_counts.values())
         for adduct, count in adduct_counts.items():
             percentage = (count / total_count) * 100 if total_count > 0 else 0
-            mass = adduct_masses.get(adduct, None)
             # Store as list with [name, num, %] format for the adducts column
             adduct_values.append(
                 [
@@ -877,159 +950,199 @@ def merge(self, **kwargs):
 
     self.consensus_map = consensus_map
 
-    # Add adduct grouping and adduct_of assignment
-    if len(self.consensus_df) > 0:
-        # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
-        adduct_rt_tol = rt_tol  # Use the same rt_tol from merge parameters
-        adduct_mz_tol = mz_tol  # Use the same mz_tol from merge parameters
-
-        # Initialize new columns
-        adduct_group_list = []
-        adduct_of_list = []
-
-        # Get relevant columns for grouping
-        consensus_data = []
-        for row in self.consensus_df.iter_rows(named=True):
-            consensus_data.append(
-                {
-                    "consensus_uid": row["consensus_uid"],
-                    "rt": row["rt"],
-                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
-                    "adduct_top": row.get("adduct_top"),
-                    "inty_mean": row.get("inty_mean", 0),
-                },
-            )
-
-        # Group features with similar neutral mass and RT
-        group_id = 1
-        assigned_groups = {}  # consensus_uid -> group_id
-        groups = {}  # group_id -> [consensus_uids]
-
-        for i, feature in enumerate(consensus_data):
-            consensus_uid = feature["consensus_uid"]
 
-            if consensus_uid in assigned_groups:
-                continue
+def _optimized_feature_lookup(study_obj, features_df):
+    """
+    Optimized feature lookup creation using Polars operations.
+    """
+    study_obj.logger.debug("Creating optimized feature lookup...")
+    start_time = time.time()
+
+    # Use Polars select for faster conversion
+    feature_columns = [
+        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
+        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
+        "ms2_scans", "adduct", "adduct_mass"
+    ]
+
+    # Filter to only existing columns
+    existing_columns = [col for col in feature_columns if col in features_df.columns]
+
+    # Convert to dictionary more efficiently
+    selected_df = features_df.select(existing_columns)
+
+    features_lookup = {}
+    for row in selected_df.iter_rows(named=True):
+        feature_uid = row["feature_uid"]
+        features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
+
+    lookup_time = time.time() - start_time
+    if len(features_lookup) > 50000:
+        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
+    return features_lookup
 
-            neutral_mass = feature["adduct_mass_neutral_top"]
-            rt = feature["rt"]
 
-            # Skip if neutral mass is None
-            if neutral_mass is None:
-                assigned_groups[consensus_uid] = 0  # No group assignment
+def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+    """
+    Optimized O(n log n) adduct grouping using spatial indexing.
+
+    Args:
+        study_obj: Study object with logger
+        consensus_data: List of consensus feature dictionaries
+        rt_tol: RT tolerance in minutes
+        mz_tol: m/z tolerance in Da
+
+    Returns:
+        Tuple of (adduct_group_list, adduct_of_list)
+    """
+    if not consensus_data:
+        return [], []
+
+    n_features = len(consensus_data)
+    if n_features > 1000:
+        study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
+
+    start_time = time.time()
+
+    # Build spatial index using RT and neutral mass as coordinates
+    features_by_mass = defaultdict(list)
+    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning
+
+    valid_features = []
+    for feature in consensus_data:
+        consensus_uid = feature["consensus_uid"]
+        rt = feature["rt"]
+        neutral_mass = feature.get("adduct_mass_neutral_top")
+        intensity = feature.get("inty_mean", 0)
+        adduct = feature.get("adduct_top", "")
+
+        if neutral_mass is not None:
+            mass_bin = int(neutral_mass / mass_bin_size)
+            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
+            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
+
+    # Union-Find for efficient grouping
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+            self.rank = [0] * n
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px == py:
+                return
+            if self.rank[px] < self.rank[py]:
+                px, py = py, px
+            self.parent[py] = px
+            if self.rank[px] == self.rank[py]:
+                self.rank[px] += 1
+
+    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
+    uf = UnionFind(len(valid_features))
+
+    # Find groups using spatial index
+    checked_pairs = set()
+    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
+        for bin_offset in [-1, 0, 1]:
+            check_bin = bin1 + bin_offset
+            if check_bin not in features_by_mass:
                 continue
-
-            # Find all features that could belong to the same group
-            group_members = [consensus_uid]
-
-            for j, other_feature in enumerate(consensus_data):
-                if i == j:
-                    continue
-
-                other_uid = other_feature["consensus_uid"]
-                if other_uid in assigned_groups:
+
+            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
+                if uid1 >= uid2:
                     continue
-
-                other_neutral_mass = other_feature["adduct_mass_neutral_top"]
-                other_rt = other_feature["rt"]
-
-                if other_neutral_mass is None:
+
+                pair = (min(uid1, uid2), max(uid1, uid2))
+                if pair in checked_pairs:
                     continue
-
-                # Check if features have similar neutral mass and RT
-                mass_diff = abs(neutral_mass - other_neutral_mass)
-                rt_diff = abs(rt - other_rt) / 60.0  # Convert to minutes for rt_tol
-
-                if mass_diff <= adduct_mz_tol and rt_diff <= adduct_rt_tol:
-                    group_members.append(other_uid)
-                    assigned_groups[other_uid] = group_id
-
-            if len(group_members) > 1:
-                # Multiple members - create a group
-                for member_uid in group_members:
-                    assigned_groups[member_uid] = group_id
-                groups[group_id] = group_members
-                group_id += 1
-            else:
-                # Single member - assign its own group
-                assigned_groups[consensus_uid] = group_id
-                groups[group_id] = [consensus_uid]
-                group_id += 1
-
-        # Determine adduct_of for each group
-        group_adduct_of = {}  # group_id -> consensus_uid of most important adduct
-
-        for grp_id, member_uids in groups.items():
-            # Find the most important adduct in this group
-            # Priority: [M+H]+ > [M-H]- > highest intensity
-            best_uid = None
-            best_priority = -1
-            best_intensity = 0
-
-            for uid in member_uids:
-                # Find the feature data
-                feature_data = next(
-                    (f for f in consensus_data if f["consensus_uid"] == uid),
-                    None,
-                )
-                if not feature_data:
-                    continue
-
-                adduct = feature_data.get("adduct_top", "")
-                intensity = feature_data.get("inty_mean", 0)
-
-                priority = 0
-                if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                    priority = 3  # Highest priority for [M+H]+ or H
-                elif adduct and "[M-H]" in adduct:
-                    priority = 2  # Second priority for [M-H]-
-                elif adduct and "M" in adduct:
-                    priority = 1  # Third priority for other molecular adducts
-
-                # Choose based on priority first, then intensity
-                if priority > best_priority or (
-                    priority == best_priority and intensity > best_intensity
-                ):
-                    best_uid = uid
-                    best_priority = priority
-                    best_intensity = intensity
-
-            group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
-        # Build the final lists in the same order as consensus_df
-        for row in self.consensus_df.iter_rows(named=True):
-            consensus_uid = row["consensus_uid"]
-            group = assigned_groups.get(consensus_uid, 0)
-            adduct_of = group_adduct_of.get(group, consensus_uid)
-
-            adduct_group_list.append(group)
-            adduct_of_list.append(adduct_of)
-
-        # Add the new columns to consensus_df
-        self.consensus_df = self.consensus_df.with_columns(
-            [
-                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
-                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
-            ],
-        )
-
-    # calculate the completeness of the consensus map
-    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
-        c = (
-            len(self.consensus_mapping_df)
-            / len(self.consensus_df)
-            / len(self.samples_df)
-        )
-        self.logger.info(
-            f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
-        )
-    else:
-        self.logger.warning(
-            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
-            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
-        )
-    if link_ms2:
-        self.find_ms2()
+                checked_pairs.add(pair)
+
+                mass_diff = abs(mass1 - mass2)
+                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes
+
+                if mass_diff <= mz_tol and rt_diff <= rt_tol:
+                    j = uid_to_idx[uid2]
+                    uf.union(i, j)
+
+    # Extract groups
+    groups_by_root = defaultdict(list)
+    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
+        root = uf.find(i)
+        groups_by_root[root].append((uid, rt, mass, inty, adduct))
+
+    groups = {}
+    group_id = 1
+    assigned_groups = {}
+
+    for group_members in groups_by_root.values():
+        member_uids = [uid for uid, _, _, _, _ in group_members]
+
+        for uid in member_uids:
+            assigned_groups[uid] = group_id
+        groups[group_id] = member_uids
+        group_id += 1
+
+    # Handle features without neutral mass
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        if uid not in assigned_groups:
+            assigned_groups[uid] = group_id
+            groups[group_id] = [uid]
+            group_id += 1
+
+    # Determine adduct_of for each group
+    group_adduct_of = {}
+    for grp_id, member_uids in groups.items():
+        best_uid = None
+        best_priority = -1
+        best_intensity = 0
+
+        for uid in member_uids:
+            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
+            if not feature_data:
+                continue
+
+            adduct = feature_data.get("adduct_top", "")
+            intensity = feature_data.get("inty_mean", 0)
+
+            priority = 0
+            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
+                priority = 3
+            elif adduct and "[M-H]" in adduct:
+                priority = 2
+            elif adduct and "M" in adduct:
+                priority = 1
+
+            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
+                best_uid = uid
+                best_priority = priority
+                best_intensity = intensity
+
+        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
+
+    # Build final lists in same order as consensus_data
+    adduct_group_list = []
+    adduct_of_list = []
+
+    for feature in consensus_data:
+        uid = feature["consensus_uid"]
+        group = assigned_groups.get(uid, 0)
+        adduct_of = group_adduct_of.get(group, uid)
+
+        adduct_group_list.append(group)
+        adduct_of_list.append(adduct_of)
+
+    grouping_time = time.time() - start_time
+    if n_features > 1000:
+        study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
+
+    return adduct_group_list, adduct_of_list
 
 
 # Backward compatibility alias
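
The rewritten `_optimized_adduct_grouping` replaces the old all-pairs scan, which was roughly O(n²), with mass-binned candidate generation plus union-find. A compact toy run of the same idea on three fabricated features, two of which share a neutral mass within tolerance:

from collections import defaultdict

# (uid, rt in seconds, neutral mass) - fabricated toy features
features = [(1, 60.0, 180.063), (2, 62.0, 180.065), (3, 300.0, 255.232)]
mz_tol, rt_tol = 0.01, 1.0  # Da, minutes (as in _optimized_adduct_grouping)

bin_size = mz_tol * 2
bins = defaultdict(list)
for idx, (_, _, mass) in enumerate(features):
    bins[int(mass / bin_size)].append(idx)

parent = list(range(len(features)))
def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

for idx, (uid, rt, mass) in enumerate(features):
    b = int(mass / bin_size)
    for nb in (b - 1, b, b + 1):  # neighbor bins catch boundary cases
        for jdx in bins.get(nb, []):
            if jdx <= idx:
                continue
            _, rt2, mass2 = features[jdx]
            if abs(mass - mass2) <= mz_tol and abs(rt - rt2) / 60.0 <= rt_tol:
                parent[find(jdx)] = find(idx)  # union

groups = defaultdict(list)
for idx, (uid, _, _) in enumerate(features):
    groups[find(idx)].append(uid)

print(sorted(map(sorted, groups.values())))  # [[1, 2], [3]]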
src/masster/study/study.py
@@ -119,6 +119,13 @@ from masster.study.processing import align
 from masster.study.processing import merge
 from masster.study.processing import integrate
 from masster.study.processing import find_ms2
+from masster.study.processing import _reset_consensus_data
+from masster.study.processing import _process_merge_parameters
+from masster.study.processing import _validate_merge_inputs
+from masster.study.processing import _perform_feature_grouping
+from masster.study.processing import _extract_consensus_features
+from masster.study.processing import _perform_adduct_grouping
+from masster.study.processing import _finalize_merge
 from masster.study.parameters import store_history
 from masster.study.parameters import get_parameters
 from masster.study.parameters import update_parameters
@@ -490,6 +497,15 @@ class Study:
     _format_adduct_name = _format_adduct_name
     _parse_element_counts = _parse_element_counts
 
+    # === Merge Helper Methods ===
+    _reset_consensus_data = _reset_consensus_data
+    _process_merge_parameters = _process_merge_parameters
+    _validate_merge_inputs = _validate_merge_inputs
+    _perform_feature_grouping = _perform_feature_grouping
+    _extract_consensus_features = _extract_consensus_features
+    _perform_adduct_grouping = _perform_adduct_grouping
+    _finalize_merge = _finalize_merge
+
     # === Default Parameters ===
     study_defaults = study_defaults
    align_defaults = align_defaults
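
The assignments above follow the pattern masster already uses throughout `Study` and `Sample`: methods are written as module-level functions whose first parameter is `self`, then bound by assignment inside the class body. A minimal sketch of the mechanism, with hypothetical names:

# A module-level function written like a method: first parameter is self.
def _reset(self):
    self.items = []

class Container:
    # Assigning the function in the class body makes it an ordinary method.
    _reset = _reset

    def __init__(self):
        self._reset()

c = Container()
print(c.items)  # []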
@@ -587,6 +603,51 @@ class Study:
         except Exception as e:
             self.logger.error(f"Failed to reload current module {current_module}: {e}")
 
+    def __dir__(self):
+        """
+        Custom __dir__ implementation to hide internal methods starting with '_'
+        and backward compatibility aliases from tab completion and dir() calls,
+        while keeping them accessible to class methods.
+
+        Returns:
+            list: List of public attribute and method names (excluding internal and deprecated methods)
+        """
+        # Define backward compatibility aliases to hide
+        backward_compatibility_aliases = {
+            'add_folder',  # alias for add
+            'find_consensus',  # alias for merge
+            'integrate_chrom',  # alias for integrate
+            'fill_chrom',  # alias for fill
+            'fill_chrom_single',  # alias for fill_single
+            'filter_consensus',  # alias for consensus_filter
+            'select_consensus',  # alias for consensus_select
+            'filter_features',  # alias for features_filter
+            'select_features',  # alias for features_select
+            'consensus_find',  # alias for merge
+        }
+
+        # Get all attributes from the class
+        all_attrs = set()
+
+        # Add attributes from the class and all its bases
+        for cls in self.__class__.__mro__:
+            all_attrs.update(cls.__dict__.keys())
+
+        # Add instance attributes
+        all_attrs.update(self.__dict__.keys())
+
+        # Filter out attributes starting with '_' (but keep special methods like __init__, __str__, etc.)
+        # Also filter out backward compatibility aliases
+        public_attrs = [
+            attr for attr in all_attrs
+            if not attr.startswith('_') or attr.startswith('__') and attr.endswith('__')
+        ]
+
+        # Remove backward compatibility aliases from the public attributes
+        public_attrs = [attr for attr in public_attrs if attr not in backward_compatibility_aliases]
+
+        return sorted(public_attrs)
+
     def __str__(self):
         """
         Returns a string representation of the study.
uv.lock
@@ -1374,7 +1374,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.4.12"
+version = "0.4.14"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },