masster 0.4.13__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.
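
The bulk of this diff is the removal of the consensus-merging code from the Study module: the merge() method, its private helpers (_reset_consensus_data, _process_merge_parameters, _validate_merge_inputs, _perform_feature_grouping, _extract_consensus_features), the module-level _optimized_feature_lookup and _optimized_adduct_grouping functions, and the find_consensus alias, along with the now-unused merge_defaults, defaultdict, and time imports. For orientation only, below is a minimal sketch of how the removed 0.4.13 API was typically invoked, based on the removed docstring; the import path and the construction of the Study object are assumptions, not something this diff shows.

    from masster import Study   # assumed import path, not shown in this diff

    study = Study(...)           # hypothetical: samples loaded and features detected beforehand
    study.align()                # RT alignment (its context lines remain unchanged in 0.4.16)
    study.merge(                 # removed in 0.4.16
        algorithm="kd",          # 'qt', 'kd', 'unlabeled', or 'sequential'
        min_samples=2,           # minimum samples per consensus feature
        mz_tol=0.01,             # m/z tolerance in Da
        rt_tol=1.0,              # RT tolerance in seconds
        link_ms2=True,           # attach MS2 spectra via find_ms2()
    )
    print(study.consensus_df)    # consensus feature table built by merge()

Code that calls merge() or find_consensus on this module should be checked against 0.4.16 before upgrading, since this diff removes both names from the file.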

@@ -1,8 +1,6 @@
  from __future__ import annotations
 
  from datetime import datetime
- from collections import defaultdict
- import time
 
  import numpy as np
  import polars as pl
@@ -14,7 +12,6 @@ from masster.study.defaults import (
  align_defaults,
  find_ms2_defaults,
  integrate_defaults,
- merge_defaults,
  )
 
 
@@ -250,877 +247,6 @@ def align(self, **kwargs):
  self.save_samples()
 
 
- def merge(self, **kwargs):
- """Group features across samples into consensus features.
-
- Parameters can be provided as a ``merge_defaults`` instance or as
- individual keyword arguments; they are validated against the defaults class.
-
- Key parameters (from ``merge_defaults``):
- - algorithm (str): Grouping algorithm to use ('qt', 'kd', 'unlabeled', 'sequential').
- - min_samples (int): Minimum number of samples required for a consensus feature.
- - link_ms2 (bool): Whether to attach/link MS2 spectra to consensus features.
- - mz_tol (float): m/z tolerance for grouping (Da).
- - rt_tol (float): RT tolerance for grouping (seconds).
- """
- # Initialize
- self._reset_consensus_data()
- self.logger.info("Merging...")
-
- # Process parameters
- params = self._process_merge_parameters(**kwargs)
- algorithm = params.get("algorithm")
- min_samples = params.get("min_samples")
- link_ms2 = params.get("link_ms2")
- mz_tol = kwargs.get("mz_tol", 0.01)
- rt_tol = kwargs.get("rt_tol", 1.0)
-
- # Validate and prepare
- self._validate_merge_inputs(algorithm)
-
- # Perform feature grouping using OpenMS
- consensus_map = self._perform_feature_grouping(algorithm, params, mz_tol, rt_tol)
-
- # Extract consensus features and build metadata
- self._extract_consensus_features(consensus_map, min_samples)
-
- # Perform adduct grouping optimization
- self._perform_adduct_grouping(rt_tol, mz_tol)
-
- # Complete merge process
- self._finalize_merge(link_ms2, min_samples)
-
- def _reset_consensus_data(self):
- """Reset consensus-related DataFrames at the start of merge."""
- self.consensus_df = pl.DataFrame()
- self.consensus_ms2 = pl.DataFrame()
- self.consensus_mapping_df = pl.DataFrame()
-
- def _process_merge_parameters(self, **kwargs):
- """Process and validate merge parameters."""
- params = merge_defaults()
- for key, value in kwargs.items():
- if isinstance(value, merge_defaults):
- params = value
- self.logger.debug("Using provided merge_defaults parameters")
- else:
- if hasattr(params, key):
- if params.set(key, value, validate=True):
- self.logger.debug(f"Updated parameter {key} = {value}")
- else:
- self.logger.warning(
- f"Failed to set parameter {key} = {value} (validation failed)",
- )
- else:
- self.logger.debug(f"Unknown parameter {key} ignored")
-
- # Store parameters in the Study object
- self.store_history(["merge"], params.to_dict())
- self.logger.debug("Parameters stored to merge")
- return params
-
- def _validate_merge_inputs(self, algorithm):
- """Validate merge inputs and provide warnings for performance."""
- if len(self.samples_df) > 200 and algorithm == "qt":
- self.logger.warning(
- "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
- )
-
- # Check that features_maps is not empty
- if not self.features_maps or len(self.features_maps) == 0:
- self.load_features()
-
- def _perform_feature_grouping(self, algorithm, params, mz_tol, rt_tol):
- """Perform feature grouping using OpenMS algorithms."""
- params_oms = oms.Param()
- ## TODO expose these
-
- feature_grouper: object # Use generic type for different OpenMS algorithms
- match algorithm.lower():
- case "kd":
- feature_grouper = oms.FeatureGroupingAlgorithmKD()
- self.logger.debug("Merging features with KDTree...")
- params_oms.setValue("mz_unit", "Da")
- params_oms.setValue("nr_partitions", len(self.samples_df))
-
- params_oms.setValue("warp:enabled", "true")
- params_oms.setValue("warp:rt_tol", rt_tol)
- params_oms.setValue("warp:mz_tol", mz_tol)
-
- params_oms.setValue("link:rt_tol", rt_tol)
- params_oms.setValue("link:mz_tol", mz_tol)
- case "unlabeled":
- feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
- self.logger.debug("Merging features with Unlabelled algorithm...")
- params_oms.setValue("second_nearest_gap", 2.0)
- params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
- params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
- params_oms.setValue("distance_MZ:unit", "Da")
- case "sequential":
- self.logger.debug(
- "Merging features sequentially with Unlabelled algorithm...",
- )
- params_oms.setValue("second_nearest_gap", 2.0)
- params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
- params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
- params_oms.setValue("distance_MZ:unit", "Da")
- case "qt":
- feature_grouper = oms.FeatureGroupingAlgorithmQT()
- self.logger.debug("Grouping features with QT...")
- params_oms.setValue("nr_partitions", len(self.samples_df))
- params_oms.setValue("ignore_charge", "true")
- params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
- params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
- params_oms.setValue("distance_MZ:unit", "Da")
-
- self.logger.debug(f"Parameters for feature grouping: {params_oms}")
-
- # Create consensus map and set up file descriptions
- consensus_map = oms.ConsensusMap()
- file_descriptions = consensus_map.getColumnHeaders() # type: ignore
- feature_maps = self.features_maps
- for i, feature_map in enumerate(feature_maps):
- file_description = file_descriptions.get(i, oms.ColumnHeader())
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
- file_description.size = feature_map.size()
- file_description.unique_id = feature_map.getUniqueId()
- file_descriptions[i] = file_description
-
- consensus_map.setColumnHeaders(file_descriptions) # type: ignore
-
- # Execute the grouping algorithm
- match algorithm.lower():
- case "sequential":
- # set the reference map to self.alignment_ref_index
- if self.alignment_ref_index is None:
- # pick the feature map with the most features as reference
- self.alignment_ref_index = max(
- range(len(self.features_maps)),
- key=lambda i: self.features_maps[i].size(),
- )
- feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
- feature_grouper.setParameters(params_oms)
- feature_grouper.setReference(self.alignment_ref_index)
- self.logger.debug(f"Sequential mode: reference map = {self.alignment_ref_index}")
-
- # Group features sequentially
- for i in range(len(feature_maps)):
- if i == self.alignment_ref_index:
- continue
- temp_feature_maps = [feature_maps[self.alignment_ref_index], feature_maps[i]]
- temp_consensus_map = oms.ConsensusMap()
- feature_grouper.group(temp_feature_maps, temp_consensus_map)
- # Merge temp_consensus_map into consensus_map
- # This is a simplified approach - proper sequential grouping would be more complex
- case _:
- feature_grouper.setParameters(params_oms)
- feature_grouper.group(feature_maps, consensus_map)
-
- return consensus_map
-
- def _extract_consensus_features(self, consensus_map, min_samples):
- """Extract consensus features and build metadata."""
- # create a dict to map uid to feature_uid using self.features_df
- feature_uid_map = {
- row["feature_id"]: row["feature_uid"]
- for row in self.features_df.iter_rows(named=True)
- }
- imax = consensus_map.size()
-
- self.logger.info(f"Merging completed with {imax} consensus features.")
-
- # Pre-build fast lookup tables for features_df data using optimized approach
- features_lookup = _optimized_feature_lookup(self, self.features_df)
-
- # create a list to store the consensus mapping
- consensus_mapping = []
- metadata_list = []
-
- tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
-
- for i, feature in enumerate(
- tqdm(
- consensus_map,
- total=imax,
- disable=tqdm_disable,
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
- ),
- ):
- # get all features in the feature map with the same unique id as the consensus feature
- features_list = feature.getFeatureList()
- uids = []
- feature_data_list = []
-
- for _j, f in enumerate(features_list):
- fuid = str(f.getUniqueId())
- if fuid not in feature_uid_map:
- # this is a feature that was removed but is still in the feature maps
- continue
- fuid = feature_uid_map[fuid]
- consensus_mapping.append(
- {
- "consensus_uid": i,
- "sample_uid": f.getMapIndex() + 1,
- "feature_uid": fuid,
- },
- )
- uids.append(fuid)
-
- # Get feature data from lookup instead of DataFrame filtering
- feature_data = features_lookup.get(fuid)
- if feature_data:
- feature_data_list.append(feature_data)
-
- if not feature_data_list:
- # Skip this consensus feature if no valid features found
- continue
-
- # Compute statistics using vectorized operations on collected data
- # Convert to numpy arrays for faster computation
- rt_values = np.array(
- [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
- )
- mz_values = np.array(
- [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
- )
- rt_start_values = np.array(
- [
- fd.get("rt_start", 0)
- for fd in feature_data_list
- if fd.get("rt_start") is not None
- ],
- )
- rt_end_values = np.array(
- [
- fd.get("rt_end", 0)
- for fd in feature_data_list
- if fd.get("rt_end") is not None
- ],
- )
- rt_delta_values = np.array(
- [
- fd.get("rt_delta", 0)
- for fd in feature_data_list
- if fd.get("rt_delta") is not None
- ],
- )
- mz_start_values = np.array(
- [
- fd.get("mz_start", 0)
- for fd in feature_data_list
- if fd.get("mz_start") is not None
- ],
- )
- mz_end_values = np.array(
- [
- fd.get("mz_end", 0)
- for fd in feature_data_list
- if fd.get("mz_end") is not None
- ],
- )
- inty_values = np.array(
- [
- fd.get("inty", 0)
- for fd in feature_data_list
- if fd.get("inty") is not None
- ],
- )
- coherence_values = np.array(
- [
- fd.get("chrom_coherence", 0)
- for fd in feature_data_list
- if fd.get("chrom_coherence") is not None
- ],
- )
- prominence_values = np.array(
- [
- fd.get("chrom_prominence", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence") is not None
- ],
- )
- prominence_scaled_values = np.array(
- [
- fd.get("chrom_prominence_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_prominence_scaled") is not None
- ],
- )
- height_scaled_values = np.array(
- [
- fd.get("chrom_height_scaled", 0)
- for fd in feature_data_list
- if fd.get("chrom_height_scaled") is not None
- ],
- )
- iso_values = np.array(
- [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
- )
- charge_values = np.array(
- [
- fd.get("charge", 0)
- for fd in feature_data_list
- if fd.get("charge") is not None
- ],
- )
-
- # adduct_values
- # Collect all adducts from feature_data_list to create consensus adduct information
- # Only consider adducts that are in study._get_adducts() plus items with '?'
- all_adducts = []
- adduct_masses = {}
-
- # Get valid adducts from study._get_adducts()
- valid_adducts = set()
- try:
- study_adducts_df = self._get_adducts()
- if not study_adducts_df.is_empty():
- valid_adducts.update(study_adducts_df["name"].to_list())
- except Exception as e:
- self.logger.warning(f"Could not retrieve study adducts: {e}")
-
- # Always allow '?' adducts
- valid_adducts.add("?")
-
- for fd in feature_data_list:
- # Get individual adduct and mass from each feature data (fd)
- adduct = fd.get("adduct")
- adduct_mass = fd.get("adduct_mass")
-
- if adduct is not None:
- # Only include adducts that are valid (from study._get_adducts() or contain '?')
- if adduct in valid_adducts or "?" in adduct:
- all_adducts.append(adduct)
- if adduct_mass is not None:
- adduct_masses[adduct] = adduct_mass
-
- # Calculate adduct_values for the consensus feature
- adduct_values = []
- if all_adducts:
- adduct_counts = {
- adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
- }
- total_count = sum(adduct_counts.values())
- for adduct, count in adduct_counts.items():
- percentage = (count / total_count) * 100 if total_count > 0 else 0
- mass = adduct_masses.get(adduct, None)
- # Store as list with [name, num, %] format for the adducts column
- adduct_values.append(
- [
- str(adduct),
- int(count),
- float(round(percentage, 2)),
- ],
- )
-
- # Sort adduct_values by count in descending order
- adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
- # Store adduct_values for use in metadata
- consensus_adduct_values = adduct_values
-
- # Extract top adduct information for new columns
- adduct_top = None
- adduct_charge_top = None
- adduct_mass_neutral_top = None
- adduct_mass_shift_top = None
-
- if consensus_adduct_values:
- top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
- adduct_top = top_adduct_name
-
- # Parse adduct information to extract charge and mass shift
- # Handle "?" as "H" and parse common adduct formats
- if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
- adduct_charge_top = 1
- adduct_mass_shift_top = 1.007825 # H mass
- elif top_adduct_name == "[M+?]-":
- adduct_charge_top = -1
- adduct_mass_shift_top = -1.007825 # -H mass
- else:
- # Try to get charge and mass shift from study._get_adducts()
- adduct_found = False
- try:
- study_adducts_df = self._get_adducts()
- if not study_adducts_df.is_empty():
- # Look for exact match in study adducts
- matching_adduct = study_adducts_df.filter(
- pl.col("name") == top_adduct_name,
- )
- if not matching_adduct.is_empty():
- adduct_row = matching_adduct.row(0, named=True)
- adduct_charge_top = adduct_row["charge"]
- adduct_mass_shift_top = adduct_row["mass_shift"]
- adduct_found = True
- except Exception as e:
- self.logger.warning(
- f"Could not lookup adduct in study adducts: {e}",
- )
-
- if not adduct_found:
- # Fallback to regex parsing
- import re
-
- # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
- pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
- match = re.match(pattern, top_adduct_name)
-
- if match:
- sign = match.group(1)
- element = match.group(2)
- multiplier_str = match.group(3)
- charge_sign = match.group(4)
-
- multiplier = int(multiplier_str) if multiplier_str else 1
- charge = multiplier if charge_sign == "+" else -multiplier
- adduct_charge_top = charge
-
- # Calculate mass shift based on element
- element_masses = {
- "H": 1.007825,
- "Na": 22.989769,
- "K": 38.963708,
- "NH4": 18.033823,
- "Li": 7.016930,
- "Cl": 34.969401,
- "Br": 78.918885,
- "HCOO": 44.998201,
- "CH3COO": 59.013851,
- "H2O": 18.010565,
- }
-
- base_mass = element_masses.get(
- element,
- 1.007825,
- ) # Default to H if unknown
- mass_shift = (
- base_mass * multiplier
- if sign == "+"
- else -base_mass * multiplier
- )
- adduct_mass_shift_top = mass_shift
- else:
- # Default fallback
- adduct_charge_top = 1
- adduct_mass_shift_top = 1.007825
- else:
- # No valid adducts found - assign default based on study polarity
- study_polarity = getattr(self, "polarity", "positive")
- if study_polarity in ["negative", "neg"]:
- # Negative mode default
- adduct_top = "[M-?]1-"
- adduct_charge_top = -1
- adduct_mass_shift_top = -1.007825 # -H mass (loss of proton)
- else:
- # Positive mode default (includes 'positive', 'pos', or any other value)
- adduct_top = "[M+?]1+"
- adduct_charge_top = 1
- adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
-
- # Create a single default adduct entry in the adducts list for consistency
- consensus_adduct_values = [[adduct_top, 1, 100.0]]
-
- # Calculate neutral mass from consensus mz (for both cases)
- consensus_mz = (
- round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
- )
- if adduct_charge_top and adduct_mass_shift_top is not None:
- adduct_mass_neutral_top = (
- consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
- )
-
- # Calculate number of MS2 spectra
- ms2_count = 0
- for fd in feature_data_list:
- ms2_scans = fd.get("ms2_scans")
- if ms2_scans is not None:
- ms2_count += len(ms2_scans)
-
- metadata_list.append(
- {
- "consensus_uid": int(i), # "consensus_id": i,
- "consensus_id": str(feature.getUniqueId()),
- "quality": round(float(feature.getQuality()), 3),
- "number_samples": len(feature_data_list),
- # "number_ext": int(len(features_list)),
- "rt": round(float(np.mean(rt_values)), 4)
- if len(rt_values) > 0
- else 0.0,
- "mz": round(float(np.mean(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "rt_min": round(float(np.min(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_max": round(float(np.max(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_mean": round(float(np.mean(rt_values)), 3)
- if len(rt_values) > 0
- else 0.0,
- "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
- if len(rt_start_values) > 0
- else 0.0,
- "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
- if len(rt_end_values) > 0
- else 0.0,
- "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
- if len(rt_delta_values) > 0
- else 0.0,
- "mz_min": round(float(np.min(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_max": round(float(np.max(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_mean": round(float(np.mean(mz_values)), 4)
- if len(mz_values) > 0
- else 0.0,
- "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
- if len(mz_start_values) > 0
- else 0.0,
- "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
- if len(mz_end_values) > 0
- else 0.0,
- "inty_mean": round(float(np.mean(inty_values)), 0)
- if len(inty_values) > 0
- else 0.0,
- "bl": -1.0,
- "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
- if len(coherence_values) > 0
- else 0.0,
- "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
- if len(prominence_values) > 0
- else 0.0,
- "chrom_prominence_scaled_mean": round(
- float(np.mean(prominence_scaled_values)),
- 3,
- )
- if len(prominence_scaled_values) > 0
- else 0.0,
- "chrom_height_scaled_mean": round(
- float(np.mean(height_scaled_values)),
- 3,
- )
- if len(height_scaled_values) > 0
- else 0.0,
- "iso_mean": round(float(np.mean(iso_values)), 2)
- if len(iso_values) > 0
- else 0.0,
- "charge_mean": round(float(np.mean(charge_values)), 2)
- if len(charge_values) > 0
- else 0.0,
- "number_ms2": int(ms2_count),
- "adducts": consensus_adduct_values
- if consensus_adduct_values
- else [], # Ensure it's always a list
- # New columns for top-ranked adduct information
- "adduct_top": adduct_top,
- "adduct_charge_top": adduct_charge_top,
- "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
- if adduct_mass_neutral_top is not None
- else None,
- "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
- if adduct_mass_shift_top is not None
- else None,
- # New columns for top-scoring identification results
- "id_top_name": None,
- "id_top_class": None,
- "id_top_adduct": None,
- "id_top_score": None,
- },
- )
-
- consensus_mapping_df = pl.DataFrame(consensus_mapping)
- # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
- l1 = len(consensus_mapping_df)
- consensus_mapping_df = consensus_mapping_df.filter(
- pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
- )
- self.logger.debug(
- f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
- )
- self.consensus_mapping_df = consensus_mapping_df
- self.consensus_df = pl.DataFrame(metadata_list, strict=False)
-
- if min_samples is None:
- min_samples = 1
- if min_samples < 1:
- min_samples = int(min_samples * len(self.samples_df))
-
- # Validate that min_samples doesn't exceed the number of samples
- if min_samples > len(self.samples_df):
- self.logger.warning(
- f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
- f"Setting min_samples to {len(self.samples_df)}.",
- )
- min_samples = len(self.samples_df)
-
- # filter out consensus features with less than min_samples features
- l1 = len(self.consensus_df)
- self.consensus_df = self.consensus_df.filter(
- pl.col("number_samples") >= min_samples,
- )
- self.logger.debug(
- f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
- )
- # filter out consensus mapping with less than min_samples features
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
- pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
- )
-
- self.consensus_map = consensus_map
-
- # Add adduct grouping and adduct_of assignment
- if len(self.consensus_df) > 0:
- # Get rt_tol and mz_tol from kwargs or use defaults from merge_defaults
- adduct_rt_tol = rt_tol # Use the same rt_tol from merge parameters
- adduct_mz_tol = mz_tol # Use the same mz_tol from merge parameters
-
- # Get relevant columns for grouping
- consensus_data = []
- for row in self.consensus_df.iter_rows(named=True):
- consensus_data.append(
- {
- "consensus_uid": row["consensus_uid"],
- "rt": row["rt"],
- "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
- "adduct_top": row.get("adduct_top"),
- "inty_mean": row.get("inty_mean", 0),
- },
- )
-
- # Use optimized adduct grouping
- adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
- self, consensus_data, adduct_rt_tol, adduct_mz_tol
- )
-
- # Add the new columns to consensus_df
- self.consensus_df = self.consensus_df.with_columns(
- [
- pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
- pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
- ],
- )
-
- # calculate the completeness of the consensus map
- if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
- c = (
- len(self.consensus_mapping_df)
- / len(self.consensus_df)
- / len(self.samples_df)
- )
- self.logger.info(
- f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
- )
- else:
- self.logger.warning(
- f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
- f"This may be due to min_samples ({min_samples}) being too high for the available data.",
- )
- if link_ms2:
- self.find_ms2()
-
-
- def _optimized_feature_lookup(study_obj, features_df):
- """
- Optimized feature lookup creation using Polars operations.
- """
- study_obj.logger.debug("Creating optimized feature lookup...")
- start_time = time.time()
-
- # Use Polars select for faster conversion
- feature_columns = [
- "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
- "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
- "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
- "ms2_scans", "adduct", "adduct_mass"
- ]
-
- # Filter to only existing columns
- existing_columns = [col for col in feature_columns if col in features_df.columns]
-
- # Convert to dictionary more efficiently
- selected_df = features_df.select(existing_columns)
-
- features_lookup = {}
- for row in selected_df.iter_rows(named=True):
- feature_uid = row["feature_uid"]
- features_lookup[feature_uid] = {k: v for k, v in row.items() if k != "feature_uid"}
-
- lookup_time = time.time() - start_time
- if len(features_lookup) > 50000:
- study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
- return features_lookup
-
-
- def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
- """
- Optimized O(n log n) adduct grouping using spatial indexing.
-
- Args:
- study_obj: Study object with logger
- consensus_data: List of consensus feature dictionaries
- rt_tol: RT tolerance in minutes
- mz_tol: m/z tolerance in Da
-
- Returns:
- Tuple of (adduct_group_list, adduct_of_list)
- """
- if not consensus_data:
- return [], []
-
- n_features = len(consensus_data)
- if n_features > 1000:
- study_obj.logger.info(f"Optimizing adduct grouping for {n_features} consensus features...")
-
- start_time = time.time()
-
- # Build spatial index using RT and neutral mass as coordinates
- features_by_mass = defaultdict(list)
- mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
-
- valid_features = []
- for feature in consensus_data:
- consensus_uid = feature["consensus_uid"]
- rt = feature["rt"]
- neutral_mass = feature.get("adduct_mass_neutral_top")
- intensity = feature.get("inty_mean", 0)
- adduct = feature.get("adduct_top", "")
-
- if neutral_mass is not None:
- mass_bin = int(neutral_mass / mass_bin_size)
- features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
- valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
-
- # Union-Find for efficient grouping
- class UnionFind:
- def __init__(self, n):
- self.parent = list(range(n))
- self.rank = [0] * n
-
- def find(self, x):
- if self.parent[x] != x:
- self.parent[x] = self.find(self.parent[x])
- return self.parent[x]
-
- def union(self, x, y):
- px, py = self.find(x), self.find(y)
- if px == py:
- return
- if self.rank[px] < self.rank[py]:
- px, py = py, px
- self.parent[py] = px
- if self.rank[px] == self.rank[py]:
- self.rank[px] += 1
-
- uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
- uf = UnionFind(len(valid_features))
-
- # Find groups using spatial index
- checked_pairs = set()
- for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
- for bin_offset in [-1, 0, 1]:
- check_bin = bin1 + bin_offset
- if check_bin not in features_by_mass:
- continue
-
- for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
- if uid1 >= uid2:
- continue
-
- pair = (min(uid1, uid2), max(uid1, uid2))
- if pair in checked_pairs:
- continue
- checked_pairs.add(pair)
-
- mass_diff = abs(mass1 - mass2)
- rt_diff = abs(rt1 - rt2) / 60.0 # Convert to minutes
-
- if mass_diff <= mz_tol and rt_diff <= rt_tol:
- j = uid_to_idx[uid2]
- uf.union(i, j)
-
- # Extract groups
- groups_by_root = defaultdict(list)
- for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
- root = uf.find(i)
- groups_by_root[root].append((uid, rt, mass, inty, adduct))
-
- groups = {}
- group_id = 1
- assigned_groups = {}
-
- for group_members in groups_by_root.values():
- member_uids = [uid for uid, _, _, _, _ in group_members]
-
- for uid in member_uids:
- assigned_groups[uid] = group_id
- groups[group_id] = member_uids
- group_id += 1
-
- # Handle features without neutral mass
- for feature in consensus_data:
- uid = feature["consensus_uid"]
- if uid not in assigned_groups:
- assigned_groups[uid] = group_id
- groups[group_id] = [uid]
- group_id += 1
-
- # Determine adduct_of for each group
- group_adduct_of = {}
- for grp_id, member_uids in groups.items():
- best_uid = None
- best_priority = -1
- best_intensity = 0
-
- for uid in member_uids:
- feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
- if not feature_data:
- continue
-
- adduct = feature_data.get("adduct_top", "")
- intensity = feature_data.get("inty_mean", 0)
-
- priority = 0
- if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
- priority = 3
- elif adduct and "[M-H]" in adduct:
- priority = 2
- elif adduct and "M" in adduct:
- priority = 1
-
- if priority > best_priority or (priority == best_priority and intensity > best_intensity):
- best_uid = uid
- best_priority = priority
- best_intensity = intensity
-
- group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
-
- # Build final lists in same order as consensus_data
- adduct_group_list = []
- adduct_of_list = []
-
- for feature in consensus_data:
- uid = feature["consensus_uid"]
- group = assigned_groups.get(uid, 0)
- adduct_of = group_adduct_of.get(group, uid)
-
- adduct_group_list.append(group)
- adduct_of_list.append(adduct_of)
-
- grouping_time = time.time() - start_time
- if n_features > 1000:
- study_obj.logger.info(f"Adduct grouping completed in {grouping_time:.2f}s ({len(groups)} groups)")
-
- return adduct_group_list, adduct_of_list
-
-
- # Backward compatibility alias
- find_consensus = merge
-
-
  def find_ms2(self, **kwargs):
  """
  Links MS2 spectra to consensus features and stores the result in self.consensus_ms2.