masster 0.5.7-py3-none-any.whl → 0.5.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/merge.py CHANGED
@@ -441,8 +441,7 @@ def merge(study, **kwargs) -> None:
441
441
  cached_adducts_df = None
442
442
  cached_valid_adducts = None
443
443
  try:
444
- from masster.study.id import _get_adducts
445
- cached_adducts_df = _get_adducts(study)
444
+ cached_adducts_df = study._get_adducts()
446
445
  if not cached_adducts_df.is_empty():
447
446
  cached_valid_adducts = set(cached_adducts_df["name"].to_list())
448
447
  else:
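
The hunk above replaces the imported _get_adducts(study) helper with the study._get_adducts() method. A minimal sketch of the cached-lookup pattern the surrounding context implements, assuming _get_adducts() returns a Polars DataFrame with a "name" column (as the .is_empty() and ["name"] calls suggest); load_adduct_cache is an illustrative name, not a masster API:

    def load_adduct_cache(study):
        # Resolve the adduct catalog once and keep the set of valid names around;
        # fall back to an empty cache if the catalog cannot be built.
        cached_adducts_df = None
        cached_valid_adducts = None
        try:
            cached_adducts_df = study._get_adducts()
            if cached_adducts_df is not None and not cached_adducts_df.is_empty():
                cached_valid_adducts = set(cached_adducts_df["name"].to_list())
        except Exception as exc:
            study.logger.warning(f"Adduct catalog unavailable, merging without it: {exc}")
        return cached_adducts_df, cached_valid_adducts
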
@@ -819,7 +818,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
819
818
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
820
819
  completed_chunks += 1
821
820
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
822
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
821
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
823
822
  except Exception as exc:
824
823
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
825
824
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -853,7 +852,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
853
852
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
854
853
  completed_chunks += 1
855
854
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
856
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
855
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
857
856
  except Exception as exc:
858
857
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
859
858
  raise exc
@@ -994,7 +993,7 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
994
993
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
995
994
  completed_chunks += 1
996
995
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
997
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
996
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
998
997
  except Exception as exc:
999
998
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1000
999
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -1028,7 +1027,7 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
1028
1027
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1029
1028
  completed_chunks += 1
1030
1029
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1031
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1030
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1032
1031
  except Exception as exc:
1033
1032
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1034
1033
  raise exc
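
The four hunks above only swap the chunk-completion message from logger.info to logger.success. For orientation, a generic sketch of the submit/collect loop this message lives in, assuming a loguru-style logger that provides a success level; run_chunks, process_chunk, and the chunk payload layout are placeholders rather than masster's actual worker:

    from concurrent.futures import ProcessPoolExecutor, as_completed
    from concurrent.futures.process import BrokenProcessPool

    def run_chunks(study, chunk_data_list, process_chunk):
        serialized_chunk_results = []
        completed_chunks = 0
        total_chunks = len(chunk_data_list)
        with ProcessPoolExecutor() as pool:
            futures = {pool.submit(process_chunk, chunk): idx
                       for idx, chunk in enumerate(chunk_data_list)}
            for future in as_completed(futures):
                chunk_idx = futures[future]
                try:
                    chunk_start_idx, consensus_features = future.result()
                    serialized_chunk_results.append((chunk_start_idx, consensus_features))
                    completed_chunks += 1
                    n_samples = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
                    study.logger.success(
                        f"Completed chunk {completed_chunks}/{total_chunks} "
                        f"(samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples})"
                    )
                except Exception as exc:
                    if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
                        # Windows multiprocessing can break the pool; a real
                        # implementation would re-run the chunk sequentially here.
                        study.logger.warning(f"Process pool broke on chunk {chunk_idx}: {exc}")
                        continue
                    study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                    raise
        return serialized_chunk_results
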
@@ -2258,6 +2257,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2258
2257
  {
2259
2258
  "consensus_uid": row["consensus_uid"],
2260
2259
  "rt": row["rt"],
2260
+ "mz": row["mz"], # Add missing mz field
2261
2261
  "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
2262
2262
  "adduct_top": row.get("adduct_top"),
2263
2263
  "inty_mean": row.get("inty_mean", 0),
@@ -2265,8 +2265,9 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2265
2265
  )
2266
2266
 
2267
2267
  # Use optimized adduct grouping
2268
+ study.logger.info(f"About to call adduct grouping for {len(consensus_data)} consensus features")
2268
2269
  adduct_group_list, adduct_of_list = __merge_adduct_grouping(
2269
- study, consensus_data, rt_tol, mz_tol
2270
+ study, consensus_data, rt_tol/3, mz_tol
2270
2271
  )
2271
2272
 
2272
2273
  # Add the new columns to consensus_df
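
The two hunks above add the previously missing mz field to each consensus row and call the grouping helper with a three-fold tighter RT window (rt_tol / 3). A sketch of the row construction, assuming consensus_df is a Polars DataFrame carrying the columns referenced in the context; build_consensus_data is an illustrative name:

    def build_consensus_data(study):
        consensus_data = []
        for row in study.consensus_df.iter_rows(named=True):
            consensus_data.append(
                {
                    "consensus_uid": row["consensus_uid"],
                    "rt": row["rt"],
                    "mz": row["mz"],  # needed by the grouping added in 0.5.9
                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                    "adduct_top": row.get("adduct_top"),
                    "inty_mean": row.get("inty_mean", 0),
                }
            )
        return consensus_data

The grouping call then becomes __merge_adduct_grouping(study, consensus_data, rt_tol / 3, mz_tol), i.e. adduct grouping now demands closer coelution than consensus merging does.
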
@@ -2713,8 +2714,6 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2713
2714
  cached_adducts_df: Pre-computed adducts DataFrame for performance
2714
2715
  """
2715
2716
  import polars as pl
2716
- import numpy as np
2717
- from collections import defaultdict
2718
2717
 
2719
2718
  # Check if consensus_df exists and has features
2720
2719
  if len(study.consensus_df) == 0:
@@ -2727,8 +2726,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2727
2726
  if cached_adducts_df is None or cached_adducts_df.is_empty():
2728
2727
  try:
2729
2728
  # Use lower min_probability for better adduct coverage in mass shift identification
2730
- from masster.study.id import _get_adducts
2731
- cached_adducts_df = _get_adducts(study, min_probability=0.01)
2729
+ cached_adducts_df = study._get_adducts(min_probability=0.01)
2732
2730
  except Exception as e:
2733
2731
  study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
2734
2732
  return
@@ -2822,9 +2820,8 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2822
2820
  mz1 = feature1["mz"]
2823
2821
  adduct1 = feature1["adduct_top"]
2824
2822
 
2825
- # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
2826
- # if adduct1 and "?" not in adduct1:
2827
- # continue
2823
+ # Conservative approach: Don't skip features here - let algorithm find pairs first
2824
+ # We'll check for inappropriate assignments later in the pair processing logic
2828
2825
 
2829
2826
  # Search for coeluting features within strict RT tolerance
2830
2827
  for j in range(i + 1, n_features):
@@ -2838,9 +2835,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2838
2835
  mz2 = feature2["mz"]
2839
2836
  adduct2 = feature2["adduct_top"]
2840
2837
 
2841
- # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
2842
- # if adduct2 and "?" not in adduct2:
2843
- # continue
2838
+ # Conservative approach: Don't skip feature2 here either - process all potential pairs
2844
2839
 
2845
2840
  # Calculate observed m/z difference
2846
2841
  mz_diff = mz2 - mz1
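
Both hunks above remove an early continue, so every coeluting pair is now examined before any assignment decision is made. The surrounding pair search (unchanged context) is a plain quadratic scan; a reduced sketch, where features is a list of dicts like the consensus rows built earlier and rt_tol is in the same units as the stored rt values:

    def coeluting_pairs(features, rt_tol):
        # Yield index pairs (i, j) whose retention times are within rt_tol,
        # together with the observed m/z difference mz_j - mz_i.
        n_features = len(features)
        for i in range(n_features):
            for j in range(i + 1, n_features):
                if abs(features[i]["rt"] - features[j]["rt"]) > rt_tol:
                    continue
                yield i, j, features[j]["mz"] - features[i]["mz"]
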
@@ -2890,24 +2885,45 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2890
2885
  else:
2891
2886
  # Assignment based on mass shift direction
2892
2887
  # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
2893
- # If catalog_shift > 0: from_adduct has higher m/z than to_adduct
2894
- # If catalog_shift < 0: from_adduct has lower m/z than to_adduct
2895
- # observed mz_diff = mz2 - mz1
2896
- # If mz_diff matches catalog_shift: feature2 should get to_adduct, feature1 should get from_adduct
2897
- # If mz_diff matches -catalog_shift: assignments are swapped
2888
+ # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
2889
+ # If catalog_shift < 0: from_adduct has lower mass shift than to_adduct
2890
+ # observed mz_diff = mz2 - mz1 (always positive for mz2 > mz1)
2891
+ #
2892
+ # CRITICAL FIX: Correct assignment logic
2893
+ # When mz_diff matches positive catalog_shift:
2894
+ # - from_adduct is the heavier adduct (higher mass shift)
2895
+ # - to_adduct is the lighter adduct (lower mass shift)
2896
+ # - Higher m/z feature should get the heavier adduct (from_adduct)
2897
+ # - Lower m/z feature should get the lighter adduct (to_adduct)
2898
2898
 
2899
2899
  if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
2900
2900
  # mz_diff matches catalog_shift direction
2901
- from_feature = feature1
2902
- to_feature = feature2
2903
- from_adduct_name = best_rel["from_adduct"]
2904
- to_adduct_name = best_rel["to_adduct"]
2901
+ if catalog_shift > 0:
2902
+ # from_adduct is heavier, to_adduct is lighter
2903
+ from_feature = feature2 # Higher m/z gets heavier adduct
2904
+ to_feature = feature1 # Lower m/z gets lighter adduct
2905
+ from_adduct_name = best_rel["from_adduct"] # Heavier adduct
2906
+ to_adduct_name = best_rel["to_adduct"] # Lighter adduct
2907
+ else:
2908
+ # from_adduct is lighter, to_adduct is heavier
2909
+ from_feature = feature1 # Lower m/z gets lighter adduct
2910
+ to_feature = feature2 # Higher m/z gets heavier adduct
2911
+ from_adduct_name = best_rel["from_adduct"] # Lighter adduct
2912
+ to_adduct_name = best_rel["to_adduct"] # Heavier adduct
2905
2913
  else:
2906
2914
  # mz_diff matches reverse direction of catalog_shift
2907
- from_feature = feature2
2908
- to_feature = feature1
2909
- from_adduct_name = best_rel["to_adduct"]
2910
- to_adduct_name = best_rel["from_adduct"]
2915
+ if catalog_shift > 0:
2916
+ # Reverse: from_adduct becomes lighter, to_adduct becomes heavier
2917
+ from_feature = feature1 # Lower m/z gets lighter adduct
2918
+ to_feature = feature2 # Higher m/z gets heavier adduct
2919
+ from_adduct_name = best_rel["to_adduct"] # Now lighter adduct
2920
+ to_adduct_name = best_rel["from_adduct"] # Now heavier adduct
2921
+ else:
2922
+ # Reverse: from_adduct becomes heavier, to_adduct becomes lighter
2923
+ from_feature = feature2 # Higher m/z gets heavier adduct
2924
+ to_feature = feature1 # Lower m/z gets lighter adduct
2925
+ from_adduct_name = best_rel["to_adduct"] # Now heavier adduct
2926
+ to_adduct_name = best_rel["from_adduct"] # Now lighter adduct
2911
2927
 
2912
2928
  # Get adduct details from catalog
2913
2929
  from_adduct_info = adduct_info.get(from_adduct_name, {})
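
The rewritten branch above amounts to a small decision table: for a singly charged pair, the adduct with the larger mass shift always ends up on the feature observed at higher m/z, whichever of catalog_shift or its negation the observed difference matches. A standalone sketch of that net effect with a worked example; the mass shifts are standard values ([M+H]1+ at about +1.00728 Da, [M+Na]1+ at about +22.98922 Da), not values read from masster's adduct catalog:

    # Assumed singly charged mass shifts, in Da
    MASS_SHIFT = {"[M+H]1+": 1.00728, "[M+Na]1+": 22.98922}

    def assign_adducts(mz1, mz2, from_adduct, to_adduct):
        # Net effect of the four branches above: the heavier adduct
        # (larger mass shift) is assigned to the feature at higher m/z.
        heavier, lighter = sorted((from_adduct, to_adduct),
                                  key=MASS_SHIFT.get, reverse=True)
        if mz2 >= mz1:
            return {"feature1": lighter, "feature2": heavier}
        return {"feature1": heavier, "feature2": lighter}

    # Two coeluting features 21.98194 Da apart (the Na-vs-H spacing):
    print(assign_adducts(181.0707, 203.0526, "[M+H]1+", "[M+Na]1+"))
    # {'feature1': '[M+H]1+', 'feature2': '[M+Na]1+'}
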
@@ -2922,7 +2938,40 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2922
2938
  from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
2923
2939
  to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
2924
2940
 
2925
- # Store updates
2941
+ # Smart conservative check: prevent inappropriate assignments to isolated features
2942
+ # Check if both features are isolated (single-member groups) with [M+?]1+ assignments
2943
+ def is_isolated_unknown_feature(feature):
2944
+ """Check if a feature is isolated with unknown adduct"""
2945
+ if not feature["adduct_top"] or "[M+?]" not in feature["adduct_top"]:
2946
+ return False # Not unknown, safe to process
2947
+
2948
+ # Check group size
2949
+ try:
2950
+ feature_row = study.consensus_df.filter(study.consensus_df["consensus_uid"] == feature["consensus_uid"])
2951
+ if len(feature_row) > 0:
2952
+ adduct_group = feature_row["adduct_group"].iloc[0]
2953
+ if adduct_group > 0:
2954
+ group_members = study.consensus_df.filter(study.consensus_df["adduct_group"] == adduct_group)
2955
+ return len(group_members) <= 1 # Isolated if group size <= 1
2956
+ except Exception:
2957
+ pass
2958
+ return True # Default to isolated if can't determine
2959
+
2960
+ from_isolated = is_isolated_unknown_feature(from_feature)
2961
+ to_isolated = is_isolated_unknown_feature(to_feature)
2962
+
2963
+ # Only skip assignment if BOTH features are isolated AND would get the SAME adduct
2964
+ # (This prevents inappropriate duplicate assignments to isolated features)
2965
+ skip_assignment = (from_isolated and to_isolated and from_adduct_name == to_adduct_name)
2966
+
2967
+ if skip_assignment:
2968
+ study.logger.debug(
2969
+ f"Skipping inappropriate assignment: both isolated features would get {from_adduct_name} "
2970
+ f"(UIDs {from_feature['consensus_uid']}, {to_feature['consensus_uid']})"
2971
+ )
2972
+ continue # Skip this pair, continue to next relationship
2973
+
2974
+ # Store updates (legitimate pair or at least one feature already has specific adduct)
2926
2975
  adduct_updates[from_feature["consensus_uid"]] = {
2927
2976
  "adduct_top": from_adduct_name,
2928
2977
  "adduct_charge_top": from_charge,
@@ -3083,164 +3132,518 @@ def __merge_feature_lookup(study_obj, features_df):
3083
3132
  return features_lookup
3084
3133
 
3085
3134
 
3135
+ def _get_features_matrix(study, consensus_data, quant_col="inty"):
3136
+ """
3137
+ Create a local intensity matrix from features_df for correlation calculations.
3138
+
3139
+ Args:
3140
+ study: Study object with features_df and samples_df
3141
+ consensus_data: List of consensus feature dictionaries
3142
+ quant_col: Column name to use for quantification (default: "inty")
3143
+
3144
+ Returns:
3145
+ pandas.DataFrame: Matrix with consensus_uid as index, sample names as columns
3146
+ """
3147
+ import pandas as pd
3148
+ import numpy as np
3149
+
3150
+ # Get all sample names
3151
+ sample_names = study.samples_df["sample_name"].to_list()
3152
+ consensus_uids = [int(f["consensus_uid"]) for f in consensus_data]
3153
+
3154
+ # Initialize matrix with zeros
3155
+ matrix_data = pd.DataFrame(
3156
+ index=pd.Index(consensus_uids, name="consensus_uid"),
3157
+ columns=sample_names,
3158
+ data=0.0,
3159
+ dtype=float
3160
+ )
3161
+
3162
+ study.logger.debug(f"Building local features matrix: {len(consensus_uids)} features x {len(sample_names)} samples")
3163
+
3164
+ # Fill matrix with actual intensity values
3165
+ features_df_pandas = study.features_df.to_pandas()
3166
+ samples_df_pandas = study.samples_df.to_pandas()
3167
+ consensus_mapping_pandas = study.consensus_mapping_df.to_pandas()
3168
+
3169
+ # Create sample_uid to sample_name mapping
3170
+ uid_to_name = dict(zip(samples_df_pandas["sample_uid"], samples_df_pandas["sample_name"]))
3171
+
3172
+ # For each consensus feature, get intensities from all samples
3173
+ for consensus_uid in consensus_uids:
3174
+ # Get all feature_uids that map to this consensus_uid
3175
+ feature_mappings = consensus_mapping_pandas[
3176
+ consensus_mapping_pandas["consensus_uid"] == consensus_uid
3177
+ ]
3178
+
3179
+ for _, mapping in feature_mappings.iterrows():
3180
+ feature_uid = mapping["feature_uid"]
3181
+ sample_uid = mapping["sample_uid"]
3182
+ sample_name = uid_to_name.get(sample_uid, f"sample_{sample_uid}")
3183
+
3184
+ # Get intensity for this feature
3185
+ feature_row = features_df_pandas[
3186
+ (features_df_pandas["feature_uid"] == feature_uid) &
3187
+ (features_df_pandas["sample_uid"] == sample_uid)
3188
+ ]
3189
+
3190
+ if len(feature_row) > 0:
3191
+ intensity = feature_row[quant_col].iloc[0]
3192
+ if pd.notna(intensity):
3193
+ matrix_data.loc[consensus_uid, sample_name] = float(intensity)
3194
+
3195
+ # Convert any remaining NaN to 0
3196
+ matrix_data = matrix_data.fillna(0.0)
3197
+
3198
+ study.logger.debug(f"Local matrix built successfully with shape {matrix_data.shape}")
3199
+
3200
+ return matrix_data
3201
+
3202
+
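
As its docstring states, _get_features_matrix returns a pandas DataFrame indexed by consensus_uid with one column per sample and zeros where a feature was not detected, which is what the later correlation step consumes. A self-contained toy illustration of how such a matrix is used (the uids 101/202 and the sample names are made up):

    import numpy as np
    import pandas as pd

    matrix = pd.DataFrame(
        {"s1": [1000.0, 2100.0], "s2": [5000.0, 9800.0], "s3": [0.0, 0.0]},
        index=pd.Index([101, 202], name="consensus_uid"),
    )

    # Two consensus features whose intensities rise and fall together across
    # samples are good candidates for being adducts of the same molecule.
    r = np.corrcoef(matrix.loc[101].to_numpy(), matrix.loc[202].to_numpy())[0, 1]
    print(round(r, 3))  # close to 1.0
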
3203
+ def _get_adduct_deltas_with_likelihood(study):
3204
+ """
3205
+ Extract all pairwise mass differences between adducts with joint likelihood scoring.
3206
+
3207
+ Args:
3208
+ study: Study object with _get_adducts method
3209
+
3210
+ Returns:
3211
+ List of tuples: (mass_delta, joint_likelihood, adduct1_name, adduct2_name)
3212
+ Sorted by joint_likelihood descending (most likely pairs first)
3213
+ """
3214
+ try:
3215
+ adducts_df = study._get_adducts()
3216
+
3217
+ if adducts_df is None or adducts_df.is_empty():
3218
+ study.logger.warning("No adducts dataframe available for study")
3219
+ return []
3220
+
3221
+ # Convert to pandas for easier manipulation
3222
+ adducts_pd = adducts_df.to_pandas()
3223
+
3224
+ # Check if we have likelihood/probability information
3225
+ likelihood_col = None
3226
+ for col in ['likelihood', 'probability', 'freq', 'frequency', 'score']:
3227
+ if col in adducts_pd.columns:
3228
+ likelihood_col = col
3229
+ break
3230
+
3231
+ # If no likelihood column, estimate based on adduct type
3232
+ if likelihood_col is None:
3233
+ adducts_pd['estimated_likelihood'] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
3234
+ likelihood_col = 'estimated_likelihood'
3235
+
3236
+ # Get mass column (try different possible column names)
3237
+ mass_col = None
3238
+ for col_name in ['mass_shift', 'mass', 'mass_shift_da', 'mass_da']:
3239
+ if col_name in adducts_pd.columns:
3240
+ mass_col = col_name
3241
+ break
3242
+
3243
+ if mass_col is None:
3244
+ study.logger.warning(f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}")
3245
+ return []
3246
+
3247
+ # Calculate all pairwise differences with joint likelihoods
3248
+ adduct_pairs = []
3249
+ for i in range(len(adducts_pd)):
3250
+ for j in range(i + 1, len(adducts_pd)):
3251
+ row_i = adducts_pd.iloc[i]
3252
+ row_j = adducts_pd.iloc[j]
3253
+
3254
+ # Skip if masses are NaN or invalid
3255
+ if (hasattr(row_i[mass_col], '__iter__') and not isinstance(row_i[mass_col], str)) or \
3256
+ (hasattr(row_j[mass_col], '__iter__') and not isinstance(row_j[mass_col], str)):
3257
+ continue
3258
+
3259
+ mass_i = float(row_i[mass_col])
3260
+ mass_j = float(row_j[mass_col])
3261
+ delta = abs(mass_i - mass_j)
3262
+
3263
+ if delta > 0.1: # Only meaningful mass differences
3264
+ # Joint likelihood is sum of individual likelihoods
3265
+ joint_likelihood = float(row_i[likelihood_col]) + float(row_j[likelihood_col])
3266
+
3267
+ adduct1_name = row_i.get('adduct', row_i.get('name', f'adduct_{i}'))
3268
+ adduct2_name = row_j.get('adduct', row_j.get('name', f'adduct_{j}'))
3269
+
3270
+ # CRITICAL FIX: Order adducts consistently from lower mass to higher mass
3271
+ # This ensures consistent assignment: lower mass adduct = from_adduct, higher mass adduct = to_adduct
3272
+ if mass_i <= mass_j:
3273
+ # row_i has lower or equal mass shift -> from_adduct
3274
+ # row_j has higher mass shift -> to_adduct
3275
+ adduct_pairs.append((round(delta, 4), joint_likelihood, adduct1_name, adduct2_name))
3276
+ else:
3277
+ # row_j has lower mass shift -> from_adduct
3278
+ # row_i has higher mass shift -> to_adduct
3279
+ adduct_pairs.append((round(delta, 4), joint_likelihood, adduct2_name, adduct1_name))
3280
+
3281
+ # Sort by joint likelihood descending (most likely pairs first)
3282
+ adduct_pairs.sort(key=lambda x: x[1], reverse=True)
3283
+
3284
+ study.logger.debug(f"Extracted {len(adduct_pairs)} adduct pairs with likelihood scoring")
3285
+ return adduct_pairs
3286
+
3287
+ except Exception as e:
3288
+ study.logger.warning(f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list.")
3289
+ return []
3290
+
3291
+
3292
+ def _estimate_adduct_likelihood(adduct_row):
3293
+ """
3294
+ Estimate likelihood of an adduct based on common knowledge.
3295
+
3296
+ Args:
3297
+ adduct_row: pandas Series with adduct information
3298
+
3299
+ Returns:
3300
+ float: Estimated likelihood (0.0 to 1.0)
3301
+ """
3302
+ adduct_name = str(adduct_row.get('adduct', adduct_row.get('name', ''))).lower()
3303
+
3304
+ # Common likelihood estimates based on adduct frequency in positive mode
3305
+ likelihood_map = {
3306
+ '[m+h]': 0.9, # Most common
3307
+ '[m+na]': 0.7, # Very common
3308
+ '[m+nh4]': 0.6, # Common
3309
+ '[m+k]': 0.3, # Less common
3310
+ '[m+2h]': 0.2, # Doubly charged, less frequent
3311
+ '[m+3h]': 0.1, # Triply charged, rare
3312
+ '[m+h-h2o]': 0.4, # Loss adducts, moderately common
3313
+ }
3314
+
3315
+ # Find best match
3316
+ for pattern, likelihood in likelihood_map.items():
3317
+ if pattern in adduct_name:
3318
+ return likelihood
3319
+
3320
+ # Default for unknown adducts
3321
+ return 0.2
3322
+
3323
+
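
Taken together, the two helpers above score each adduct pair by the sum of the individual likelihoods and order the pair from lighter to heavier mass shift. A toy recomputation using the hard-coded likelihood estimates from _estimate_adduct_likelihood and standard mass shifts (assumed for illustration, not read from the study):

    from itertools import combinations

    # name -> (mass shift in Da, estimated likelihood)
    adducts = {
        "[M+H]1+": (1.00728, 0.9),
        "[M+Na]1+": (22.98922, 0.7),
        "[M+NH4]1+": (18.03383, 0.6),
    }

    pairs = []
    for (name_a, (mass_a, p_a)), (name_b, (mass_b, p_b)) in combinations(adducts.items(), 2):
        delta = abs(mass_a - mass_b)
        if delta > 0.1:  # same cutoff as above
            lighter, heavier = (name_a, name_b) if mass_a <= mass_b else (name_b, name_a)
            pairs.append((round(delta, 4), p_a + p_b, lighter, heavier))

    pairs.sort(key=lambda p: p[1], reverse=True)
    # Sorted by joint likelihood: (H, Na) at 1.6, then (H, NH4) at 1.5, then (NH4, Na) at 1.3.
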
3324
+ def _get_adduct_deltas(study):
3325
+ """
3326
+ Extract all pairwise mass differences between adducts from study adducts data.
3327
+
3328
+ Args:
3329
+ study: Study object with _get_adducts method
3330
+
3331
+ Returns:
3332
+ List of mass differences (deltas) for adduct filtering
3333
+ """
3334
+ # Use the enhanced function and extract just the deltas for backward compatibility
3335
+ adduct_pairs = _get_adduct_deltas_with_likelihood(study)
3336
+ return [pair[0] for pair in adduct_pairs] # Extract just the mass deltas
3337
+
3338
+
3339
+ def _fast_correlation(vec1, vec2):
3340
+ """
3341
+ Fast Pearson correlation coefficient calculation.
3342
+ Optimized for repeated use in tight loops.
3343
+ """
3344
+ if len(vec1) != len(vec2):
3345
+ return 0.0
3346
+
3347
+ # Remove NaN values and corresponding positions
3348
+ mask = ~(np.isnan(vec1) | np.isnan(vec2))
3349
+ if np.sum(mask) < 2: # Need at least 2 valid points
3350
+ return 0.0
3351
+
3352
+ v1 = vec1[mask]
3353
+ v2 = vec2[mask]
3354
+
3355
+ # Fast correlation using numpy built-in
3356
+ try:
3357
+ corr_matrix = np.corrcoef(v1, v2)
3358
+ return corr_matrix[0, 1] if not np.isnan(corr_matrix[0, 1]) else 0.0
3359
+ except Exception:
3360
+ return 0.0
3361
+
3362
+
3086
3363
  def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
3087
3364
  """
3088
- Optimized O(n log n) adduct grouping using spatial indexing.
3365
+ Groups consensus features that represent the same molecule with different adducts.
3366
+ Uses multi-step filtering:
3367
+ 1. Build local intensity matrix once
3368
+ 2. RT coelution filtering with spatial indexing
3369
+ 3. Mass shift validation with hash lookup
3370
+ 4. Hierarchical boss structure (prevent transitivity)
3371
+ 5. Correlation-based confirmation
3372
+ 6. Intensity-based ranking for final selection
3089
3373
 
3090
3374
  Args:
3091
- study: Study object with logger
3375
+ study: Study object
3092
3376
  consensus_data: List of consensus feature dictionaries
3093
- rt_tol: RT tolerance in minutes
3094
- mz_tol: m/z tolerance in Da
3377
+ rt_tol: Retention time tolerance (seconds)
3378
+ mz_tol: M/z tolerance (Da)
3095
3379
 
3096
3380
  Returns:
3097
3381
  Tuple of (adduct_group_list, adduct_of_list)
3098
3382
  """
3383
+
3099
3384
  if not consensus_data:
3100
3385
  return [], []
3101
-
3386
+
3102
3387
  n_features = len(consensus_data)
3103
- if n_features > 10000:
3104
- study.logger.info(f"Adduct grouping for {n_features} consensus features...")
3105
- else:
3106
- study.logger.debug(f"Adduct grouping for {n_features} consensus features...")
3388
+ study.logger.info(f"Starting adduct grouping for {n_features} features")
3107
3389
 
3108
- # Build spatial index using RT and neutral mass as coordinates
3109
- features_by_mass = defaultdict(list)
3110
- mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
3111
-
3112
- valid_features = []
3113
- for feature in consensus_data:
3114
- consensus_uid = feature["consensus_uid"]
3390
+ # Step 1: Build local intensity matrix ONCE
3391
+ try:
3392
+ intensity_matrix_pd = _get_features_matrix(study, consensus_data, quant_col="inty")
3393
+
3394
+ if intensity_matrix_pd is None or len(intensity_matrix_pd) == 0:
3395
+ study.logger.warning("Could not build local intensity matrix - creating single-feature groups")
3396
+ adduct_group_list = list(range(1, len(consensus_data) + 1))
3397
+ adduct_of_list = [0] * len(consensus_data)
3398
+ return adduct_group_list, adduct_of_list
3399
+
3400
+ study.logger.info(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
3401
+
3402
+ except Exception as e:
3403
+ study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
3404
+ adduct_group_list = list(range(1, len(consensus_data) + 1))
3405
+ adduct_of_list = [0] * len(consensus_data)
3406
+ return adduct_group_list, adduct_of_list
3407
+
3408
+ # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
3409
+ adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
3410
+ study.logger.info(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
3411
+
3412
+ # Build hash map for O(1) mass shift lookup
3413
+ mass_shift_map = {} # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
3414
+ for mass_delta, joint_likelihood, adduct1, adduct2 in adduct_pairs_with_likelihood:
3415
+ key = round(mass_delta / mz_tol) * mz_tol # Round to tolerance grid
3416
+ if key not in mass_shift_map:
3417
+ mass_shift_map[key] = []
3418
+ mass_shift_map[key].append((joint_likelihood, adduct1, adduct2))
3419
+
3420
+ # Sort each mass shift group by likelihood (highest first)
3421
+ for key in mass_shift_map:
3422
+ mass_shift_map[key].sort(key=lambda x: x[0], reverse=True)
3423
+
3424
+ # Step 3: Pre-compute feature properties and sort by RT for spatial filtering
3425
+ feature_props = []
3426
+ for i, feature in enumerate(consensus_data):
3427
+ uid = feature["consensus_uid"]
3115
3428
  rt = feature["rt"]
3116
- neutral_mass = feature.get("adduct_mass_neutral_top")
3429
+ mz = feature["mz"]
3117
3430
  intensity = feature.get("inty_mean", 0)
3118
- adduct = feature.get("adduct_top", "")
3119
3431
 
3120
- if neutral_mass is not None:
3121
- mass_bin = int(neutral_mass / mass_bin_size)
3122
- features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
3123
- valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
3432
+ # Get matrix vector once
3433
+ matrix_vector = intensity_matrix_pd.loc[uid].values if uid in intensity_matrix_pd.index else None
3434
+
3435
+ feature_props.append({
3436
+ 'index': i,
3437
+ 'uid': uid,
3438
+ 'rt': rt,
3439
+ 'mz': mz,
3440
+ 'intensity': intensity,
3441
+ 'vector': matrix_vector,
3442
+ 'feature': feature
3443
+ })
3124
3444
 
3125
- # Union-Find for efficient grouping
3126
- class UnionFind:
3127
- def __init__(study, n):
3128
- study.parent = list(range(n))
3129
- study.rank = [0] * n
3130
-
3131
- def find(study, x):
3132
- if study.parent[x] != x:
3133
- study.parent[x] = study.find(study.parent[x])
3134
- return study.parent[x]
3135
-
3136
- def union(study, x, y):
3137
- px, py = study.find(x), study.find(y)
3138
- if px == py:
3139
- return
3140
- if study.rank[px] < study.rank[py]:
3141
- px, py = py, px
3142
- study.parent[py] = px
3143
- if study.rank[px] == study.rank[py]:
3144
- study.rank[px] += 1
3145
-
3146
- uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
3147
- uf = UnionFind(len(valid_features))
3148
-
3149
- # Find groups using spatial index
3150
- checked_pairs = set()
3151
- for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
3152
- for bin_offset in [-1, 0, 1]:
3153
- check_bin = bin1 + bin_offset
3154
- if check_bin not in features_by_mass:
3155
- continue
3156
-
3157
- for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
3158
- if uid1 >= uid2:
3159
- continue
3445
+ # Sort by RT for efficient spatial filtering
3446
+ feature_props.sort(key=lambda x: x['rt'])
3447
+
3448
+ # Initialize grouping structures
3449
+ uid_to_boss = {} # Hierarchical structure: uid -> boss_uid
3450
+ boss_to_members = {} # boss_uid -> [member_uids]
3451
+ processed_uids = set()
3452
+
3453
+ # Step 4: Process features with optimized RT filtering
3454
+ for i, boss_prop in enumerate(feature_props):
3455
+ boss_uid = boss_prop['uid']
3456
+
3457
+ if boss_uid in processed_uids:
3458
+ continue
3459
+
3460
+ if boss_prop['vector'] is None:
3461
+ processed_uids.add(boss_uid)
3462
+ continue
3463
+
3464
+ # Initialize as boss
3465
+ if boss_uid not in uid_to_boss:
3466
+ uid_to_boss[boss_uid] = boss_uid
3467
+ boss_to_members[boss_uid] = []
3468
+
3469
+ boss_rt = boss_prop['rt']
3470
+ boss_mz = boss_prop['mz']
3471
+ boss_vector = boss_prop['vector']
3472
+
3473
+ # Step 5: Efficient RT coelution filtering using sorted array
3474
+ candidate_pairs = []
3475
+
3476
+ # Search backwards from current position
3477
+ j = i - 1
3478
+ while j >= 0 and (boss_rt - feature_props[j]['rt']) <= rt_tol:
3479
+ candidate = feature_props[j]
3480
+ if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
3481
+ if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
3482
+ # Calculate mz difference and check mass shift
3483
+ mz_diff = abs(boss_mz - candidate['mz'])
3484
+ mass_shift_key = round(mz_diff / mz_tol) * mz_tol
3485
+
3486
+ if mass_shift_key in mass_shift_map:
3487
+ likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0] # Best likelihood
3488
+ candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
3489
+ j -= 1
3490
+
3491
+ # Search forwards from current position
3492
+ j = i + 1
3493
+ while j < len(feature_props) and (feature_props[j]['rt'] - boss_rt) <= rt_tol:
3494
+ candidate = feature_props[j]
3495
+ if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
3496
+ if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
3497
+ # Calculate mz difference and check mass shift
3498
+ mz_diff = abs(boss_mz - candidate['mz'])
3499
+ mass_shift_key = round(mz_diff / mz_tol) * mz_tol
3500
+
3501
+ if mass_shift_key in mass_shift_map:
3502
+ likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0] # Best likelihood
3503
+ candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
3504
+ j += 1
3505
+
3506
+ # Sort candidates by likelihood (descending) to prioritize chemically meaningful pairs
3507
+ candidate_pairs.sort(key=lambda x: x[1], reverse=True)
3508
+
3509
+ # Step 6: Process candidates in likelihood priority order
3510
+ for candidate_prop, likelihood, adduct_info in candidate_pairs:
3511
+ candidate_uid = candidate_prop['uid']
3512
+ candidate_vector = candidate_prop['vector']
3513
+
3514
+ # Correlation confirmation with optimized threshold
3515
+ try:
3516
+ correlation = _fast_correlation(boss_vector, candidate_vector)
3160
3517
 
3161
- pair = (min(uid1, uid2), max(uid1, uid2))
3162
- if pair in checked_pairs:
3518
+ if correlation < 0.5: # More permissive for legitimate adduct relationships
3163
3519
  continue
3164
- checked_pairs.add(pair)
3165
3520
 
3166
- mass_diff = abs(mass1 - mass2)
3167
- rt_diff = abs(rt1 - rt2) / 60.0 # Convert to minutes
3521
+ except Exception:
3522
+ continue
3523
+
3524
+ # Step 7: Hierarchical assignment (merge groups if needed)
3525
+ if candidate_uid in boss_to_members:
3526
+ old_members = boss_to_members[candidate_uid].copy()
3527
+ del boss_to_members[candidate_uid]
3168
3528
 
3169
- if mass_diff <= mz_tol and rt_diff <= rt_tol:
3170
- j = uid_to_idx[uid2]
3171
- uf.union(i, j)
3529
+ # Reassign old members to new boss
3530
+ for member in old_members:
3531
+ uid_to_boss[member] = boss_uid
3532
+ boss_to_members[boss_uid].append(member)
3533
+
3534
+ # Assign candidate to current boss
3535
+ uid_to_boss[candidate_uid] = boss_uid
3536
+ boss_to_members[boss_uid].append(candidate_uid)
3537
+ processed_uids.add(candidate_uid)
3538
+
3539
+ processed_uids.add(boss_uid)
3540
+
3541
+ # Step 8: Intensity-based ranking within groups (optimized)
3542
+ for boss_uid in list(boss_to_members.keys()):
3543
+ members = boss_to_members[boss_uid]
3544
+ if len(members) == 0:
3545
+ continue
3546
+
3547
+ all_group_members = [boss_uid] + members
3548
+
3549
+ # Find member with highest intensity efficiently
3550
+ max_intensity = -1
3551
+ new_boss = boss_uid
3552
+
3553
+ for member_uid in all_group_members:
3554
+ # Find member_uid in feature_props
3555
+ member_intensity = next((fp['intensity'] for fp in feature_props if fp['uid'] == member_uid), 0)
3556
+ if member_intensity > max_intensity:
3557
+ max_intensity = member_intensity
3558
+ new_boss = member_uid
3559
+
3560
+ # Update boss if needed
3561
+ if new_boss != boss_uid:
3562
+ boss_to_members[new_boss] = [m for m in all_group_members if m != new_boss]
3563
+ del boss_to_members[boss_uid]
3564
+
3565
+ # Update all member references
3566
+ for member in all_group_members:
3567
+ uid_to_boss[member] = new_boss
3172
3568
 
3173
- # Extract groups
3174
- groups_by_root = defaultdict(list)
3175
- for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
3176
- root = uf.find(i)
3177
- groups_by_root[root].append(valid_features[i])
3569
+ # Count and log results
3570
+ total_groups = len(boss_to_members)
3571
+ multi_member_groups = sum(1 for members in boss_to_members.values() if len(members) > 0)
3572
+ total_grouped_features = sum(len(members) + 1 for members in boss_to_members.values())
3178
3573
 
3179
- groups = {}
3180
- group_id = 1
3181
- assigned_groups = {}
3574
+ study.logger.info(f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)")
3182
3575
 
3183
- for group_members in groups_by_root.values():
3184
- member_uids = [uid for uid, _, _, _, _, _ in group_members]
3185
-
3186
- for uid in member_uids:
3187
- assigned_groups[uid] = group_id
3188
- groups[group_id] = member_uids
3189
- group_id += 1
3576
+ # Step 9: Convert to return format (optimized)
3577
+ uid_to_index = {fp['uid']: fp['index'] for fp in feature_props}
3578
+ adduct_group_list = [0] * n_features
3579
+ adduct_of_list = [0] * n_features
3190
3580
 
3191
- # Handle features without neutral mass
3192
- for feature in consensus_data:
3193
- uid = feature["consensus_uid"]
3194
- if uid not in assigned_groups:
3195
- assigned_groups[uid] = group_id
3196
- groups[group_id] = [uid]
3197
- group_id += 1
3198
-
3199
- # Determine adduct_of for each group
3200
- group_adduct_of = {}
3201
- for grp_id, member_uids in groups.items():
3202
- best_uid = None
3203
- best_priority = -1
3204
- best_intensity = 0
3205
-
3206
- for uid in member_uids:
3207
- feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
3208
- if not feature_data:
3209
- continue
3210
-
3211
- adduct = feature_data.get("adduct_top", "")
3212
- intensity = feature_data.get("inty_mean", 0)
3213
-
3214
- priority = 0
3215
- if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
3216
- priority = 3
3217
- elif adduct and "[M-H]" in adduct:
3218
- priority = 2
3219
- elif adduct and "M" in adduct:
3220
- priority = 1
3221
-
3222
- if priority > best_priority or (priority == best_priority and intensity > best_intensity):
3223
- best_uid = uid
3224
- best_priority = priority
3225
- best_intensity = intensity
3581
+ group_counter = 1
3582
+ for boss_uid, members in boss_to_members.items():
3583
+ # Assign boss
3584
+ boss_idx = uid_to_index[boss_uid]
3585
+ adduct_group_list[boss_idx] = group_counter
3586
+ adduct_of_list[boss_idx] = 0
3587
+
3588
+ # Assign members
3589
+ for member_uid in members:
3590
+ member_idx = uid_to_index[member_uid]
3591
+ adduct_group_list[member_idx] = group_counter
3592
+ adduct_of_list[member_idx] = boss_uid
3226
3593
 
3227
- group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
3594
+ group_counter += 1
3228
3595
 
3229
- # Build final lists in same order as consensus_data
3230
- adduct_group_list = []
3231
- adduct_of_list = []
3596
+ # Handle ungrouped features
3597
+ for i in range(n_features):
3598
+ if adduct_group_list[i] == 0:
3599
+ adduct_group_list[i] = group_counter
3600
+ adduct_of_list[i] = 0
3601
+ group_counter += 1
3232
3602
 
3233
- for feature in consensus_data:
3234
- uid = feature["consensus_uid"]
3235
- group = assigned_groups.get(uid, 0)
3236
- adduct_of = group_adduct_of.get(group, uid)
3603
+ return adduct_group_list, adduct_of_list
3604
+
3605
+
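
Two details of __merge_adduct_grouping deserve a compact illustration: the mass-shift lookup snaps each observed delta m/z onto an mz_tol grid before probing the hash map, and the result comes back as two parallel lists, one group id per feature plus the uid of the group's boss (0 for the boss itself and for singletons). A toy example with made-up numbers:

    mz_tol = 0.01
    # Catalog pair: [M+Na]1+ sits 21.98194 Da above [M+H]1+ (standard value, assumed).
    mass_shift_map = {round(21.98194 / mz_tol) * mz_tol: [(1.6, "[M+H]1+", "[M+Na]1+")]}

    observed = 203.0524 - 181.0707            # 21.9817 Da, slightly off the exact spacing
    key = round(observed / mz_tol) * mz_tol   # snaps onto the same 21.98 grid point
    print(key in mass_shift_map)              # True

    # Return format: if consensus_data holds uids [11, 12, 13] and 12 joins a group
    # whose boss is 11 while 13 stays alone, the function yields
    # adduct_group_list = [1, 1, 2] and adduct_of_list = [0, 11, 0].
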
3606
+ def _fast_correlation(x, y):
3607
+ """
3608
+ Fast correlation coefficient calculation for consensus matrix data.
3609
+
3610
+ In the consensus matrix:
3611
+ - Negative values (typically -1.0) indicate missing features
3612
+ - Zero and positive values are actual intensities
3613
+ - Only consider intensities >= 1000 for meaningful correlation
3614
+
3615
+ Args:
3616
+ x, y: numpy arrays of the same length
3237
3617
 
3238
- adduct_group_list.append(group)
3239
- adduct_of_list.append(adduct_of)
3618
+ Returns:
3619
+ Correlation coefficient (float), 0 if cannot be calculated
3620
+ """
3621
+ import numpy as np
3240
3622
 
3241
- if n_features > 10000:
3242
- study.logger.info("Adduct grouping completed.")
3243
- else:
3244
- study.logger.debug("Adduct grouping completed.")
3245
-
3246
- return adduct_group_list, adduct_of_list
3623
+ # For consensus matrix: exclude negative values (missing features) and very low intensities
3624
+ # Use a very low threshold since processed matrix values are often scaled/normalized
3625
+ valid = ~(np.isnan(x) | np.isnan(y) | (x < 0) | (y < 0) | (x < 0.1) | (y < 0.1))
3626
+
3627
+ if np.sum(valid) < 3: # Need at least 3 valid pairs
3628
+ return 0.0
3629
+
3630
+ x_valid = x[valid]
3631
+ y_valid = y[valid]
3632
+
3633
+ # If all values are the same (e.g., all zeros), correlation is undefined
3634
+ if np.var(x_valid) == 0 or np.var(y_valid) == 0:
3635
+ return 0.0
3636
+
3637
+ # Fast correlation using numpy
3638
+ try:
3639
+ correlation_matrix = np.corrcoef(x_valid, y_valid)
3640
+ correlation = correlation_matrix[0, 1]
3641
+
3642
+ # Handle NaN result
3643
+ if np.isnan(correlation):
3644
+ return 0.0
3645
+
3646
+ return correlation
3647
+
3648
+ except Exception:
3649
+ return 0.0
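
This second _fast_correlation appears to shadow the earlier definition above and adds a stricter mask: negative entries are treated as missing features and anything below 0.1 is ignored. A quick check of that masking rule on made-up vectors, assuming the function is importable as written:

    import numpy as np

    x = np.array([-1.0, 1200.0, 3400.0, 0.05, 900.0])
    y = np.array([500.0, 2500.0, 7000.0, 800.0, 1800.0])

    # Position 0 (missing in x) and position 3 (below the 0.1 floor in x) are
    # dropped; the remaining pairs (1200, 2500), (3400, 7000), (900, 1800)
    # co-vary almost perfectly.
    print(round(_fast_correlation(x, y), 3))  # close to 1.0
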