masster 0.5.7-py3-none-any.whl → 0.5.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/merge.py CHANGED
@@ -441,8 +441,7 @@ def merge(study, **kwargs) -> None:
441
441
  cached_adducts_df = None
442
442
  cached_valid_adducts = None
443
443
  try:
444
- from masster.study.id import _get_adducts
445
- cached_adducts_df = _get_adducts(study)
444
+ cached_adducts_df = study._get_adducts()
446
445
  if not cached_adducts_df.is_empty():
447
446
  cached_valid_adducts = set(cached_adducts_df["name"].to_list())
448
447
  else:
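
The hunk above replaces the imported _get_adducts(study) helper with the study._get_adducts() method. A minimal sketch of the cached-lookup pattern the surrounding context implements, assuming _get_adducts() returns a Polars DataFrame with a "name" column (as the .is_empty() and ["name"] calls suggest); load_adduct_cache is an illustrative name, not a masster API:

    def load_adduct_cache(study):
        # Resolve the adduct catalog once and keep the set of valid names around;
        # fall back to an empty cache if the catalog cannot be built.
        cached_adducts_df = None
        cached_valid_adducts = None
        try:
            cached_adducts_df = study._get_adducts()
            if cached_adducts_df is not None and not cached_adducts_df.is_empty():
                cached_valid_adducts = set(cached_adducts_df["name"].to_list())
        except Exception as exc:
            study.logger.warning(f"Adduct catalog unavailable, merging without it: {exc}")
        return cached_adducts_df, cached_valid_adducts
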
@@ -819,7 +818,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
819
818
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
820
819
  completed_chunks += 1
821
820
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
822
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
821
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
823
822
  except Exception as exc:
824
823
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
825
824
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -853,7 +852,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
853
852
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
854
853
  completed_chunks += 1
855
854
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
856
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
855
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
857
856
  except Exception as exc:
858
857
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
859
858
  raise exc
@@ -994,7 +993,7 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
994
993
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
995
994
  completed_chunks += 1
996
995
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
997
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
996
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
998
997
  except Exception as exc:
999
998
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1000
999
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -1028,7 +1027,7 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
1028
1027
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1029
1028
  completed_chunks += 1
1030
1029
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1031
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1030
+ study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1032
1031
  except Exception as exc:
1033
1032
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1034
1033
  raise exc
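
The four hunks above only swap the chunk-completion message from logger.info to logger.success. For orientation, a generic sketch of the submit/collect loop this message lives in, assuming a loguru-style logger that provides a success level; run_chunks, process_chunk, and the chunk payload layout are placeholders rather than masster's actual worker:

    from concurrent.futures import ProcessPoolExecutor, as_completed
    from concurrent.futures.process import BrokenProcessPool

    def run_chunks(study, chunk_data_list, process_chunk):
        serialized_chunk_results = []
        completed_chunks = 0
        total_chunks = len(chunk_data_list)
        with ProcessPoolExecutor() as pool:
            futures = {pool.submit(process_chunk, chunk): idx
                       for idx, chunk in enumerate(chunk_data_list)}
            for future in as_completed(futures):
                chunk_idx = futures[future]
                try:
                    chunk_start_idx, consensus_features = future.result()
                    serialized_chunk_results.append((chunk_start_idx, consensus_features))
                    completed_chunks += 1
                    n_samples = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
                    study.logger.success(
                        f"Completed chunk {completed_chunks}/{total_chunks} "
                        f"(samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples})"
                    )
                except Exception as exc:
                    if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
                        # Windows multiprocessing can break the pool; a real
                        # implementation would re-run the chunk sequentially here.
                        study.logger.warning(f"Process pool broke on chunk {chunk_idx}: {exc}")
                        continue
                    study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                    raise
        return serialized_chunk_results
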
@@ -2258,6 +2257,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2258
2257
  {
2259
2258
  "consensus_uid": row["consensus_uid"],
2260
2259
  "rt": row["rt"],
2260
+ "mz": row["mz"], # Add missing mz field
2261
2261
  "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
2262
2262
  "adduct_top": row.get("adduct_top"),
2263
2263
  "inty_mean": row.get("inty_mean", 0),
@@ -2265,8 +2265,9 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2265
2265
  )
2266
2266
 
2267
2267
  # Use optimized adduct grouping
2268
+ study.logger.info(f"About to call adduct grouping for {len(consensus_data)} consensus features")
2268
2269
  adduct_group_list, adduct_of_list = __merge_adduct_grouping(
2269
- study, consensus_data, rt_tol, mz_tol
2270
+ study, consensus_data, rt_tol/3, mz_tol
2270
2271
  )
2271
2272
 
2272
2273
  # Add the new columns to consensus_df
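
The two hunks above add the previously missing mz field to each consensus row and call the grouping helper with a three-fold tighter RT window (rt_tol / 3). A sketch of the row construction, assuming consensus_df is a Polars DataFrame carrying the columns referenced in the context; build_consensus_data is an illustrative name:

    def build_consensus_data(study):
        consensus_data = []
        for row in study.consensus_df.iter_rows(named=True):
            consensus_data.append(
                {
                    "consensus_uid": row["consensus_uid"],
                    "rt": row["rt"],
                    "mz": row["mz"],  # needed by the grouping added in 0.5.9
                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                    "adduct_top": row.get("adduct_top"),
                    "inty_mean": row.get("inty_mean", 0),
                }
            )
        return consensus_data

The grouping call then becomes __merge_adduct_grouping(study, consensus_data, rt_tol / 3, mz_tol), i.e. adduct grouping now demands closer coelution than consensus merging does.
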
@@ -2713,8 +2714,6 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2713
2714
  cached_adducts_df: Pre-computed adducts DataFrame for performance
2714
2715
  """
2715
2716
  import polars as pl
2716
- import numpy as np
2717
- from collections import defaultdict
2718
2717
 
2719
2718
  # Check if consensus_df exists and has features
2720
2719
  if len(study.consensus_df) == 0:
@@ -2727,8 +2726,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2727
2726
  if cached_adducts_df is None or cached_adducts_df.is_empty():
2728
2727
  try:
2729
2728
  # Use lower min_probability for better adduct coverage in mass shift identification
2730
- from masster.study.id import _get_adducts
2731
- cached_adducts_df = _get_adducts(study, min_probability=0.01)
2729
+ cached_adducts_df = study._get_adducts(min_probability=0.01)
2732
2730
  except Exception as e:
2733
2731
  study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
2734
2732
  return
@@ -2822,9 +2820,8 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2822
2820
  mz1 = feature1["mz"]
2823
2821
  adduct1 = feature1["adduct_top"]
2824
2822
 
2825
- # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
2826
- # if adduct1 and "?" not in adduct1:
2827
- # continue
2823
+ # Conservative approach: Don't skip features here - let algorithm find pairs first
2824
+ # We'll check for inappropriate assignments later in the pair processing logic
2828
2825
 
2829
2826
  # Search for coeluting features within strict RT tolerance
2830
2827
  for j in range(i + 1, n_features):
@@ -2838,9 +2835,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2838
2835
  mz2 = feature2["mz"]
2839
2836
  adduct2 = feature2["adduct_top"]
2840
2837
 
2841
- # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
2842
- # if adduct2 and "?" not in adduct2:
2843
- # continue
2838
+ # Conservative approach: Don't skip feature2 here either - process all potential pairs
2844
2839
 
2845
2840
  # Calculate observed m/z difference
2846
2841
  mz_diff = mz2 - mz1
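
Both hunks above remove an early continue, so every coeluting pair is now examined before any assignment decision is made. The surrounding pair search (unchanged context) is a plain quadratic scan; a reduced sketch, where features is a list of dicts like the consensus rows built earlier and rt_tol is in the same units as the stored rt values:

    def coeluting_pairs(features, rt_tol):
        # Yield index pairs (i, j) whose retention times are within rt_tol,
        # together with the observed m/z difference mz_j - mz_i.
        n_features = len(features)
        for i in range(n_features):
            for j in range(i + 1, n_features):
                if abs(features[i]["rt"] - features[j]["rt"]) > rt_tol:
                    continue
                yield i, j, features[j]["mz"] - features[i]["mz"]
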
@@ -2890,24 +2885,45 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2890
2885
  else:
2891
2886
  # Assignment based on mass shift direction
2892
2887
  # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
2893
- # If catalog_shift > 0: from_adduct has higher m/z than to_adduct
2894
- # If catalog_shift < 0: from_adduct has lower m/z than to_adduct
2895
- # observed mz_diff = mz2 - mz1
2896
- # If mz_diff matches catalog_shift: feature2 should get to_adduct, feature1 should get from_adduct
2897
- # If mz_diff matches -catalog_shift: assignments are swapped
2888
+ # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
2889
+ # If catalog_shift < 0: from_adduct has lower mass shift than to_adduct
2890
+ # observed mz_diff = mz2 - mz1 (always positive for mz2 > mz1)
2891
+ #
2892
+ # CRITICAL FIX: Correct assignment logic
2893
+ # When mz_diff matches positive catalog_shift:
2894
+ # - from_adduct is the heavier adduct (higher mass shift)
2895
+ # - to_adduct is the lighter adduct (lower mass shift)
2896
+ # - Higher m/z feature should get the heavier adduct (from_adduct)
2897
+ # - Lower m/z feature should get the lighter adduct (to_adduct)
2898
2898
 
2899
2899
  if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
2900
2900
  # mz_diff matches catalog_shift direction
2901
- from_feature = feature1
2902
- to_feature = feature2
2903
- from_adduct_name = best_rel["from_adduct"]
2904
- to_adduct_name = best_rel["to_adduct"]
2901
+ if catalog_shift > 0:
2902
+ # from_adduct is heavier, to_adduct is lighter
2903
+ from_feature = feature2 # Higher m/z gets heavier adduct
2904
+ to_feature = feature1 # Lower m/z gets lighter adduct
2905
+ from_adduct_name = best_rel["from_adduct"] # Heavier adduct
2906
+ to_adduct_name = best_rel["to_adduct"] # Lighter adduct
2907
+ else:
2908
+ # from_adduct is lighter, to_adduct is heavier
2909
+ from_feature = feature1 # Lower m/z gets lighter adduct
2910
+ to_feature = feature2 # Higher m/z gets heavier adduct
2911
+ from_adduct_name = best_rel["from_adduct"] # Lighter adduct
2912
+ to_adduct_name = best_rel["to_adduct"] # Heavier adduct
2905
2913
  else:
2906
2914
  # mz_diff matches reverse direction of catalog_shift
2907
- from_feature = feature2
2908
- to_feature = feature1
2909
- from_adduct_name = best_rel["to_adduct"]
2910
- to_adduct_name = best_rel["from_adduct"]
2915
+ if catalog_shift > 0:
2916
+ # Reverse: from_adduct becomes lighter, to_adduct becomes heavier
2917
+ from_feature = feature1 # Lower m/z gets lighter adduct
2918
+ to_feature = feature2 # Higher m/z gets heavier adduct
2919
+ from_adduct_name = best_rel["to_adduct"] # Now lighter adduct
2920
+ to_adduct_name = best_rel["from_adduct"] # Now heavier adduct
2921
+ else:
2922
+ # Reverse: from_adduct becomes heavier, to_adduct becomes lighter
2923
+ from_feature = feature2 # Higher m/z gets heavier adduct
2924
+ to_feature = feature1 # Lower m/z gets lighter adduct
2925
+ from_adduct_name = best_rel["to_adduct"] # Now heavier adduct
2926
+ to_adduct_name = best_rel["from_adduct"] # Now lighter adduct
2911
2927
 
2912
2928
  # Get adduct details from catalog
2913
2929
  from_adduct_info = adduct_info.get(from_adduct_name, {})
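
The rewritten branch above amounts to a small decision table: for a singly charged pair, the adduct with the larger mass shift always ends up on the feature observed at higher m/z, whichever of catalog_shift or its negation the observed difference matches. A standalone sketch of that net effect with a worked example; the mass shifts are standard values ([M+H]1+ at about +1.00728 Da, [M+Na]1+ at about +22.98922 Da), not values read from masster's adduct catalog:

    # Assumed singly charged mass shifts, in Da
    MASS_SHIFT = {"[M+H]1+": 1.00728, "[M+Na]1+": 22.98922}

    def assign_adducts(mz1, mz2, from_adduct, to_adduct):
        # Net effect of the four branches above: the heavier adduct
        # (larger mass shift) is assigned to the feature at higher m/z.
        heavier, lighter = sorted((from_adduct, to_adduct),
                                  key=MASS_SHIFT.get, reverse=True)
        if mz2 >= mz1:
            return {"feature1": lighter, "feature2": heavier}
        return {"feature1": heavier, "feature2": lighter}

    # Two coeluting features 21.98194 Da apart (the Na-vs-H spacing):
    print(assign_adducts(181.0707, 203.0526, "[M+H]1+", "[M+Na]1+"))
    # {'feature1': '[M+H]1+', 'feature2': '[M+Na]1+'}
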
@@ -2922,7 +2938,40 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2922
2938
  from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
2923
2939
  to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
2924
2940
 
2925
- # Store updates
2941
+ # Smart conservative check: prevent inappropriate assignments to isolated features
2942
+ # Check if both features are isolated (single-member groups) with [M+?]1+ assignments
2943
+ def is_isolated_unknown_feature(feature):
2944
+ """Check if a feature is isolated with unknown adduct"""
2945
+ if not feature["adduct_top"] or "[M+?]" not in feature["adduct_top"]:
2946
+ return False # Not unknown, safe to process
2947
+
2948
+ # Check group size
2949
+ try:
2950
+ feature_row = study.consensus_df.filter(study.consensus_df["consensus_uid"] == feature["consensus_uid"])
2951
+ if len(feature_row) > 0:
2952
+ adduct_group = feature_row["adduct_group"].iloc[0]
2953
+ if adduct_group > 0:
2954
+ group_members = study.consensus_df.filter(study.consensus_df["adduct_group"] == adduct_group)
2955
+ return len(group_members) <= 1 # Isolated if group size <= 1
2956
+ except Exception:
2957
+ pass
2958
+ return True # Default to isolated if can't determine
2959
+
2960
+ from_isolated = is_isolated_unknown_feature(from_feature)
2961
+ to_isolated = is_isolated_unknown_feature(to_feature)
2962
+
2963
+ # Only skip assignment if BOTH features are isolated AND would get the SAME adduct
2964
+ # (This prevents inappropriate duplicate assignments to isolated features)
2965
+ skip_assignment = (from_isolated and to_isolated and from_adduct_name == to_adduct_name)
2966
+
2967
+ if skip_assignment:
2968
+ study.logger.debug(
2969
+ f"Skipping inappropriate assignment: both isolated features would get {from_adduct_name} "
2970
+ f"(UIDs {from_feature['consensus_uid']}, {to_feature['consensus_uid']})"
2971
+ )
2972
+ continue # Skip this pair, continue to next relationship
2973
+
2974
+ # Store updates (legitimate pair or at least one feature already has specific adduct)
2926
2975
  adduct_updates[from_feature["consensus_uid"]] = {
2927
2976
  "adduct_top": from_adduct_name,
2928
2977
  "adduct_charge_top": from_charge,
@@ -3083,164 +3132,518 @@ def __merge_feature_lookup(study_obj, features_df):
3083
3132
  return features_lookup
3084
3133
 
3085
3134
 
3135
+ def _get_features_matrix(study, consensus_data, quant_col="inty"):
3136
+ """
3137
+ Create a local intensity matrix from features_df for correlation calculations.
3138
+
3139
+ Args:
3140
+ study: Study object with features_df and samples_df
3141
+ consensus_data: List of consensus feature dictionaries
3142
+ quant_col: Column name to use for quantification (default: "inty")
3143
+
3144
+ Returns:
3145
+ pandas.DataFrame: Matrix with consensus_uid as index, sample names as columns
3146
+ """
3147
+ import pandas as pd
3148
+ import numpy as np
3149
+
3150
+ # Get all sample names
3151
+ sample_names = study.samples_df["sample_name"].to_list()
3152
+ consensus_uids = [int(f["consensus_uid"]) for f in consensus_data]
3153
+
3154
+ # Initialize matrix with zeros
3155
+ matrix_data = pd.DataFrame(
3156
+ index=pd.Index(consensus_uids, name="consensus_uid"),
3157
+ columns=sample_names,
3158
+ data=0.0,
3159
+ dtype=float
3160
+ )
3161
+
3162
+ study.logger.debug(f"Building local features matrix: {len(consensus_uids)} features x {len(sample_names)} samples")
3163
+
3164
+ # Fill matrix with actual intensity values
3165
+ features_df_pandas = study.features_df.to_pandas()
3166
+ samples_df_pandas = study.samples_df.to_pandas()
3167
+ consensus_mapping_pandas = study.consensus_mapping_df.to_pandas()
3168
+
3169
+ # Create sample_uid to sample_name mapping
3170
+ uid_to_name = dict(zip(samples_df_pandas["sample_uid"], samples_df_pandas["sample_name"]))
3171
+
3172
+ # For each consensus feature, get intensities from all samples
3173
+ for consensus_uid in consensus_uids:
3174
+ # Get all feature_uids that map to this consensus_uid
3175
+ feature_mappings = consensus_mapping_pandas[
3176
+ consensus_mapping_pandas["consensus_uid"] == consensus_uid
3177
+ ]
3178
+
3179
+ for _, mapping in feature_mappings.iterrows():
3180
+ feature_uid = mapping["feature_uid"]
3181
+ sample_uid = mapping["sample_uid"]
3182
+ sample_name = uid_to_name.get(sample_uid, f"sample_{sample_uid}")
3183
+
3184
+ # Get intensity for this feature
3185
+ feature_row = features_df_pandas[
3186
+ (features_df_pandas["feature_uid"] == feature_uid) &
3187
+ (features_df_pandas["sample_uid"] == sample_uid)
3188
+ ]
3189
+
3190
+ if len(feature_row) > 0:
3191
+ intensity = feature_row[quant_col].iloc[0]
3192
+ if pd.notna(intensity):
3193
+ matrix_data.loc[consensus_uid, sample_name] = float(intensity)
3194
+
3195
+ # Convert any remaining NaN to 0
3196
+ matrix_data = matrix_data.fillna(0.0)
3197
+
3198
+ study.logger.debug(f"Local matrix built successfully with shape {matrix_data.shape}")
3199
+
3200
+ return matrix_data
3201
+
3202
+
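
As its docstring states, _get_features_matrix returns a pandas DataFrame indexed by consensus_uid with one column per sample and zeros where a feature was not detected, which is what the later correlation step consumes. A self-contained toy illustration of how such a matrix is used (the uids 101/202 and the sample names are made up):

    import numpy as np
    import pandas as pd

    matrix = pd.DataFrame(
        {"s1": [1000.0, 2100.0], "s2": [5000.0, 9800.0], "s3": [0.0, 0.0]},
        index=pd.Index([101, 202], name="consensus_uid"),
    )

    # Two consensus features whose intensities rise and fall together across
    # samples are good candidates for being adducts of the same molecule.
    r = np.corrcoef(matrix.loc[101].to_numpy(), matrix.loc[202].to_numpy())[0, 1]
    print(round(r, 3))  # close to 1.0
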
3203
+ def _get_adduct_deltas_with_likelihood(study):
3204
+ """
3205
+ Extract all pairwise mass differences between adducts with joint likelihood scoring.
3206
+
3207
+ Args:
3208
+ study: Study object with _get_adducts method
3209
+
3210
+ Returns:
3211
+ List of tuples: (mass_delta, joint_likelihood, adduct1_name, adduct2_name)
3212
+ Sorted by joint_likelihood descending (most likely pairs first)
3213
+ """
3214
+ try:
3215
+ adducts_df = study._get_adducts()
3216
+
3217
+ if adducts_df is None or adducts_df.is_empty():
3218
+ study.logger.warning("No adducts dataframe available for study")
3219
+ return []
3220
+
3221
+ # Convert to pandas for easier manipulation
3222
+ adducts_pd = adducts_df.to_pandas()
3223
+
3224
+ # Check if we have likelihood/probability information
3225
+ likelihood_col = None
3226
+ for col in ['likelihood', 'probability', 'freq', 'frequency', 'score']:
3227
+ if col in adducts_pd.columns:
3228
+ likelihood_col = col
3229
+ break
3230
+
3231
+ # If no likelihood column, estimate based on adduct type
3232
+ if likelihood_col is None:
3233
+ adducts_pd['estimated_likelihood'] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
3234
+ likelihood_col = 'estimated_likelihood'
3235
+
3236
+ # Get mass column (try different possible column names)
3237
+ mass_col = None
3238
+ for col_name in ['mass_shift', 'mass', 'mass_shift_da', 'mass_da']:
3239
+ if col_name in adducts_pd.columns:
3240
+ mass_col = col_name
3241
+ break
3242
+
3243
+ if mass_col is None:
3244
+ study.logger.warning(f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}")
3245
+ return []
3246
+
3247
+ # Calculate all pairwise differences with joint likelihoods
3248
+ adduct_pairs = []
3249
+ for i in range(len(adducts_pd)):
3250
+ for j in range(i + 1, len(adducts_pd)):
3251
+ row_i = adducts_pd.iloc[i]
3252
+ row_j = adducts_pd.iloc[j]
3253
+
3254
+ # Skip if masses are NaN or invalid
3255
+ if (hasattr(row_i[mass_col], '__iter__') and not isinstance(row_i[mass_col], str)) or \
3256
+ (hasattr(row_j[mass_col], '__iter__') and not isinstance(row_j[mass_col], str)):
3257
+ continue
3258
+
3259
+ mass_i = float(row_i[mass_col])
3260
+ mass_j = float(row_j[mass_col])
3261
+ delta = abs(mass_i - mass_j)
3262
+
3263
+ if delta > 0.1: # Only meaningful mass differences
3264
+ # Joint likelihood is sum of individual likelihoods
3265
+ joint_likelihood = float(row_i[likelihood_col]) + float(row_j[likelihood_col])
3266
+
3267
+ adduct1_name = row_i.get('adduct', row_i.get('name', f'adduct_{i}'))
3268
+ adduct2_name = row_j.get('adduct', row_j.get('name', f'adduct_{j}'))
3269
+
3270
+ # CRITICAL FIX: Order adducts consistently from lower mass to higher mass
3271
+ # This ensures consistent assignment: lower mass adduct = from_adduct, higher mass adduct = to_adduct
3272
+ if mass_i <= mass_j:
3273
+ # row_i has lower or equal mass shift -> from_adduct
3274
+ # row_j has higher mass shift -> to_adduct
3275
+ adduct_pairs.append((round(delta, 4), joint_likelihood, adduct1_name, adduct2_name))
3276
+ else:
3277
+ # row_j has lower mass shift -> from_adduct
3278
+ # row_i has higher mass shift -> to_adduct
3279
+ adduct_pairs.append((round(delta, 4), joint_likelihood, adduct2_name, adduct1_name))
3280
+
3281
+ # Sort by joint likelihood descending (most likely pairs first)
3282
+ adduct_pairs.sort(key=lambda x: x[1], reverse=True)
3283
+
3284
+ study.logger.debug(f"Extracted {len(adduct_pairs)} adduct pairs with likelihood scoring")
3285
+ return adduct_pairs
3286
+
3287
+ except Exception as e:
3288
+ study.logger.warning(f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list.")
3289
+ return []
3290
+
3291
+
3292
+ def _estimate_adduct_likelihood(adduct_row):
3293
+ """
3294
+ Estimate likelihood of an adduct based on common knowledge.
3295
+
3296
+ Args:
3297
+ adduct_row: pandas Series with adduct information
3298
+
3299
+ Returns:
3300
+ float: Estimated likelihood (0.0 to 1.0)
3301
+ """
3302
+ adduct_name = str(adduct_row.get('adduct', adduct_row.get('name', ''))).lower()
3303
+
3304
+ # Common likelihood estimates based on adduct frequency in positive mode
3305
+ likelihood_map = {
3306
+ '[m+h]': 0.9, # Most common
3307
+ '[m+na]': 0.7, # Very common
3308
+ '[m+nh4]': 0.6, # Common
3309
+ '[m+k]': 0.3, # Less common
3310
+ '[m+2h]': 0.2, # Doubly charged, less frequent
3311
+ '[m+3h]': 0.1, # Triply charged, rare
3312
+ '[m+h-h2o]': 0.4, # Loss adducts, moderately common
3313
+ }
3314
+
3315
+ # Find best match
3316
+ for pattern, likelihood in likelihood_map.items():
3317
+ if pattern in adduct_name:
3318
+ return likelihood
3319
+
3320
+ # Default for unknown adducts
3321
+ return 0.2
3322
+
3323
+
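
Taken together, the two helpers above score each adduct pair by the sum of the individual likelihoods and order the pair from lighter to heavier mass shift. A toy recomputation using the hard-coded likelihood estimates from _estimate_adduct_likelihood and standard mass shifts (assumed for illustration, not read from the study):

    from itertools import combinations

    # name -> (mass shift in Da, estimated likelihood)
    adducts = {
        "[M+H]1+": (1.00728, 0.9),
        "[M+Na]1+": (22.98922, 0.7),
        "[M+NH4]1+": (18.03383, 0.6),
    }

    pairs = []
    for (name_a, (mass_a, p_a)), (name_b, (mass_b, p_b)) in combinations(adducts.items(), 2):
        delta = abs(mass_a - mass_b)
        if delta > 0.1:  # same cutoff as above
            lighter, heavier = (name_a, name_b) if mass_a <= mass_b else (name_b, name_a)
            pairs.append((round(delta, 4), p_a + p_b, lighter, heavier))

    pairs.sort(key=lambda p: p[1], reverse=True)
    # Sorted by joint likelihood: (H, Na) at 1.6, then (H, NH4) at 1.5, then (NH4, Na) at 1.3.
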
3324
+ def _get_adduct_deltas(study):
3325
+ """
3326
+ Extract all pairwise mass differences between adducts from study adducts data.
3327
+
3328
+ Args:
3329
+ study: Study object with _get_adducts method
3330
+
3331
+ Returns:
3332
+ List of mass differences (deltas) for adduct filtering
3333
+ """
3334
+ # Use the enhanced function and extract just the deltas for backward compatibility
3335
+ adduct_pairs = _get_adduct_deltas_with_likelihood(study)
3336
+ return [pair[0] for pair in adduct_pairs] # Extract just the mass deltas
3337
+
3338
+
3339
+ def _fast_correlation(vec1, vec2):
3340
+ """
3341
+ Fast Pearson correlation coefficient calculation.
3342
+ Optimized for repeated use in tight loops.
3343
+ """
3344
+ if len(vec1) != len(vec2):
3345
+ return 0.0
3346
+
3347
+ # Remove NaN values and corresponding positions
3348
+ mask = ~(np.isnan(vec1) | np.isnan(vec2))
3349
+ if np.sum(mask) < 2: # Need at least 2 valid points
3350
+ return 0.0
3351
+
3352
+ v1 = vec1[mask]
3353
+ v2 = vec2[mask]
3354
+
3355
+ # Fast correlation using numpy built-in
3356
+ try:
3357
+ corr_matrix = np.corrcoef(v1, v2)
3358
+ return corr_matrix[0, 1] if not np.isnan(corr_matrix[0, 1]) else 0.0
3359
+ except Exception:
3360
+ return 0.0
3361
+
3362
+
3086
3363
  def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
3087
3364
  """
3088
- Optimized O(n log n) adduct grouping using spatial indexing.
3365
+ Groups consensus features that represent the same molecule with different adducts.
3366
+ Uses multi-step filtering:
3367
+ 1. Build local intensity matrix once
3368
+ 2. RT coelution filtering with spatial indexing
3369
+ 3. Mass shift validation with hash lookup
3370
+ 4. Hierarchical boss structure (prevent transitivity)
3371
+ 5. Correlation-based confirmation
3372
+ 6. Intensity-based ranking for final selection
3089
3373
 
3090
3374
  Args:
3091
- study: Study object with logger
3375
+ study: Study object
3092
3376
  consensus_data: List of consensus feature dictionaries
3093
- rt_tol: RT tolerance in minutes
3094
- mz_tol: m/z tolerance in Da
3377
+ rt_tol: Retention time tolerance (seconds)
3378
+ mz_tol: M/z tolerance (Da)
3095
3379
 
3096
3380
  Returns:
3097
3381
  Tuple of (adduct_group_list, adduct_of_list)
3098
3382
  """
3383
+
3099
3384
  if not consensus_data:
3100
3385
  return [], []
3101
-
3386
+
3102
3387
  n_features = len(consensus_data)
3103
- if n_features > 10000:
3104
- study.logger.info(f"Adduct grouping for {n_features} consensus features...")
3105
- else:
3106
- study.logger.debug(f"Adduct grouping for {n_features} consensus features...")
3388
+ study.logger.info(f"Starting adduct grouping for {n_features} features")
3107
3389
 
3108
- # Build spatial index using RT and neutral mass as coordinates
3109
- features_by_mass = defaultdict(list)
3110
- mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
3111
-
3112
- valid_features = []
3113
- for feature in consensus_data:
3114
- consensus_uid = feature["consensus_uid"]
3390
+ # Step 1: Build local intensity matrix ONCE
3391
+ try:
3392
+ intensity_matrix_pd = _get_features_matrix(study, consensus_data, quant_col="inty")
3393
+
3394
+ if intensity_matrix_pd is None or len(intensity_matrix_pd) == 0:
3395
+ study.logger.warning("Could not build local intensity matrix - creating single-feature groups")
3396
+ adduct_group_list = list(range(1, len(consensus_data) + 1))
3397
+ adduct_of_list = [0] * len(consensus_data)
3398
+ return adduct_group_list, adduct_of_list
3399
+
3400
+ study.logger.info(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
3401
+
3402
+ except Exception as e:
3403
+ study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
3404
+ adduct_group_list = list(range(1, len(consensus_data) + 1))
3405
+ adduct_of_list = [0] * len(consensus_data)
3406
+ return adduct_group_list, adduct_of_list
3407
+
3408
+ # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
3409
+ adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
3410
+ study.logger.info(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
3411
+
3412
+ # Build hash map for O(1) mass shift lookup
3413
+ mass_shift_map = {} # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
3414
+ for mass_delta, joint_likelihood, adduct1, adduct2 in adduct_pairs_with_likelihood:
3415
+ key = round(mass_delta / mz_tol) * mz_tol # Round to tolerance grid
3416
+ if key not in mass_shift_map:
3417
+ mass_shift_map[key] = []
3418
+ mass_shift_map[key].append((joint_likelihood, adduct1, adduct2))
3419
+
3420
+ # Sort each mass shift group by likelihood (highest first)
3421
+ for key in mass_shift_map:
3422
+ mass_shift_map[key].sort(key=lambda x: x[0], reverse=True)
3423
+
3424
+ # Step 3: Pre-compute feature properties and sort by RT for spatial filtering
3425
+ feature_props = []
3426
+ for i, feature in enumerate(consensus_data):
3427
+ uid = feature["consensus_uid"]
3115
3428
  rt = feature["rt"]
3116
- neutral_mass = feature.get("adduct_mass_neutral_top")
3429
+ mz = feature["mz"]
3117
3430
  intensity = feature.get("inty_mean", 0)
3118
- adduct = feature.get("adduct_top", "")
3119
3431
 
3120
- if neutral_mass is not None:
3121
- mass_bin = int(neutral_mass / mass_bin_size)
3122
- features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
3123
- valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
3432
+ # Get matrix vector once
3433
+ matrix_vector = intensity_matrix_pd.loc[uid].values if uid in intensity_matrix_pd.index else None
3434
+
3435
+ feature_props.append({
3436
+ 'index': i,
3437
+ 'uid': uid,
3438
+ 'rt': rt,
3439
+ 'mz': mz,
3440
+ 'intensity': intensity,
3441
+ 'vector': matrix_vector,
3442
+ 'feature': feature
3443
+ })
3124
3444
 
3125
- # Union-Find for efficient grouping
3126
- class UnionFind:
3127
- def __init__(study, n):
3128
- study.parent = list(range(n))
3129
- study.rank = [0] * n
3130
-
3131
- def find(study, x):
3132
- if study.parent[x] != x:
3133
- study.parent[x] = study.find(study.parent[x])
3134
- return study.parent[x]
3135
-
3136
- def union(study, x, y):
3137
- px, py = study.find(x), study.find(y)
3138
- if px == py:
3139
- return
3140
- if study.rank[px] < study.rank[py]:
3141
- px, py = py, px
3142
- study.parent[py] = px
3143
- if study.rank[px] == study.rank[py]:
3144
- study.rank[px] += 1
3145
-
3146
- uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
3147
- uf = UnionFind(len(valid_features))
3148
-
3149
- # Find groups using spatial index
3150
- checked_pairs = set()
3151
- for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
3152
- for bin_offset in [-1, 0, 1]:
3153
- check_bin = bin1 + bin_offset
3154
- if check_bin not in features_by_mass:
3155
- continue
3156
-
3157
- for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
3158
- if uid1 >= uid2:
3159
- continue
3445
+ # Sort by RT for efficient spatial filtering
3446
+ feature_props.sort(key=lambda x: x['rt'])
3447
+
3448
+ # Initialize grouping structures
3449
+ uid_to_boss = {} # Hierarchical structure: uid -> boss_uid
3450
+ boss_to_members = {} # boss_uid -> [member_uids]
3451
+ processed_uids = set()
3452
+
3453
+ # Step 4: Process features with optimized RT filtering
3454
+ for i, boss_prop in enumerate(feature_props):
3455
+ boss_uid = boss_prop['uid']
3456
+
3457
+ if boss_uid in processed_uids:
3458
+ continue
3459
+
3460
+ if boss_prop['vector'] is None:
3461
+ processed_uids.add(boss_uid)
3462
+ continue
3463
+
3464
+ # Initialize as boss
3465
+ if boss_uid not in uid_to_boss:
3466
+ uid_to_boss[boss_uid] = boss_uid
3467
+ boss_to_members[boss_uid] = []
3468
+
3469
+ boss_rt = boss_prop['rt']
3470
+ boss_mz = boss_prop['mz']
3471
+ boss_vector = boss_prop['vector']
3472
+
3473
+ # Step 5: Efficient RT coelution filtering using sorted array
3474
+ candidate_pairs = []
3475
+
3476
+ # Search backwards from current position
3477
+ j = i - 1
3478
+ while j >= 0 and (boss_rt - feature_props[j]['rt']) <= rt_tol:
3479
+ candidate = feature_props[j]
3480
+ if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
3481
+ if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
3482
+ # Calculate mz difference and check mass shift
3483
+ mz_diff = abs(boss_mz - candidate['mz'])
3484
+ mass_shift_key = round(mz_diff / mz_tol) * mz_tol
3485
+
3486
+ if mass_shift_key in mass_shift_map:
3487
+ likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0] # Best likelihood
3488
+ candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
3489
+ j -= 1
3490
+
3491
+ # Search forwards from current position
3492
+ j = i + 1
3493
+ while j < len(feature_props) and (feature_props[j]['rt'] - boss_rt) <= rt_tol:
3494
+ candidate = feature_props[j]
3495
+ if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
3496
+ if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
3497
+ # Calculate mz difference and check mass shift
3498
+ mz_diff = abs(boss_mz - candidate['mz'])
3499
+ mass_shift_key = round(mz_diff / mz_tol) * mz_tol
3500
+
3501
+ if mass_shift_key in mass_shift_map:
3502
+ likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0] # Best likelihood
3503
+ candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
3504
+ j += 1
3505
+
3506
+ # Sort candidates by likelihood (descending) to prioritize chemically meaningful pairs
3507
+ candidate_pairs.sort(key=lambda x: x[1], reverse=True)
3508
+
3509
+ # Step 6: Process candidates in likelihood priority order
3510
+ for candidate_prop, likelihood, adduct_info in candidate_pairs:
3511
+ candidate_uid = candidate_prop['uid']
3512
+ candidate_vector = candidate_prop['vector']
3513
+
3514
+ # Correlation confirmation with optimized threshold
3515
+ try:
3516
+ correlation = _fast_correlation(boss_vector, candidate_vector)
3160
3517
 
3161
- pair = (min(uid1, uid2), max(uid1, uid2))
3162
- if pair in checked_pairs:
3518
+ if correlation < 0.5: # More permissive for legitimate adduct relationships
3163
3519
  continue
3164
- checked_pairs.add(pair)
3165
3520
 
3166
- mass_diff = abs(mass1 - mass2)
3167
- rt_diff = abs(rt1 - rt2) / 60.0 # Convert to minutes
3521
+ except Exception:
3522
+ continue
3523
+
3524
+ # Step 7: Hierarchical assignment (merge groups if needed)
3525
+ if candidate_uid in boss_to_members:
3526
+ old_members = boss_to_members[candidate_uid].copy()
3527
+ del boss_to_members[candidate_uid]
3168
3528
 
3169
- if mass_diff <= mz_tol and rt_diff <= rt_tol:
3170
- j = uid_to_idx[uid2]
3171
- uf.union(i, j)
3529
+ # Reassign old members to new boss
3530
+ for member in old_members:
3531
+ uid_to_boss[member] = boss_uid
3532
+ boss_to_members[boss_uid].append(member)
3533
+
3534
+ # Assign candidate to current boss
3535
+ uid_to_boss[candidate_uid] = boss_uid
3536
+ boss_to_members[boss_uid].append(candidate_uid)
3537
+ processed_uids.add(candidate_uid)
3538
+
3539
+ processed_uids.add(boss_uid)
3540
+
3541
+ # Step 8: Intensity-based ranking within groups (optimized)
3542
+ for boss_uid in list(boss_to_members.keys()):
3543
+ members = boss_to_members[boss_uid]
3544
+ if len(members) == 0:
3545
+ continue
3546
+
3547
+ all_group_members = [boss_uid] + members
3548
+
3549
+ # Find member with highest intensity efficiently
3550
+ max_intensity = -1
3551
+ new_boss = boss_uid
3552
+
3553
+ for member_uid in all_group_members:
3554
+ # Find member_uid in feature_props
3555
+ member_intensity = next((fp['intensity'] for fp in feature_props if fp['uid'] == member_uid), 0)
3556
+ if member_intensity > max_intensity:
3557
+ max_intensity = member_intensity
3558
+ new_boss = member_uid
3559
+
3560
+ # Update boss if needed
3561
+ if new_boss != boss_uid:
3562
+ boss_to_members[new_boss] = [m for m in all_group_members if m != new_boss]
3563
+ del boss_to_members[boss_uid]
3564
+
3565
+ # Update all member references
3566
+ for member in all_group_members:
3567
+ uid_to_boss[member] = new_boss
3172
3568
 
3173
- # Extract groups
3174
- groups_by_root = defaultdict(list)
3175
- for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
3176
- root = uf.find(i)
3177
- groups_by_root[root].append(valid_features[i])
3569
+ # Count and log results
3570
+ total_groups = len(boss_to_members)
3571
+ multi_member_groups = sum(1 for members in boss_to_members.values() if len(members) > 0)
3572
+ total_grouped_features = sum(len(members) + 1 for members in boss_to_members.values())
3178
3573
 
3179
- groups = {}
3180
- group_id = 1
3181
- assigned_groups = {}
3574
+ study.logger.info(f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)")
3182
3575
 
3183
- for group_members in groups_by_root.values():
3184
- member_uids = [uid for uid, _, _, _, _, _ in group_members]
3185
-
3186
- for uid in member_uids:
3187
- assigned_groups[uid] = group_id
3188
- groups[group_id] = member_uids
3189
- group_id += 1
3576
+ # Step 9: Convert to return format (optimized)
3577
+ uid_to_index = {fp['uid']: fp['index'] for fp in feature_props}
3578
+ adduct_group_list = [0] * n_features
3579
+ adduct_of_list = [0] * n_features
3190
3580
 
3191
- # Handle features without neutral mass
3192
- for feature in consensus_data:
3193
- uid = feature["consensus_uid"]
3194
- if uid not in assigned_groups:
3195
- assigned_groups[uid] = group_id
3196
- groups[group_id] = [uid]
3197
- group_id += 1
3198
-
3199
- # Determine adduct_of for each group
3200
- group_adduct_of = {}
3201
- for grp_id, member_uids in groups.items():
3202
- best_uid = None
3203
- best_priority = -1
3204
- best_intensity = 0
3205
-
3206
- for uid in member_uids:
3207
- feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
3208
- if not feature_data:
3209
- continue
3210
-
3211
- adduct = feature_data.get("adduct_top", "")
3212
- intensity = feature_data.get("inty_mean", 0)
3213
-
3214
- priority = 0
3215
- if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
3216
- priority = 3
3217
- elif adduct and "[M-H]" in adduct:
3218
- priority = 2
3219
- elif adduct and "M" in adduct:
3220
- priority = 1
3221
-
3222
- if priority > best_priority or (priority == best_priority and intensity > best_intensity):
3223
- best_uid = uid
3224
- best_priority = priority
3225
- best_intensity = intensity
3581
+ group_counter = 1
3582
+ for boss_uid, members in boss_to_members.items():
3583
+ # Assign boss
3584
+ boss_idx = uid_to_index[boss_uid]
3585
+ adduct_group_list[boss_idx] = group_counter
3586
+ adduct_of_list[boss_idx] = 0
3587
+
3588
+ # Assign members
3589
+ for member_uid in members:
3590
+ member_idx = uid_to_index[member_uid]
3591
+ adduct_group_list[member_idx] = group_counter
3592
+ adduct_of_list[member_idx] = boss_uid
3226
3593
 
3227
- group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
3594
+ group_counter += 1
3228
3595
 
3229
- # Build final lists in same order as consensus_data
3230
- adduct_group_list = []
3231
- adduct_of_list = []
3596
+ # Handle ungrouped features
3597
+ for i in range(n_features):
3598
+ if adduct_group_list[i] == 0:
3599
+ adduct_group_list[i] = group_counter
3600
+ adduct_of_list[i] = 0
3601
+ group_counter += 1
3232
3602
 
3233
- for feature in consensus_data:
3234
- uid = feature["consensus_uid"]
3235
- group = assigned_groups.get(uid, 0)
3236
- adduct_of = group_adduct_of.get(group, uid)
3603
+ return adduct_group_list, adduct_of_list
3604
+
3605
+
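
Two details of __merge_adduct_grouping deserve a compact illustration: the mass-shift lookup snaps each observed delta m/z onto an mz_tol grid before probing the hash map, and the result comes back as two parallel lists, one group id per feature plus the uid of the group's boss (0 for the boss itself and for singletons). A toy example with made-up numbers:

    mz_tol = 0.01
    # Catalog pair: [M+Na]1+ sits 21.98194 Da above [M+H]1+ (standard value, assumed).
    mass_shift_map = {round(21.98194 / mz_tol) * mz_tol: [(1.6, "[M+H]1+", "[M+Na]1+")]}

    observed = 203.0524 - 181.0707            # 21.9817 Da, slightly off the exact spacing
    key = round(observed / mz_tol) * mz_tol   # snaps onto the same 21.98 grid point
    print(key in mass_shift_map)              # True

    # Return format: if consensus_data holds uids [11, 12, 13] and 12 joins a group
    # whose boss is 11 while 13 stays alone, the function yields
    # adduct_group_list = [1, 1, 2] and adduct_of_list = [0, 11, 0].
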
3606
+ def _fast_correlation(x, y):
3607
+ """
3608
+ Fast correlation coefficient calculation for consensus matrix data.
3609
+
3610
+ In the consensus matrix:
3611
+ - Negative values (typically -1.0) indicate missing features
3612
+ - Zero and positive values are actual intensities
3613
+ - Only consider intensities >= 1000 for meaningful correlation
3614
+
3615
+ Args:
3616
+ x, y: numpy arrays of the same length
3237
3617
 
3238
- adduct_group_list.append(group)
3239
- adduct_of_list.append(adduct_of)
3618
+ Returns:
3619
+ Correlation coefficient (float), 0 if cannot be calculated
3620
+ """
3621
+ import numpy as np
3240
3622
 
3241
- if n_features > 10000:
3242
- study.logger.info("Adduct grouping completed.")
3243
- else:
3244
- study.logger.debug("Adduct grouping completed.")
3245
-
3246
- return adduct_group_list, adduct_of_list
3623
+ # For consensus matrix: exclude negative values (missing features) and very low intensities
3624
+ # Use a very low threshold since processed matrix values are often scaled/normalized
3625
+ valid = ~(np.isnan(x) | np.isnan(y) | (x < 0) | (y < 0) | (x < 0.1) | (y < 0.1))
3626
+
3627
+ if np.sum(valid) < 3: # Need at least 3 valid pairs
3628
+ return 0.0
3629
+
3630
+ x_valid = x[valid]
3631
+ y_valid = y[valid]
3632
+
3633
+ # If all values are the same (e.g., all zeros), correlation is undefined
3634
+ if np.var(x_valid) == 0 or np.var(y_valid) == 0:
3635
+ return 0.0
3636
+
3637
+ # Fast correlation using numpy
3638
+ try:
3639
+ correlation_matrix = np.corrcoef(x_valid, y_valid)
3640
+ correlation = correlation_matrix[0, 1]
3641
+
3642
+ # Handle NaN result
3643
+ if np.isnan(correlation):
3644
+ return 0.0
3645
+
3646
+ return correlation
3647
+
3648
+ except Exception:
3649
+ return 0.0
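
This second _fast_correlation appears to shadow the earlier definition above and adds a stricter mask: negative entries are treated as missing features and anything below 0.1 is ignored. A quick check of that masking rule on made-up vectors, assuming the function is importable as written:

    import numpy as np

    x = np.array([-1.0, 1200.0, 3400.0, 0.05, 900.0])
    y = np.array([500.0, 2500.0, 7000.0, 800.0, 1800.0])

    # Position 0 (missing in x) and position 3 (below the 0.1 floor in x) are
    # dropped; the remaining pairs (1200, 2500), (3400, 7000), (900, 1800)
    # co-vary almost perfectly.
    print(round(_fast_correlation(x, y), 3))  # close to 1.0
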