masster 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +58 -43
- masster/sample/h5.py +1 -1
- masster/sample/plot.py +4 -4
- masster/sample/processing.py +3 -3
- masster/sample/save.py +5 -5
- masster/study/h5.py +1 -1
- masster/study/helpers.py +150 -5
- masster/study/id.py +4 -4
- masster/study/merge.py +565 -162
- masster/study/processing.py +2 -2
- masster/study/study.py +2 -1
- {masster-0.5.7.dist-info → masster-0.5.9.dist-info}/METADATA +1 -1
- {masster-0.5.7.dist-info → masster-0.5.9.dist-info}/RECORD +17 -17
- {masster-0.5.7.dist-info → masster-0.5.9.dist-info}/WHEEL +0 -0
- {masster-0.5.7.dist-info → masster-0.5.9.dist-info}/entry_points.txt +0 -0
- {masster-0.5.7.dist-info → masster-0.5.9.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
CHANGED
@@ -441,8 +441,7 @@ def merge(study, **kwargs) -> None:
     cached_adducts_df = None
     cached_valid_adducts = None
     try:
-
-        cached_adducts_df = _get_adducts(study)
+        cached_adducts_df = study._get_adducts()
         if not cached_adducts_df.is_empty():
            cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
@@ -819,7 +818,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
                 serialized_chunk_results.append((chunk_start_idx, consensus_features))
                 completed_chunks += 1
                 n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-                study.logger.
+                study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
             except Exception as exc:
                 # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
                 if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -853,7 +852,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
                 serialized_chunk_results.append((chunk_start_idx, consensus_features))
                 completed_chunks += 1
                 n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-                study.logger.
+                study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
             except Exception as exc:
                 study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                 raise exc
@@ -994,7 +993,7 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
                 serialized_chunk_results.append((chunk_start_idx, consensus_features))
                 completed_chunks += 1
                 n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-                study.logger.
+                study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
             except Exception as exc:
                 # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
                 if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -1028,7 +1027,7 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
                 serialized_chunk_results.append((chunk_start_idx, consensus_features))
                 completed_chunks += 1
                 n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
-                study.logger.
+                study.logger.success(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
             except Exception as exc:
                 study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
                 raise exc
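For orientation, the four near-identical hunks above sit inside a submit-and-collect loop over a process pool, where a per-chunk `success` log replaces a line the old code left truncated. A minimal sketch of that shape, not masster's actual code: `process_chunk` is a hypothetical stand-in for the real per-chunk merge, and plain `logging` stands in for the loguru-style `study.logger.success`.

```python
from concurrent.futures import ProcessPoolExecutor, as_completed
from concurrent.futures.process import BrokenProcessPool
import logging

logger = logging.getLogger(__name__)

def process_chunk(chunk):
    # Hypothetical stand-in for the real per-chunk consensus merge.
    return len(chunk)

def run_chunks(chunks):
    completed = 0
    with ProcessPoolExecutor() as pool:
        futures = {pool.submit(process_chunk, c): i for i, c in enumerate(chunks)}
        for fut in as_completed(futures):
            idx = futures[fut]
            try:
                fut.result()
                completed += 1
                logger.info("Completed chunk %d/%d", completed, len(chunks))
            except Exception as exc:
                # Same check as the diff: Windows multiprocessing can surface BrokenProcessPool here
                if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
                    logger.warning("Chunk %d: process pool broke; a serial fallback is needed", idx)
                else:
                    raise
```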
@@ -2258,6 +2257,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
             {
                 "consensus_uid": row["consensus_uid"],
                 "rt": row["rt"],
+                "mz": row["mz"],  # Add missing mz field
                 "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                 "adduct_top": row.get("adduct_top"),
                 "inty_mean": row.get("inty_mean", 0),
@@ -2265,8 +2265,9 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
     )

     # Use optimized adduct grouping
+    study.logger.info(f"About to call adduct grouping for {len(consensus_data)} consensus features")
     adduct_group_list, adduct_of_list = __merge_adduct_grouping(
-        study, consensus_data, rt_tol, mz_tol
+        study, consensus_data, rt_tol/3, mz_tol
     )

     # Add the new columns to consensus_df
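One behavioural change worth flagging in the hunk above: the grouping call now receives `rt_tol/3`, so the effective coelution window for adduct grouping is a third of the merge-level RT tolerance. Illustrative arithmetic (the value of `rt_tol` here is hypothetical):

```python
rt_tol = 6.0             # merge-level RT tolerance in seconds (illustrative value)
grouping_tol = rt_tol / 3
print(grouping_tol)      # 2.0 -- the window actually passed to __merge_adduct_grouping
```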
@@ -2713,8 +2714,6 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
        cached_adducts_df: Pre-computed adducts DataFrame for performance
    """
    import polars as pl
-    import numpy as np
-    from collections import defaultdict

    # Check if consensus_df exists and has features
    if len(study.consensus_df) == 0:
@@ -2727,8 +2726,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
    if cached_adducts_df is None or cached_adducts_df.is_empty():
        try:
            # Use lower min_probability for better adduct coverage in mass shift identification
-
-            cached_adducts_df = _get_adducts(study, min_probability=0.01)
+            cached_adducts_df = study._get_adducts(min_probability=0.01)
        except Exception as e:
            study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
            return
@@ -2822,9 +2820,8 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
        mz1 = feature1["mz"]
        adduct1 = feature1["adduct_top"]

-        #
-        #
-        # continue
+        # Conservative approach: Don't skip features here - let algorithm find pairs first
+        # We'll check for inappropriate assignments later in the pair processing logic

        # Search for coeluting features within strict RT tolerance
        for j in range(i + 1, n_features):
@@ -2838,9 +2835,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
            mz2 = feature2["mz"]
            adduct2 = feature2["adduct_top"]

-            #
-            # if adduct2 and "?" not in adduct2:
-            # continue
+            # Conservative approach: Don't skip feature2 here either - process all potential pairs

            # Calculate observed m/z difference
            mz_diff = mz2 - mz1
@@ -2890,24 +2885,45 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
            else:
                # Assignment based on mass shift direction
                # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
-                # If catalog_shift > 0: from_adduct has higher
-                # If catalog_shift < 0: from_adduct has lower
-                # observed mz_diff = mz2 - mz1
-                #
-                #
+                # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
+                # If catalog_shift < 0: from_adduct has lower mass shift than to_adduct
+                # observed mz_diff = mz2 - mz1 (always positive for mz2 > mz1)
+                #
+                # CRITICAL FIX: Correct assignment logic
+                # When mz_diff matches positive catalog_shift:
+                # - from_adduct is the heavier adduct (higher mass shift)
+                # - to_adduct is the lighter adduct (lower mass shift)
+                # - Higher m/z feature should get the heavier adduct (from_adduct)
+                # - Lower m/z feature should get the lighter adduct (to_adduct)

                if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
                    # mz_diff matches catalog_shift direction
-
-
-
-
+                    if catalog_shift > 0:
+                        # from_adduct is heavier, to_adduct is lighter
+                        from_feature = feature2  # Higher m/z gets heavier adduct
+                        to_feature = feature1  # Lower m/z gets lighter adduct
+                        from_adduct_name = best_rel["from_adduct"]  # Heavier adduct
+                        to_adduct_name = best_rel["to_adduct"]  # Lighter adduct
+                    else:
+                        # from_adduct is lighter, to_adduct is heavier
+                        from_feature = feature1  # Lower m/z gets lighter adduct
+                        to_feature = feature2  # Higher m/z gets heavier adduct
+                        from_adduct_name = best_rel["from_adduct"]  # Lighter adduct
+                        to_adduct_name = best_rel["to_adduct"]  # Heavier adduct
                else:
                    # mz_diff matches reverse direction of catalog_shift
-
-
-
-
+                    if catalog_shift > 0:
+                        # Reverse: from_adduct becomes lighter, to_adduct becomes heavier
+                        from_feature = feature1  # Lower m/z gets lighter adduct
+                        to_feature = feature2  # Higher m/z gets heavier adduct
+                        from_adduct_name = best_rel["to_adduct"]  # Now lighter adduct
+                        to_adduct_name = best_rel["from_adduct"]  # Now heavier adduct
+                    else:
+                        # Reverse: from_adduct becomes heavier, to_adduct becomes lighter
+                        from_feature = feature2  # Higher m/z gets heavier adduct
+                        to_feature = feature1  # Lower m/z gets lighter adduct
+                        from_adduct_name = best_rel["to_adduct"]  # Now heavier adduct
+                        to_adduct_name = best_rel["from_adduct"]  # Now lighter adduct

                # Get adduct details from catalog
                from_adduct_info = adduct_info.get(from_adduct_name, {})
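A worked instance of the direction rule above, using standard monoisotopic shifts (illustrative numbers, not masster output): take `from_adduct = [M+Na]+` (shift 22.98922 Da) and `to_adduct = [M+H]+` (shift 1.00728 Da), both singly charged.

```python
ms_from, ms_to, charge = 22.98922, 1.00728, 1
catalog_shift = (ms_from - ms_to) / abs(charge)   # +21.98194 -> from_adduct is heavier

mz1, mz2 = 151.0, 172.982                         # hypothetical coeluting pair, mz2 > mz1
mz_diff = mz2 - mz1                               # ~21.982

# mz_diff is closer to +catalog_shift than to -catalog_shift, and catalog_shift > 0,
# so the higher-m/z feature (mz2) gets the heavier adduct [M+Na]+ and the
# lower-m/z feature (mz1) gets the lighter adduct [M+H]+.
assert abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift))
```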
@@ -2922,7 +2938,40 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
                from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
                to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift

-                #
+                # Smart conservative check: prevent inappropriate assignments to isolated features
+                # Check if both features are isolated (single-member groups) with [M+?]1+ assignments
+                def is_isolated_unknown_feature(feature):
+                    """Check if a feature is isolated with unknown adduct"""
+                    if not feature["adduct_top"] or "[M+?]" not in feature["adduct_top"]:
+                        return False  # Not unknown, safe to process
+
+                    # Check group size
+                    try:
+                        feature_row = study.consensus_df.filter(study.consensus_df["consensus_uid"] == feature["consensus_uid"])
+                        if len(feature_row) > 0:
+                            adduct_group = feature_row["adduct_group"].iloc[0]
+                            if adduct_group > 0:
+                                group_members = study.consensus_df.filter(study.consensus_df["adduct_group"] == adduct_group)
+                                return len(group_members) <= 1  # Isolated if group size <= 1
+                    except Exception:
+                        pass
+                    return True  # Default to isolated if can't determine
+
+                from_isolated = is_isolated_unknown_feature(from_feature)
+                to_isolated = is_isolated_unknown_feature(to_feature)
+
+                # Only skip assignment if BOTH features are isolated AND would get the SAME adduct
+                # (This prevents inappropriate duplicate assignments to isolated features)
+                skip_assignment = (from_isolated and to_isolated and from_adduct_name == to_adduct_name)
+
+                if skip_assignment:
+                    study.logger.debug(
+                        f"Skipping inappropriate assignment: both isolated features would get {from_adduct_name} "
+                        f"(UIDs {from_feature['consensus_uid']}, {to_feature['consensus_uid']})"
+                    )
+                    continue  # Skip this pair, continue to next relationship
+
+                # Store updates (legitimate pair or at least one feature already has specific adduct)
                adduct_updates[from_feature["consensus_uid"]] = {
                    "adduct_top": from_adduct_name,
                    "adduct_charge_top": from_charge,
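One caveat in `is_isolated_unknown_feature` above: elsewhere in this diff `consensus_df` behaves like a polars DataFrame (`is_empty()`, `to_pandas()`), yet `.iloc[0]` is a pandas accessor. Assuming `consensus_df` is indeed polars, that line raises, the bare `except` swallows the error, and the helper falls through to `return True` (isolated) for every unknown-adduct feature. A polars-native version of the lookup, as a small self-contained sketch with hypothetical data:

```python
import polars as pl

df = pl.DataFrame({"consensus_uid": [1, 2, 3], "adduct_group": [7, 7, 0]})

feature_row = df.filter(pl.col("consensus_uid") == 1)
adduct_group = feature_row["adduct_group"][0]       # polars indexing; .iloc would raise here
group_members = df.filter(pl.col("adduct_group") == adduct_group)
print(len(group_members) <= 1)                      # False -> feature 1 is not isolated
```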
@@ -3083,164 +3132,518 @@ def __merge_feature_lookup(study_obj, features_df):
     return features_lookup


+def _get_features_matrix(study, consensus_data, quant_col="inty"):
+    """
+    Create a local intensity matrix from features_df for correlation calculations.
+
+    Args:
+        study: Study object with features_df and samples_df
+        consensus_data: List of consensus feature dictionaries
+        quant_col: Column name to use for quantification (default: "inty")
+
+    Returns:
+        pandas.DataFrame: Matrix with consensus_uid as index, sample names as columns
+    """
+    import pandas as pd
+    import numpy as np
+
+    # Get all sample names
+    sample_names = study.samples_df["sample_name"].to_list()
+    consensus_uids = [int(f["consensus_uid"]) for f in consensus_data]
+
+    # Initialize matrix with zeros
+    matrix_data = pd.DataFrame(
+        index=pd.Index(consensus_uids, name="consensus_uid"),
+        columns=sample_names,
+        data=0.0,
+        dtype=float
+    )
+
+    study.logger.debug(f"Building local features matrix: {len(consensus_uids)} features x {len(sample_names)} samples")
+
+    # Fill matrix with actual intensity values
+    features_df_pandas = study.features_df.to_pandas()
+    samples_df_pandas = study.samples_df.to_pandas()
+    consensus_mapping_pandas = study.consensus_mapping_df.to_pandas()
+
+    # Create sample_uid to sample_name mapping
+    uid_to_name = dict(zip(samples_df_pandas["sample_uid"], samples_df_pandas["sample_name"]))
+
+    # For each consensus feature, get intensities from all samples
+    for consensus_uid in consensus_uids:
+        # Get all feature_uids that map to this consensus_uid
+        feature_mappings = consensus_mapping_pandas[
+            consensus_mapping_pandas["consensus_uid"] == consensus_uid
+        ]
+
+        for _, mapping in feature_mappings.iterrows():
+            feature_uid = mapping["feature_uid"]
+            sample_uid = mapping["sample_uid"]
+            sample_name = uid_to_name.get(sample_uid, f"sample_{sample_uid}")
+
+            # Get intensity for this feature
+            feature_row = features_df_pandas[
+                (features_df_pandas["feature_uid"] == feature_uid) &
+                (features_df_pandas["sample_uid"] == sample_uid)
+            ]
+
+            if len(feature_row) > 0:
+                intensity = feature_row[quant_col].iloc[0]
+                if pd.notna(intensity):
+                    matrix_data.loc[consensus_uid, sample_name] = float(intensity)
+
+    # Convert any remaining NaN to 0
+    matrix_data = matrix_data.fillna(0.0)
+
+    study.logger.debug(f"Local matrix built successfully with shape {matrix_data.shape}")
+
+    return matrix_data
+
+
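The nested `iterrows()` scan in `_get_features_matrix` above does one boolean-mask filter per mapping row; the same matrix can be built with one merge and one pivot. A minimal sketch under the column names shown in the diff (`feature_uid`, `sample_uid`, `consensus_uid`); `features_matrix_vectorized` is a hypothetical helper, not part of masster:

```python
import pandas as pd

def features_matrix_vectorized(mapping_df, features_df, uid_to_name, quant_col="inty"):
    """Sketch: build the same consensus x sample matrix with one merge + pivot."""
    joined = mapping_df.merge(
        features_df[["feature_uid", "sample_uid", quant_col]],
        on=["feature_uid", "sample_uid"],
        how="left",
    )
    joined["sample_name"] = joined["sample_uid"].map(uid_to_name)
    return (
        joined.pivot_table(index="consensus_uid", columns="sample_name",
                           values=quant_col, aggfunc="first")
              .fillna(0.0)
    )
```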
+def _get_adduct_deltas_with_likelihood(study):
+    """
+    Extract all pairwise mass differences between adducts with joint likelihood scoring.
+
+    Args:
+        study: Study object with _get_adducts method
+
+    Returns:
+        List of tuples: (mass_delta, joint_likelihood, adduct1_name, adduct2_name)
+        Sorted by joint_likelihood descending (most likely pairs first)
+    """
+    try:
+        adducts_df = study._get_adducts()
+
+        if adducts_df is None or adducts_df.is_empty():
+            study.logger.warning("No adducts dataframe available for study")
+            return []
+
+        # Convert to pandas for easier manipulation
+        adducts_pd = adducts_df.to_pandas()
+
+        # Check if we have likelihood/probability information
+        likelihood_col = None
+        for col in ['likelihood', 'probability', 'freq', 'frequency', 'score']:
+            if col in adducts_pd.columns:
+                likelihood_col = col
+                break
+
+        # If no likelihood column, estimate based on adduct type
+        if likelihood_col is None:
+            adducts_pd['estimated_likelihood'] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
+            likelihood_col = 'estimated_likelihood'
+
+        # Get mass column (try different possible column names)
+        mass_col = None
+        for col_name in ['mass_shift', 'mass', 'mass_shift_da', 'mass_da']:
+            if col_name in adducts_pd.columns:
+                mass_col = col_name
+                break
+
+        if mass_col is None:
+            study.logger.warning(f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}")
+            return []
+
+        # Calculate all pairwise differences with joint likelihoods
+        adduct_pairs = []
+        for i in range(len(adducts_pd)):
+            for j in range(i + 1, len(adducts_pd)):
+                row_i = adducts_pd.iloc[i]
+                row_j = adducts_pd.iloc[j]
+
+                # Skip if masses are NaN or invalid
+                if (hasattr(row_i[mass_col], '__iter__') and not isinstance(row_i[mass_col], str)) or \
+                   (hasattr(row_j[mass_col], '__iter__') and not isinstance(row_j[mass_col], str)):
+                    continue
+
+                mass_i = float(row_i[mass_col])
+                mass_j = float(row_j[mass_col])
+                delta = abs(mass_i - mass_j)
+
+                if delta > 0.1:  # Only meaningful mass differences
+                    # Joint likelihood is sum of individual likelihoods
+                    joint_likelihood = float(row_i[likelihood_col]) + float(row_j[likelihood_col])
+
+                    adduct1_name = row_i.get('adduct', row_i.get('name', f'adduct_{i}'))
+                    adduct2_name = row_j.get('adduct', row_j.get('name', f'adduct_{j}'))
+
+                    # CRITICAL FIX: Order adducts consistently from lower mass to higher mass
+                    # This ensures consistent assignment: lower mass adduct = from_adduct, higher mass adduct = to_adduct
+                    if mass_i <= mass_j:
+                        # row_i has lower or equal mass shift -> from_adduct
+                        # row_j has higher mass shift -> to_adduct
+                        adduct_pairs.append((round(delta, 4), joint_likelihood, adduct1_name, adduct2_name))
+                    else:
+                        # row_j has lower mass shift -> from_adduct
+                        # row_i has higher mass shift -> to_adduct
+                        adduct_pairs.append((round(delta, 4), joint_likelihood, adduct2_name, adduct1_name))
+
+        # Sort by joint likelihood descending (most likely pairs first)
+        adduct_pairs.sort(key=lambda x: x[1], reverse=True)
+
+        study.logger.debug(f"Extracted {len(adduct_pairs)} adduct pairs with likelihood scoring")
+        return adduct_pairs
+
+    except Exception as e:
+        study.logger.warning(f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list.")
+        return []
+
+
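A worked example of the pairing logic above, with standard monoisotopic mass shifts and the same `> 0.1` Da cutoff and lower-shift-first ordering (illustrative values, not masster output):

```python
from itertools import combinations

# Standard monoisotopic mass shifts (Da) for common positive adducts.
shifts = {"[M+H]+": 1.00728, "[M+Na]+": 22.98922, "[M+NH4]+": 18.03383}
likelihood = {"[M+H]+": 0.9, "[M+Na]+": 0.7, "[M+NH4]+": 0.6}

pairs = []
for a, b in combinations(shifts, 2):
    lo, hi = sorted((a, b), key=shifts.get)   # from_adduct = lower shift, to_adduct = higher
    delta = round(shifts[hi] - shifts[lo], 4)
    if delta > 0.1:                           # same cutoff as the diff
        pairs.append((delta, likelihood[a] + likelihood[b], lo, hi))
pairs.sort(key=lambda p: p[1], reverse=True)

print(pairs[0])   # (21.9819, 1.6, '[M+H]+', '[M+Na]+') -- the most likely pair
```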
+def _estimate_adduct_likelihood(adduct_row):
+    """
+    Estimate likelihood of an adduct based on common knowledge.
+
+    Args:
+        adduct_row: pandas Series with adduct information
+
+    Returns:
+        float: Estimated likelihood (0.0 to 1.0)
+    """
+    adduct_name = str(adduct_row.get('adduct', adduct_row.get('name', ''))).lower()
+
+    # Common likelihood estimates based on adduct frequency in positive mode
+    likelihood_map = {
+        '[m+h]': 0.9,        # Most common
+        '[m+na]': 0.7,       # Very common
+        '[m+nh4]': 0.6,      # Common
+        '[m+k]': 0.3,        # Less common
+        '[m+2h]': 0.2,       # Doubly charged, less frequent
+        '[m+3h]': 0.1,       # Triply charged, rare
+        '[m+h-h2o]': 0.4,    # Loss adducts, moderately common
+    }
+
+    # Find best match
+    for pattern, likelihood in likelihood_map.items():
+        if pattern in adduct_name:
+            return likelihood
+
+    # Default for unknown adducts
+    return 0.2
+
+
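A quick check of the fallback heuristic above, with hypothetical rows (this assumes the private name is importable from the module shown in this diff):

```python
import pandas as pd
from masster.study.merge import _estimate_adduct_likelihood  # private name, illustrative import

rows = pd.DataFrame({"name": ["[M+H]+", "[M+K]+", "[M+weird]+"]})
print(rows.apply(_estimate_adduct_likelihood, axis=1).tolist())  # [0.9, 0.3, 0.2]
```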
+def _get_adduct_deltas(study):
+    """
+    Extract all pairwise mass differences between adducts from study adducts data.
+
+    Args:
+        study: Study object with _get_adducts method
+
+    Returns:
+        List of mass differences (deltas) for adduct filtering
+    """
+    # Use the enhanced function and extract just the deltas for backward compatibility
+    adduct_pairs = _get_adduct_deltas_with_likelihood(study)
+    return [pair[0] for pair in adduct_pairs]  # Extract just the mass deltas
+
+
+def _fast_correlation(vec1, vec2):
+    """
+    Fast Pearson correlation coefficient calculation.
+    Optimized for repeated use in tight loops.
+    """
+    if len(vec1) != len(vec2):
+        return 0.0
+
+    # Remove NaN values and corresponding positions
+    mask = ~(np.isnan(vec1) | np.isnan(vec2))
+    if np.sum(mask) < 2:  # Need at least 2 valid points
+        return 0.0
+
+    v1 = vec1[mask]
+    v2 = vec2[mask]
+
+    # Fast correlation using numpy built-in
+    try:
+        corr_matrix = np.corrcoef(v1, v2)
+        return corr_matrix[0, 1] if not np.isnan(corr_matrix[0, 1]) else 0.0
+    except Exception:
+        return 0.0
+
+
 def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
     """
-
+    Groups consensus features that represent the same molecule with different adducts.
+    Uses multi-step filtering:
+    1. Build local intensity matrix once
+    2. RT coelution filtering with spatial indexing
+    3. Mass shift validation with hash lookup
+    4. Hierarchical boss structure (prevent transitivity)
+    5. Correlation-based confirmation
+    6. Intensity-based ranking for final selection

     Args:
-        study: Study object
+        study: Study object
         consensus_data: List of consensus feature dictionaries
-        rt_tol:
-        mz_tol:
+        rt_tol: Retention time tolerance (seconds)
+        mz_tol: M/z tolerance (Da)

     Returns:
         Tuple of (adduct_group_list, adduct_of_list)
     """
+
     if not consensus_data:
         return [], []
-
+
     n_features = len(consensus_data)
-
-    study.logger.info(f"Adduct grouping for {n_features} consensus features...")
-    else:
-        study.logger.debug(f"Adduct grouping for {n_features} consensus features...")
+    study.logger.info(f"Starting adduct grouping for {n_features} features")

-    #
-
-
-
-
-
-
+    # Step 1: Build local intensity matrix ONCE
+    try:
+        intensity_matrix_pd = _get_features_matrix(study, consensus_data, quant_col="inty")
+
+        if intensity_matrix_pd is None or len(intensity_matrix_pd) == 0:
+            study.logger.warning("Could not build local intensity matrix - creating single-feature groups")
+            adduct_group_list = list(range(1, len(consensus_data) + 1))
+            adduct_of_list = [0] * len(consensus_data)
+            return adduct_group_list, adduct_of_list
+
+        study.logger.info(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
+
+    except Exception as e:
+        study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
+        adduct_group_list = list(range(1, len(consensus_data) + 1))
+        adduct_of_list = [0] * len(consensus_data)
+        return adduct_group_list, adduct_of_list
+
+    # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
+    adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
+    study.logger.info(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
+
+    # Build hash map for O(1) mass shift lookup
+    mass_shift_map = {}  # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
+    for mass_delta, joint_likelihood, adduct1, adduct2 in adduct_pairs_with_likelihood:
+        key = round(mass_delta / mz_tol) * mz_tol  # Round to tolerance grid
+        if key not in mass_shift_map:
+            mass_shift_map[key] = []
+        mass_shift_map[key].append((joint_likelihood, adduct1, adduct2))
+
+    # Sort each mass shift group by likelihood (highest first)
+    for key in mass_shift_map:
+        mass_shift_map[key].sort(key=lambda x: x[0], reverse=True)
+
+    # Step 3: Pre-compute feature properties and sort by RT for spatial filtering
+    feature_props = []
+    for i, feature in enumerate(consensus_data):
+        uid = feature["consensus_uid"]
         rt = feature["rt"]
-
+        mz = feature["mz"]
         intensity = feature.get("inty_mean", 0)
-        adduct = feature.get("adduct_top", "")

-
-
-
-
+        # Get matrix vector once
+        matrix_vector = intensity_matrix_pd.loc[uid].values if uid in intensity_matrix_pd.index else None
+
+        feature_props.append({
+            'index': i,
+            'uid': uid,
+            'rt': rt,
+            'mz': mz,
+            'intensity': intensity,
+            'vector': matrix_vector,
+            'feature': feature
+        })

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Sort by RT for efficient spatial filtering
+    feature_props.sort(key=lambda x: x['rt'])
+
+    # Initialize grouping structures
+    uid_to_boss = {}  # Hierarchical structure: uid -> boss_uid
+    boss_to_members = {}  # boss_uid -> [member_uids]
+    processed_uids = set()
+
+    # Step 4: Process features with optimized RT filtering
+    for i, boss_prop in enumerate(feature_props):
+        boss_uid = boss_prop['uid']
+
+        if boss_uid in processed_uids:
+            continue
+
+        if boss_prop['vector'] is None:
+            processed_uids.add(boss_uid)
+            continue
+
+        # Initialize as boss
+        if boss_uid not in uid_to_boss:
+            uid_to_boss[boss_uid] = boss_uid
+            boss_to_members[boss_uid] = []
+
+        boss_rt = boss_prop['rt']
+        boss_mz = boss_prop['mz']
+        boss_vector = boss_prop['vector']
+
+        # Step 5: Efficient RT coelution filtering using sorted array
+        candidate_pairs = []
+
+        # Search backwards from current position
+        j = i - 1
+        while j >= 0 and (boss_rt - feature_props[j]['rt']) <= rt_tol:
+            candidate = feature_props[j]
+            if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
+                if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
+                    # Calculate mz difference and check mass shift
+                    mz_diff = abs(boss_mz - candidate['mz'])
+                    mass_shift_key = round(mz_diff / mz_tol) * mz_tol
+
+                    if mass_shift_key in mass_shift_map:
+                        likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0]  # Best likelihood
+                        candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
+            j -= 1
+
+        # Search forwards from current position
+        j = i + 1
+        while j < len(feature_props) and (feature_props[j]['rt'] - boss_rt) <= rt_tol:
+            candidate = feature_props[j]
+            if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
+                if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
+                    # Calculate mz difference and check mass shift
+                    mz_diff = abs(boss_mz - candidate['mz'])
+                    mass_shift_key = round(mz_diff / mz_tol) * mz_tol
+
+                    if mass_shift_key in mass_shift_map:
+                        likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0]  # Best likelihood
+                        candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
+            j += 1
+
+        # Sort candidates by likelihood (descending) to prioritize chemically meaningful pairs
+        candidate_pairs.sort(key=lambda x: x[1], reverse=True)
+
+        # Step 6: Process candidates in likelihood priority order
+        for candidate_prop, likelihood, adduct_info in candidate_pairs:
+            candidate_uid = candidate_prop['uid']
+            candidate_vector = candidate_prop['vector']
+
+            # Correlation confirmation with optimized threshold
+            try:
+                correlation = _fast_correlation(boss_vector, candidate_vector)

-
-            if pair in checked_pairs:
+                if correlation < 0.5:  # More permissive for legitimate adduct relationships
                     continue
-            checked_pairs.add(pair)

-
-
+            except Exception:
+                continue
+
+            # Step 7: Hierarchical assignment (merge groups if needed)
+            if candidate_uid in boss_to_members:
+                old_members = boss_to_members[candidate_uid].copy()
+                del boss_to_members[candidate_uid]

-
-
-
+                # Reassign old members to new boss
+                for member in old_members:
+                    uid_to_boss[member] = boss_uid
+                    boss_to_members[boss_uid].append(member)
+
+            # Assign candidate to current boss
+            uid_to_boss[candidate_uid] = boss_uid
+            boss_to_members[boss_uid].append(candidate_uid)
+            processed_uids.add(candidate_uid)
+
+        processed_uids.add(boss_uid)
+
+    # Step 8: Intensity-based ranking within groups (optimized)
+    for boss_uid in list(boss_to_members.keys()):
+        members = boss_to_members[boss_uid]
+        if len(members) == 0:
+            continue
+
+        all_group_members = [boss_uid] + members
+
+        # Find member with highest intensity efficiently
+        max_intensity = -1
+        new_boss = boss_uid
+
+        for member_uid in all_group_members:
+            # Find member_uid in feature_props
+            member_intensity = next((fp['intensity'] for fp in feature_props if fp['uid'] == member_uid), 0)
+            if member_intensity > max_intensity:
+                max_intensity = member_intensity
+                new_boss = member_uid
+
+        # Update boss if needed
+        if new_boss != boss_uid:
+            boss_to_members[new_boss] = [m for m in all_group_members if m != new_boss]
+            del boss_to_members[boss_uid]
+
+            # Update all member references
+            for member in all_group_members:
+                uid_to_boss[member] = new_boss

-    #
-
-
-
-            groups_by_root[root].append(valid_features[i])
+    # Count and log results
+    total_groups = len(boss_to_members)
+    multi_member_groups = sum(1 for members in boss_to_members.values() if len(members) > 0)
+    total_grouped_features = sum(len(members) + 1 for members in boss_to_members.values())

-    groups
-    group_id = 1
-    assigned_groups = {}
+    study.logger.info(f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)")

-
-
-
-
-        assigned_groups[uid] = group_id
-        groups[group_id] = member_uids
-        group_id += 1
+    # Step 9: Convert to return format (optimized)
+    uid_to_index = {fp['uid']: fp['index'] for fp in feature_props}
+    adduct_group_list = [0] * n_features
+    adduct_of_list = [0] * n_features

-
-    for
-
-
-
-
-
-
-
-
-
-
-        best_priority = -1
-        best_intensity = 0
-
-        for uid in member_uids:
-            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
-            if not feature_data:
-                continue
-
-            adduct = feature_data.get("adduct_top", "")
-            intensity = feature_data.get("inty_mean", 0)
-
-            priority = 0
-            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                priority = 3
-            elif adduct and "[M-H]" in adduct:
-                priority = 2
-            elif adduct and "M" in adduct:
-                priority = 1
-
-            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
-                best_uid = uid
-                best_priority = priority
-                best_intensity = intensity
+    group_counter = 1
+    for boss_uid, members in boss_to_members.items():
+        # Assign boss
+        boss_idx = uid_to_index[boss_uid]
+        adduct_group_list[boss_idx] = group_counter
+        adduct_of_list[boss_idx] = 0
+
+        # Assign members
+        for member_uid in members:
+            member_idx = uid_to_index[member_uid]
+            adduct_group_list[member_idx] = group_counter
+            adduct_of_list[member_idx] = boss_uid

-
+        group_counter += 1

-    #
-
-
+    # Handle ungrouped features
+    for i in range(n_features):
+        if adduct_group_list[i] == 0:
+            adduct_group_list[i] = group_counter
+            adduct_of_list[i] = 0
+            group_counter += 1

-
-
-
-
+    return adduct_group_list, adduct_of_list
+
+
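The `round(delta / mz_tol) * mz_tol` trick used for `mass_shift_map` quantizes mass differences onto a tolerance grid, so each candidate pair resolves with one dict lookup instead of a scan over all catalog deltas. A minimal sketch of the behaviour, including its one caveat: a delta near a bucket edge falls into the neighbouring key, so a borderline shift can miss even though it is within tolerance of a catalog entry (`mz_tol = 0.01` here is illustrative):

```python
mz_tol = 0.01

def grid_key(delta, tol=mz_tol):
    # Quantize a mass difference onto the tolerance grid used by mass_shift_map.
    return round(delta / tol) * tol

# Two deltas closer than mz_tol can share a bucket...
assert grid_key(21.9819) == grid_key(21.9822)
# ...but deltas straddling a bucket edge land in different keys.
assert grid_key(21.9849) != grid_key(21.9851)
```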
+def _fast_correlation(x, y):
+    """
+    Fast correlation coefficient calculation for consensus matrix data.
+
+    In the consensus matrix:
+    - Negative values (typically -1.0) indicate missing features
+    - Zero and positive values are actual intensities
+    - Only consider intensities >= 1000 for meaningful correlation
+
+    Args:
+        x, y: numpy arrays of the same length

-
-
+    Returns:
+        Correlation coefficient (float), 0 if cannot be calculated
+    """
+    import numpy as np

-
-
-
-
-
-
+    # For consensus matrix: exclude negative values (missing features) and very low intensities
+    # Use a very low threshold since processed matrix values are often scaled/normalized
+    valid = ~(np.isnan(x) | np.isnan(y) | (x < 0) | (y < 0) | (x < 0.1) | (y < 0.1))
+
+    if np.sum(valid) < 3:  # Need at least 3 valid pairs
+        return 0.0
+
+    x_valid = x[valid]
+    y_valid = y[valid]
+
+    # If all values are the same (e.g., all zeros), correlation is undefined
+    if np.var(x_valid) == 0 or np.var(y_valid) == 0:
+        return 0.0
+
+    # Fast correlation using numpy
+    try:
+        correlation_matrix = np.corrcoef(x_valid, y_valid)
+        correlation = correlation_matrix[0, 1]
+
+        # Handle NaN result
+        if np.isnan(correlation):
+            return 0.0
+
+        return correlation
+
+    except Exception:
+        return 0.0