masster 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of masster might be problematic; see the registry listing for details.
- masster/_version.py +1 -1
- masster/sample/plot.py +4 -6
- masster/study/defaults/merge_def.py +1 -1
- masster/study/helpers.py +146 -1
- masster/study/id.py +1 -1
- masster/study/merge.py +561 -158
- masster/study/study.py +2 -1
- {masster-0.5.6.dist-info → masster-0.5.8.dist-info}/METADATA +1 -1
- {masster-0.5.6.dist-info → masster-0.5.8.dist-info}/RECORD +12 -12
- {masster-0.5.6.dist-info → masster-0.5.8.dist-info}/WHEEL +0 -0
- {masster-0.5.6.dist-info → masster-0.5.8.dist-info}/entry_points.txt +0 -0
- {masster-0.5.6.dist-info → masster-0.5.8.dist-info}/licenses/LICENSE +0 -0
masster/_version.py
CHANGED
masster/sample/plot.py
CHANGED
@@ -86,8 +86,6 @@ def _process_cmap(cmap, fallback="viridis", logger=None):
         cmap = "viridis"
     elif cmap == "grey":
         cmap = "Greys256"
-    elif cmap == "iridescent":
-        cmap = "iridescent_r"
 
     # If cmap package is not available, fall back to process_cmap
     if Colormap is None:
@@ -455,15 +453,15 @@ def plot_2d(
     show_ms2=False,
     show_in_browser=False,
     title=None,
-    cmap=
+    cmap='iridescent',
     marker="circle",
-    markersize=
+    markersize=5,
    size="static",
     raster_dynamic=True,
     raster_max_px=8,
     raster_threshold=0.8,
     height=600,
-    width=
+    width=750,
     mz_range=None,
     rt_range=None
 ):
@@ -529,7 +527,7 @@ def plot_2d(
         return
 
     # Process colormap using the cmap package
-    cmap_palette = _process_cmap(cmap, fallback="
+    cmap_palette = _process_cmap(cmap, fallback="iridescent", logger=self.logger)
 
     # get columns rt, mz, inty from self.ms1_df, It's polars DataFrame
     spectradf = self.ms1_df.select(["rt", "mz", "inty"])
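Net effect of the hunks above: plot_2d now defaults to the 'iridescent' colormap, markersize 5 and width 750, and _process_cmap falls back to "iridescent" (the previous values are truncated in the published diff and are not recoverable here). A minimal usage sketch under the new defaults; the `sample` object and its loaded data are assumptions, not part of the diff:

    # Hypothetical call; equivalent to sample.plot_2d() with the 0.5.8 defaults.
    sample.plot_2d(cmap="iridescent", markersize=5, width=750, height=600)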
masster/study/helpers.py
CHANGED
@@ -2956,6 +2956,17 @@ def consensus_select(
     rt_delta_mean=None,
     id_top_score=None,
     identified=None,
+    # New adduct filter parameters
+    adduct_top=None,
+    adduct_charge_top=None,
+    adduct_mass_neutral_top=None,
+    adduct_mass_shift_top=None,
+    adduct_group=None,
+    adduct_of=None,
+    # New identification filter parameters
+    id_top_name=None,
+    id_top_class=None,
+    id_top_adduct=None,
     sortby=None,
     descending=True,
 ):
@@ -2990,6 +3001,17 @@ def consensus_select(
             - True: select only rows with id_top_name not null
             - False: select only rows with id_top_name null
             - None: no filtering (default)
+        # New adduct filter parameters
+        adduct_top: adduct type filter (list or single string value, e.g. "[M+H]+", "[M+Na]+")
+        adduct_charge_top: adduct charge filter (tuple for range, single value for exact match)
+        adduct_mass_neutral_top: neutral mass filter (tuple for range, single value for minimum)
+        adduct_mass_shift_top: adduct mass shift filter (tuple for range, single value for minimum)
+        adduct_group: adduct group ID filter (list, single value, or tuple for range)
+        adduct_of: adduct representative UID filter (list, single value, or tuple for range)
+        # New identification filter parameters
+        id_top_name: identification name filter (list or single string value for compound names)
+        id_top_class: identification class filter (list or single string value for compound classes)
+        id_top_adduct: identification adduct filter (list or single string value for identified adducts)
         sortby: column name(s) to sort by (string, list of strings, or None for no sorting)
         descending: sort direction (True for descending, False for ascending, default is True)
 
@@ -3004,7 +3026,10 @@ def consensus_select(
     filter_params = [mz, rt, inty_mean, consensus_uid, consensus_id, number_samples,
                      number_ms2, quality, bl, chrom_coherence_mean, chrom_prominence_mean,
                      chrom_prominence_scaled_mean, chrom_height_scaled_mean,
-                     rt_delta_mean, id_top_score, identified
+                     rt_delta_mean, id_top_score, identified,
+                     # New adduct and identification parameters
+                     adduct_top, adduct_charge_top, adduct_mass_neutral_top, adduct_mass_shift_top,
+                     adduct_group, adduct_of, id_top_name, id_top_class, id_top_adduct]
 
     if all(param is None for param in filter_params) and sortby is None:
         return self.consensus_df.clone()
@@ -3132,6 +3157,126 @@ def consensus_select(
     else:
         warnings.append("'id_top_name' column not found in consensus_df")
 
+    # Handle adduct_top filter (string or list)
+    if adduct_top is not None:
+        if "adduct_top" in available_columns:
+            if isinstance(adduct_top, list):
+                filter_conditions.append(pl.col("adduct_top").is_in(adduct_top))
+            else:
+                filter_conditions.append(pl.col("adduct_top") == adduct_top)
+        else:
+            warnings.append("'adduct_top' column not found in consensus_df")
+
+    # Handle adduct_charge_top filter (single value, range tuple, or list)
+    if adduct_charge_top is not None:
+        if "adduct_charge_top" in available_columns:
+            if isinstance(adduct_charge_top, tuple) and len(adduct_charge_top) == 2:
+                filter_conditions.append(
+                    (pl.col("adduct_charge_top") >= adduct_charge_top[0]) &
+                    (pl.col("adduct_charge_top") <= adduct_charge_top[1])
+                )
+            elif isinstance(adduct_charge_top, list):
+                filter_conditions.append(pl.col("adduct_charge_top").is_in(adduct_charge_top))
+            else:
+                filter_conditions.append(pl.col("adduct_charge_top") == adduct_charge_top)
+        else:
+            warnings.append("'adduct_charge_top' column not found in consensus_df")
+
+    # Handle adduct_mass_neutral_top filter (single value, range tuple, or list)
+    if adduct_mass_neutral_top is not None:
+        if "adduct_mass_neutral_top" in available_columns:
+            if isinstance(adduct_mass_neutral_top, tuple) and len(adduct_mass_neutral_top) == 2:
+                filter_conditions.append(
+                    (pl.col("adduct_mass_neutral_top") >= adduct_mass_neutral_top[0]) &
+                    (pl.col("adduct_mass_neutral_top") <= adduct_mass_neutral_top[1])
+                )
+            elif isinstance(adduct_mass_neutral_top, list):
+                filter_conditions.append(pl.col("adduct_mass_neutral_top").is_in(adduct_mass_neutral_top))
+            else:
+                filter_conditions.append(pl.col("adduct_mass_neutral_top") == adduct_mass_neutral_top)
+        else:
+            warnings.append("'adduct_mass_neutral_top' column not found in consensus_df")
+
+    # Handle adduct_mass_shift_top filter (single value, range tuple, or list)
+    if adduct_mass_shift_top is not None:
+        if "adduct_mass_shift_top" in available_columns:
+            if isinstance(adduct_mass_shift_top, tuple) and len(adduct_mass_shift_top) == 2:
+                filter_conditions.append(
+                    (pl.col("adduct_mass_shift_top") >= adduct_mass_shift_top[0]) &
+                    (pl.col("adduct_mass_shift_top") <= adduct_mass_shift_top[1])
+                )
+            elif isinstance(adduct_mass_shift_top, list):
+                filter_conditions.append(pl.col("adduct_mass_shift_top").is_in(adduct_mass_shift_top))
+            else:
+                filter_conditions.append(pl.col("adduct_mass_shift_top") == adduct_mass_shift_top)
+        else:
+            warnings.append("'adduct_mass_shift_top' column not found in consensus_df")
+
+    # Handle adduct_group filter (single value or list)
+    if adduct_group is not None:
+        if "adduct_group" in available_columns:
+            if isinstance(adduct_group, list):
+                filter_conditions.append(pl.col("adduct_group").is_in(adduct_group))
+            else:
+                filter_conditions.append(pl.col("adduct_group") == adduct_group)
+        else:
+            warnings.append("'adduct_group' column not found in consensus_df")
+
+    # Handle adduct_of filter (single value or list)
+    if adduct_of is not None:
+        if "adduct_of" in available_columns:
+            if isinstance(adduct_of, list):
+                filter_conditions.append(pl.col("adduct_of").is_in(adduct_of))
+            else:
+                filter_conditions.append(pl.col("adduct_of") == adduct_of)
+        else:
+            warnings.append("'adduct_of' column not found in consensus_df")
+
+    # Handle id_top_name filter (string or list)
+    if id_top_name is not None:
+        if "id_top_name" in available_columns:
+            if isinstance(id_top_name, list):
+                filter_conditions.append(pl.col("id_top_name").is_in(id_top_name))
+            else:
+                filter_conditions.append(pl.col("id_top_name") == id_top_name)
+        else:
+            warnings.append("'id_top_name' column not found in consensus_df")
+
+    # Handle id_top_class filter (string or list)
+    if id_top_class is not None:
+        if "id_top_class" in available_columns:
+            if isinstance(id_top_class, list):
+                filter_conditions.append(pl.col("id_top_class").is_in(id_top_class))
+            else:
+                filter_conditions.append(pl.col("id_top_class") == id_top_class)
+        else:
+            warnings.append("'id_top_class' column not found in consensus_df")
+
+    # Handle id_top_adduct filter (string or list)
+    if id_top_adduct is not None:
+        if "id_top_adduct" in available_columns:
+            if isinstance(id_top_adduct, list):
+                filter_conditions.append(pl.col("id_top_adduct").is_in(id_top_adduct))
+            else:
+                filter_conditions.append(pl.col("id_top_adduct") == id_top_adduct)
+        else:
+            warnings.append("'id_top_adduct' column not found in consensus_df")
+
+    # Handle id_top_score filter (single value, range tuple, or list)
+    if id_top_score is not None:
+        if "id_top_score" in available_columns:
+            if isinstance(id_top_score, tuple) and len(id_top_score) == 2:
+                filter_conditions.append(
+                    (pl.col("id_top_score") >= id_top_score[0]) &
+                    (pl.col("id_top_score") <= id_top_score[1])
+                )
+            elif isinstance(id_top_score, list):
+                filter_conditions.append(pl.col("id_top_score").is_in(id_top_score))
+            else:
+                filter_conditions.append(pl.col("id_top_score") == id_top_score)
+        else:
+            warnings.append("'id_top_score' column not found in consensus_df")
+
     # Log warnings once
     for warning in warnings:
         self.logger.warning(warning)
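A minimal usage sketch of the filters added above, following the semantics the docstring documents (list -> membership via is_in, 2-tuple -> inclusive range, scalar -> exact match). The column values are hypothetical examples, not taken from the package:

    # Hypothetical example; `study` is a Study whose consensus_df has the new columns.
    hits = study.consensus_select(
        adduct_top=["[M+H]+", "[M+Na]+"],  # list -> is_in membership filter
        adduct_charge_top=(1, 2),          # 2-tuple -> inclusive range filter
        id_top_name="caffeine",            # scalar -> exact match (hypothetical name)
        sortby="inty_mean",
        descending=True,
    )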
masster/study/id.py
CHANGED
@@ -1201,7 +1201,7 @@ def lib_reset(study):
     logger.info("Library and identification data reset completed")
 
 
-def _get_adducts(study, adducts_list: list = None, **kwargs):
+def _get_adducts(study, adducts_list: list | None = None, **kwargs):
     """
     Generate comprehensive adduct specifications for study-level adduct filtering.
 
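This is the standard fix for an optional argument: `adducts_list: list = None` declares a default that violates its own annotation, while PEP 604's `list | None` admits both. A minimal sketch of the pattern (illustration only, not masster code):

    def pick(adducts_list: list | None = None) -> list:
        # Treat a missing list as empty instead of reusing a mutable default.
        return adducts_list or []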
masster/study/merge.py
CHANGED
@@ -441,8 +441,7 @@ def merge(study, **kwargs) -> None:
     cached_adducts_df = None
     cached_valid_adducts = None
     try:
-
-        cached_adducts_df = _get_adducts(study)
+        cached_adducts_df = study._get_adducts()
         if not cached_adducts_df.is_empty():
             cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
@@ -2258,6 +2257,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
             {
                 "consensus_uid": row["consensus_uid"],
                 "rt": row["rt"],
+                "mz": row["mz"],  # Add missing mz field
                 "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                 "adduct_top": row.get("adduct_top"),
                 "inty_mean": row.get("inty_mean", 0),
@@ -2265,8 +2265,9 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
         )
 
     # Use optimized adduct grouping
+    study.logger.info(f"About to call adduct grouping for {len(consensus_data)} consensus features")
     adduct_group_list, adduct_of_list = __merge_adduct_grouping(
-        study, consensus_data, rt_tol, mz_tol
+        study, consensus_data, rt_tol/3, mz_tol
     )
 
     # Add the new columns to consensus_df
@@ -2713,8 +2714,6 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
         cached_adducts_df: Pre-computed adducts DataFrame for performance
     """
     import polars as pl
-    import numpy as np
-    from collections import defaultdict
 
     # Check if consensus_df exists and has features
     if len(study.consensus_df) == 0:
@@ -2727,8 +2726,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
     if cached_adducts_df is None or cached_adducts_df.is_empty():
         try:
             # Use lower min_probability for better adduct coverage in mass shift identification
-
-            cached_adducts_df = _get_adducts(study, min_probability=0.01)
+            cached_adducts_df = study._get_adducts(min_probability=0.01)
         except Exception as e:
             study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
             return
@@ -2822,9 +2820,8 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
         mz1 = feature1["mz"]
         adduct1 = feature1["adduct_top"]
 
-        #
-        #
-        # continue
+        # Conservative approach: Don't skip features here - let algorithm find pairs first
+        # We'll check for inappropriate assignments later in the pair processing logic
 
         # Search for coeluting features within strict RT tolerance
         for j in range(i + 1, n_features):
@@ -2838,9 +2835,7 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
             mz2 = feature2["mz"]
             adduct2 = feature2["adduct_top"]
 
-            #
-            # if adduct2 and "?" not in adduct2:
-            #     continue
+            # Conservative approach: Don't skip feature2 here either - process all potential pairs
 
             # Calculate observed m/z difference
             mz_diff = mz2 - mz1
@@ -2890,24 +2885,45 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
                 else:
                     # Assignment based on mass shift direction
                     # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
-                    # If catalog_shift > 0: from_adduct has higher
-                    # If catalog_shift < 0: from_adduct has lower
-                    # observed mz_diff = mz2 - mz1
-                    #
-                    #
+                    # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
+                    # If catalog_shift < 0: from_adduct has lower mass shift than to_adduct
+                    # observed mz_diff = mz2 - mz1 (always positive for mz2 > mz1)
+                    #
+                    # CRITICAL FIX: Correct assignment logic
+                    # When mz_diff matches positive catalog_shift:
+                    # - from_adduct is the heavier adduct (higher mass shift)
+                    # - to_adduct is the lighter adduct (lower mass shift)
+                    # - Higher m/z feature should get the heavier adduct (from_adduct)
+                    # - Lower m/z feature should get the lighter adduct (to_adduct)
 
                     if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
                         # mz_diff matches catalog_shift direction
-
-
-
-
+                        if catalog_shift > 0:
+                            # from_adduct is heavier, to_adduct is lighter
+                            from_feature = feature2  # Higher m/z gets heavier adduct
+                            to_feature = feature1  # Lower m/z gets lighter adduct
+                            from_adduct_name = best_rel["from_adduct"]  # Heavier adduct
+                            to_adduct_name = best_rel["to_adduct"]  # Lighter adduct
+                        else:
+                            # from_adduct is lighter, to_adduct is heavier
+                            from_feature = feature1  # Lower m/z gets lighter adduct
+                            to_feature = feature2  # Higher m/z gets heavier adduct
+                            from_adduct_name = best_rel["from_adduct"]  # Lighter adduct
+                            to_adduct_name = best_rel["to_adduct"]  # Heavier adduct
                     else:
                         # mz_diff matches reverse direction of catalog_shift
-
-
-
-
+                        if catalog_shift > 0:
+                            # Reverse: from_adduct becomes lighter, to_adduct becomes heavier
+                            from_feature = feature1  # Lower m/z gets lighter adduct
+                            to_feature = feature2  # Higher m/z gets heavier adduct
+                            from_adduct_name = best_rel["to_adduct"]  # Now lighter adduct
+                            to_adduct_name = best_rel["from_adduct"]  # Now heavier adduct
+                        else:
+                            # Reverse: from_adduct becomes heavier, to_adduct becomes lighter
+                            from_feature = feature2  # Higher m/z gets heavier adduct
+                            to_feature = feature1  # Lower m/z gets lighter adduct
+                            from_adduct_name = best_rel["to_adduct"]  # Now heavier adduct
+                            to_adduct_name = best_rel["from_adduct"]  # Now lighter adduct
 
                     # Get adduct details from catalog
                     from_adduct_info = adduct_info.get(from_adduct_name, {})
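A worked example of the direction check in the hunk above, using standard [M+H]+/[M+Na]+ mass shifts; the feature m/z values are hypothetical:

    # Worked numbers for the comment block above; mirrors the hunk's variable names.
    ms_from, ms_to = 22.98922, 1.00728     # [M+Na]+ / [M+H]+ mass shifts (Da), charge 1
    catalog_shift = (ms_from - ms_to) / 1  # +21.98194 -> from_adduct is the heavier one
    mz1, mz2 = 180.102, 202.084            # coeluting features, mz2 > mz1 (hypothetical)
    mz_diff = mz2 - mz1                    # +21.982, matches +catalog_shift
    # abs(mz_diff - catalog_shift) <= abs(mz_diff + catalog_shift) and catalog_shift > 0,
    # so feature2 (higher m/z) receives [M+Na]+ and feature1 (lower m/z) receives [M+H]+.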
@@ -2922,7 +2938,40 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
                     from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
                     to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
 
-                    #
+                    # Smart conservative check: prevent inappropriate assignments to isolated features
+                    # Check if both features are isolated (single-member groups) with [M+?]1+ assignments
+                    def is_isolated_unknown_feature(feature):
+                        """Check if a feature is isolated with unknown adduct"""
+                        if not feature["adduct_top"] or "[M+?]" not in feature["adduct_top"]:
+                            return False  # Not unknown, safe to process
+
+                        # Check group size
+                        try:
+                            feature_row = study.consensus_df.filter(study.consensus_df["consensus_uid"] == feature["consensus_uid"])
+                            if len(feature_row) > 0:
+                                adduct_group = feature_row["adduct_group"].iloc[0]
+                                if adduct_group > 0:
+                                    group_members = study.consensus_df.filter(study.consensus_df["adduct_group"] == adduct_group)
+                                    return len(group_members) <= 1  # Isolated if group size <= 1
+                        except Exception:
+                            pass
+                        return True  # Default to isolated if can't determine
+
+                    from_isolated = is_isolated_unknown_feature(from_feature)
+                    to_isolated = is_isolated_unknown_feature(to_feature)
+
+                    # Only skip assignment if BOTH features are isolated AND would get the SAME adduct
+                    # (This prevents inappropriate duplicate assignments to isolated features)
+                    skip_assignment = (from_isolated and to_isolated and from_adduct_name == to_adduct_name)
+
+                    if skip_assignment:
+                        study.logger.debug(
+                            f"Skipping inappropriate assignment: both isolated features would get {from_adduct_name} "
+                            f"(UIDs {from_feature['consensus_uid']}, {to_feature['consensus_uid']})"
+                        )
+                        continue  # Skip this pair, continue to next relationship
+
+                    # Store updates (legitimate pair or at least one feature already has specific adduct)
                     adduct_updates[from_feature["consensus_uid"]] = {
                         "adduct_top": from_adduct_name,
                         "adduct_charge_top": from_charge,
@@ -3083,164 +3132,518 @@ def __merge_feature_lookup(study_obj, features_df):
     return features_lookup
 
 
+def _get_features_matrix(study, consensus_data, quant_col="inty"):
+    """
+    Create a local intensity matrix from features_df for correlation calculations.
+
+    Args:
+        study: Study object with features_df and samples_df
+        consensus_data: List of consensus feature dictionaries
+        quant_col: Column name to use for quantification (default: "inty")
+
+    Returns:
+        pandas.DataFrame: Matrix with consensus_uid as index, sample names as columns
+    """
+    import pandas as pd
+    import numpy as np
+
+    # Get all sample names
+    sample_names = study.samples_df["sample_name"].to_list()
+    consensus_uids = [int(f["consensus_uid"]) for f in consensus_data]
+
+    # Initialize matrix with zeros
+    matrix_data = pd.DataFrame(
+        index=pd.Index(consensus_uids, name="consensus_uid"),
+        columns=sample_names,
+        data=0.0,
+        dtype=float
+    )
+
+    study.logger.debug(f"Building local features matrix: {len(consensus_uids)} features x {len(sample_names)} samples")
+
+    # Fill matrix with actual intensity values
+    features_df_pandas = study.features_df.to_pandas()
+    samples_df_pandas = study.samples_df.to_pandas()
+    consensus_mapping_pandas = study.consensus_mapping_df.to_pandas()
+
+    # Create sample_uid to sample_name mapping
+    uid_to_name = dict(zip(samples_df_pandas["sample_uid"], samples_df_pandas["sample_name"]))
+
+    # For each consensus feature, get intensities from all samples
+    for consensus_uid in consensus_uids:
+        # Get all feature_uids that map to this consensus_uid
+        feature_mappings = consensus_mapping_pandas[
+            consensus_mapping_pandas["consensus_uid"] == consensus_uid
+        ]
+
+        for _, mapping in feature_mappings.iterrows():
+            feature_uid = mapping["feature_uid"]
+            sample_uid = mapping["sample_uid"]
+            sample_name = uid_to_name.get(sample_uid, f"sample_{sample_uid}")
+
+            # Get intensity for this feature
+            feature_row = features_df_pandas[
+                (features_df_pandas["feature_uid"] == feature_uid) &
+                (features_df_pandas["sample_uid"] == sample_uid)
+            ]
+
+            if len(feature_row) > 0:
+                intensity = feature_row[quant_col].iloc[0]
+                if pd.notna(intensity):
+                    matrix_data.loc[consensus_uid, sample_name] = float(intensity)
+
+    # Convert any remaining NaN to 0
+    matrix_data = matrix_data.fillna(0.0)
+
+    study.logger.debug(f"Local matrix built successfully with shape {matrix_data.shape}")
+
+    return matrix_data
+
+
+def _get_adduct_deltas_with_likelihood(study):
+    """
+    Extract all pairwise mass differences between adducts with joint likelihood scoring.
+
+    Args:
+        study: Study object with _get_adducts method
+
+    Returns:
+        List of tuples: (mass_delta, joint_likelihood, adduct1_name, adduct2_name)
+        Sorted by joint_likelihood descending (most likely pairs first)
+    """
+    try:
+        adducts_df = study._get_adducts()
+
+        if adducts_df is None or adducts_df.is_empty():
+            study.logger.warning("No adducts dataframe available for study")
+            return []
+
+        # Convert to pandas for easier manipulation
+        adducts_pd = adducts_df.to_pandas()
+
+        # Check if we have likelihood/probability information
+        likelihood_col = None
+        for col in ['likelihood', 'probability', 'freq', 'frequency', 'score']:
+            if col in adducts_pd.columns:
+                likelihood_col = col
+                break
+
+        # If no likelihood column, estimate based on adduct type
+        if likelihood_col is None:
+            adducts_pd['estimated_likelihood'] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
+            likelihood_col = 'estimated_likelihood'
+
+        # Get mass column (try different possible column names)
+        mass_col = None
+        for col_name in ['mass_shift', 'mass', 'mass_shift_da', 'mass_da']:
+            if col_name in adducts_pd.columns:
+                mass_col = col_name
+                break
+
+        if mass_col is None:
+            study.logger.warning(f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}")
+            return []
+
+        # Calculate all pairwise differences with joint likelihoods
+        adduct_pairs = []
+        for i in range(len(adducts_pd)):
+            for j in range(i + 1, len(adducts_pd)):
+                row_i = adducts_pd.iloc[i]
+                row_j = adducts_pd.iloc[j]
+
+                # Skip if masses are NaN or invalid
+                if (hasattr(row_i[mass_col], '__iter__') and not isinstance(row_i[mass_col], str)) or \
+                   (hasattr(row_j[mass_col], '__iter__') and not isinstance(row_j[mass_col], str)):
+                    continue
+
+                mass_i = float(row_i[mass_col])
+                mass_j = float(row_j[mass_col])
+                delta = abs(mass_i - mass_j)
+
+                if delta > 0.1:  # Only meaningful mass differences
+                    # Joint likelihood is sum of individual likelihoods
+                    joint_likelihood = float(row_i[likelihood_col]) + float(row_j[likelihood_col])
+
+                    adduct1_name = row_i.get('adduct', row_i.get('name', f'adduct_{i}'))
+                    adduct2_name = row_j.get('adduct', row_j.get('name', f'adduct_{j}'))
+
+                    # CRITICAL FIX: Order adducts consistently from lower mass to higher mass
+                    # This ensures consistent assignment: lower mass adduct = from_adduct, higher mass adduct = to_adduct
+                    if mass_i <= mass_j:
+                        # row_i has lower or equal mass shift -> from_adduct
+                        # row_j has higher mass shift -> to_adduct
+                        adduct_pairs.append((round(delta, 4), joint_likelihood, adduct1_name, adduct2_name))
+                    else:
+                        # row_j has lower mass shift -> from_adduct
+                        # row_i has higher mass shift -> to_adduct
+                        adduct_pairs.append((round(delta, 4), joint_likelihood, adduct2_name, adduct1_name))
+
+        # Sort by joint likelihood descending (most likely pairs first)
+        adduct_pairs.sort(key=lambda x: x[1], reverse=True)
+
+        study.logger.debug(f"Extracted {len(adduct_pairs)} adduct pairs with likelihood scoring")
+        return adduct_pairs
+
+    except Exception as e:
+        study.logger.warning(f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list.")
+        return []
+
+
+def _estimate_adduct_likelihood(adduct_row):
+    """
+    Estimate likelihood of an adduct based on common knowledge.
+
+    Args:
+        adduct_row: pandas Series with adduct information
+
+    Returns:
+        float: Estimated likelihood (0.0 to 1.0)
+    """
+    adduct_name = str(adduct_row.get('adduct', adduct_row.get('name', ''))).lower()
+
+    # Common likelihood estimates based on adduct frequency in positive mode
+    likelihood_map = {
+        '[m+h]': 0.9,      # Most common
+        '[m+na]': 0.7,     # Very common
+        '[m+nh4]': 0.6,    # Common
+        '[m+k]': 0.3,      # Less common
+        '[m+2h]': 0.2,     # Doubly charged, less frequent
+        '[m+3h]': 0.1,     # Triply charged, rare
+        '[m+h-h2o]': 0.4,  # Loss adducts, moderately common
+    }
+
+    # Find best match
+    for pattern, likelihood in likelihood_map.items():
+        if pattern in adduct_name:
+            return likelihood
+
+    # Default for unknown adducts
+    return 0.2
+
+
+def _get_adduct_deltas(study):
+    """
+    Extract all pairwise mass differences between adducts from study adducts data.
+
+    Args:
+        study: Study object with _get_adducts method
+
+    Returns:
+        List of mass differences (deltas) for adduct filtering
+    """
+    # Use the enhanced function and extract just the deltas for backward compatibility
+    adduct_pairs = _get_adduct_deltas_with_likelihood(study)
+    return [pair[0] for pair in adduct_pairs]  # Extract just the mass deltas
+
+
+def _fast_correlation(vec1, vec2):
+    """
+    Fast Pearson correlation coefficient calculation.
+    Optimized for repeated use in tight loops.
+    """
+    if len(vec1) != len(vec2):
+        return 0.0
+
+    # Remove NaN values and corresponding positions
+    mask = ~(np.isnan(vec1) | np.isnan(vec2))
+    if np.sum(mask) < 2:  # Need at least 2 valid points
+        return 0.0
+
+    v1 = vec1[mask]
+    v2 = vec2[mask]
+
+    # Fast correlation using numpy built-in
+    try:
+        corr_matrix = np.corrcoef(v1, v2)
+        return corr_matrix[0, 1] if not np.isnan(corr_matrix[0, 1]) else 0.0
+    except Exception:
+        return 0.0
+
+
 def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
     """
-
+    Groups consensus features that represent the same molecule with different adducts.
+    Uses multi-step filtering:
+    1. Build local intensity matrix once
+    2. RT coelution filtering with spatial indexing
+    3. Mass shift validation with hash lookup
+    4. Hierarchical boss structure (prevent transitivity)
+    5. Correlation-based confirmation
+    6. Intensity-based ranking for final selection
 
     Args:
-        study: Study object
+        study: Study object
         consensus_data: List of consensus feature dictionaries
-        rt_tol:
-        mz_tol:
+        rt_tol: Retention time tolerance (seconds)
+        mz_tol: M/z tolerance (Da)
 
     Returns:
         Tuple of (adduct_group_list, adduct_of_list)
     """
+
     if not consensus_data:
         return [], []
-
+
     n_features = len(consensus_data)
-
-        study.logger.info(f"Adduct grouping for {n_features} consensus features...")
-    else:
-        study.logger.debug(f"Adduct grouping for {n_features} consensus features...")
+    study.logger.info(f"Starting adduct grouping for {n_features} features")
 
-    #
-
-
-
-
-
-
+    # Step 1: Build local intensity matrix ONCE
+    try:
+        intensity_matrix_pd = _get_features_matrix(study, consensus_data, quant_col="inty")
+
+        if intensity_matrix_pd is None or len(intensity_matrix_pd) == 0:
+            study.logger.warning("Could not build local intensity matrix - creating single-feature groups")
+            adduct_group_list = list(range(1, len(consensus_data) + 1))
+            adduct_of_list = [0] * len(consensus_data)
+            return adduct_group_list, adduct_of_list
+
+        study.logger.info(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
+
+    except Exception as e:
+        study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
+        adduct_group_list = list(range(1, len(consensus_data) + 1))
+        adduct_of_list = [0] * len(consensus_data)
+        return adduct_group_list, adduct_of_list
+
+    # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
+    adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
+    study.logger.info(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
+
+    # Build hash map for O(1) mass shift lookup
+    mass_shift_map = {}  # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
+    for mass_delta, joint_likelihood, adduct1, adduct2 in adduct_pairs_with_likelihood:
+        key = round(mass_delta / mz_tol) * mz_tol  # Round to tolerance grid
+        if key not in mass_shift_map:
+            mass_shift_map[key] = []
+        mass_shift_map[key].append((joint_likelihood, adduct1, adduct2))
+
+    # Sort each mass shift group by likelihood (highest first)
+    for key in mass_shift_map:
+        mass_shift_map[key].sort(key=lambda x: x[0], reverse=True)
+
+    # Step 3: Pre-compute feature properties and sort by RT for spatial filtering
+    feature_props = []
+    for i, feature in enumerate(consensus_data):
+        uid = feature["consensus_uid"]
         rt = feature["rt"]
-
+        mz = feature["mz"]
         intensity = feature.get("inty_mean", 0)
-        adduct = feature.get("adduct_top", "")
 
-
-
-
-
+        # Get matrix vector once
+        matrix_vector = intensity_matrix_pd.loc[uid].values if uid in intensity_matrix_pd.index else None
+
+        feature_props.append({
+            'index': i,
+            'uid': uid,
+            'rt': rt,
+            'mz': mz,
+            'intensity': intensity,
+            'vector': matrix_vector,
+            'feature': feature
+        })
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Sort by RT for efficient spatial filtering
+    feature_props.sort(key=lambda x: x['rt'])
+
+    # Initialize grouping structures
+    uid_to_boss = {}  # Hierarchical structure: uid -> boss_uid
+    boss_to_members = {}  # boss_uid -> [member_uids]
+    processed_uids = set()
+
+    # Step 4: Process features with optimized RT filtering
+    for i, boss_prop in enumerate(feature_props):
+        boss_uid = boss_prop['uid']
+
+        if boss_uid in processed_uids:
+            continue
+
+        if boss_prop['vector'] is None:
+            processed_uids.add(boss_uid)
+            continue
+
+        # Initialize as boss
+        if boss_uid not in uid_to_boss:
+            uid_to_boss[boss_uid] = boss_uid
+            boss_to_members[boss_uid] = []
+
+        boss_rt = boss_prop['rt']
+        boss_mz = boss_prop['mz']
+        boss_vector = boss_prop['vector']
+
+        # Step 5: Efficient RT coelution filtering using sorted array
+        candidate_pairs = []
+
+        # Search backwards from current position
+        j = i - 1
+        while j >= 0 and (boss_rt - feature_props[j]['rt']) <= rt_tol:
+            candidate = feature_props[j]
+            if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
+                if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
+                    # Calculate mz difference and check mass shift
+                    mz_diff = abs(boss_mz - candidate['mz'])
+                    mass_shift_key = round(mz_diff / mz_tol) * mz_tol
+
+                    if mass_shift_key in mass_shift_map:
+                        likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0]  # Best likelihood
+                        candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
+            j -= 1
+
+        # Search forwards from current position
+        j = i + 1
+        while j < len(feature_props) and (feature_props[j]['rt'] - boss_rt) <= rt_tol:
+            candidate = feature_props[j]
+            if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
+                if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
+                    # Calculate mz difference and check mass shift
+                    mz_diff = abs(boss_mz - candidate['mz'])
+                    mass_shift_key = round(mz_diff / mz_tol) * mz_tol
+
+                    if mass_shift_key in mass_shift_map:
+                        likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0]  # Best likelihood
+                        candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
+            j += 1
+
+        # Sort candidates by likelihood (descending) to prioritize chemically meaningful pairs
+        candidate_pairs.sort(key=lambda x: x[1], reverse=True)
+
+        # Step 6: Process candidates in likelihood priority order
+        for candidate_prop, likelihood, adduct_info in candidate_pairs:
+            candidate_uid = candidate_prop['uid']
+            candidate_vector = candidate_prop['vector']
+
+            # Correlation confirmation with optimized threshold
+            try:
+                correlation = _fast_correlation(boss_vector, candidate_vector)
 
-
-                if pair in checked_pairs:
+                if correlation < 0.5:  # More permissive for legitimate adduct relationships
                     continue
-                checked_pairs.add(pair)
 
-
-
+            except Exception:
+                continue
+
+            # Step 7: Hierarchical assignment (merge groups if needed)
+            if candidate_uid in boss_to_members:
+                old_members = boss_to_members[candidate_uid].copy()
+                del boss_to_members[candidate_uid]
 
-
-
-
+                # Reassign old members to new boss
+                for member in old_members:
+                    uid_to_boss[member] = boss_uid
+                    boss_to_members[boss_uid].append(member)
+
+            # Assign candidate to current boss
+            uid_to_boss[candidate_uid] = boss_uid
+            boss_to_members[boss_uid].append(candidate_uid)
+            processed_uids.add(candidate_uid)
+
+        processed_uids.add(boss_uid)
+
+    # Step 8: Intensity-based ranking within groups (optimized)
+    for boss_uid in list(boss_to_members.keys()):
+        members = boss_to_members[boss_uid]
+        if len(members) == 0:
+            continue
+
+        all_group_members = [boss_uid] + members
+
+        # Find member with highest intensity efficiently
+        max_intensity = -1
+        new_boss = boss_uid
+
+        for member_uid in all_group_members:
+            # Find member_uid in feature_props
+            member_intensity = next((fp['intensity'] for fp in feature_props if fp['uid'] == member_uid), 0)
+            if member_intensity > max_intensity:
+                max_intensity = member_intensity
+                new_boss = member_uid
+
+        # Update boss if needed
+        if new_boss != boss_uid:
+            boss_to_members[new_boss] = [m for m in all_group_members if m != new_boss]
+            del boss_to_members[boss_uid]
+
+            # Update all member references
+            for member in all_group_members:
+                uid_to_boss[member] = new_boss
 
-    #
-
-
-
-        groups_by_root[root].append(valid_features[i])
+    # Count and log results
+    total_groups = len(boss_to_members)
+    multi_member_groups = sum(1 for members in boss_to_members.values() if len(members) > 0)
+    total_grouped_features = sum(len(members) + 1 for members in boss_to_members.values())
 
-    groups
-    group_id = 1
-    assigned_groups = {}
+    study.logger.info(f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)")
 
-
-
-
-
-        assigned_groups[uid] = group_id
-        groups[group_id] = member_uids
-        group_id += 1
+    # Step 9: Convert to return format (optimized)
+    uid_to_index = {fp['uid']: fp['index'] for fp in feature_props}
+    adduct_group_list = [0] * n_features
+    adduct_of_list = [0] * n_features
 
-
-    for
-
-
-
-
-
-
-
-
-
-        best_priority = -1
-        best_intensity = 0
-
-        for uid in member_uids:
-            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
-            if not feature_data:
-                continue
-
-            adduct = feature_data.get("adduct_top", "")
-            intensity = feature_data.get("inty_mean", 0)
-
-            priority = 0
-            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
-                priority = 3
-            elif adduct and "[M-H]" in adduct:
-                priority = 2
-            elif adduct and "M" in adduct:
-                priority = 1
-
-            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
-                best_uid = uid
-                best_priority = priority
-                best_intensity = intensity
+    group_counter = 1
+    for boss_uid, members in boss_to_members.items():
+        # Assign boss
+        boss_idx = uid_to_index[boss_uid]
+        adduct_group_list[boss_idx] = group_counter
+        adduct_of_list[boss_idx] = 0
+
+        # Assign members
+        for member_uid in members:
+            member_idx = uid_to_index[member_uid]
+            adduct_group_list[member_idx] = group_counter
+            adduct_of_list[member_idx] = boss_uid
 
-
+        group_counter += 1
 
-    #
-
-
+    # Handle ungrouped features
+    for i in range(n_features):
+        if adduct_group_list[i] == 0:
+            adduct_group_list[i] = group_counter
+            adduct_of_list[i] = 0
+            group_counter += 1
 
-
-
-
-
+    return adduct_group_list, adduct_of_list
+
+
+def _fast_correlation(x, y):
+    """
+    Fast correlation coefficient calculation for consensus matrix data.
+
+    In the consensus matrix:
+    - Negative values (typically -1.0) indicate missing features
+    - Zero and positive values are actual intensities
+    - Only consider intensities >= 1000 for meaningful correlation
+
+    Args:
+        x, y: numpy arrays of the same length
 
-
-
+    Returns:
+        Correlation coefficient (float), 0 if cannot be calculated
+    """
+    import numpy as np
 
-
-
-
-
-
-
+    # For consensus matrix: exclude negative values (missing features) and very low intensities
+    # Use a very low threshold since processed matrix values are often scaled/normalized
+    valid = ~(np.isnan(x) | np.isnan(y) | (x < 0) | (y < 0) | (x < 0.1) | (y < 0.1))
+
+    if np.sum(valid) < 3:  # Need at least 3 valid pairs
+        return 0.0
+
+    x_valid = x[valid]
+    y_valid = y[valid]
+
+    # If all values are the same (e.g., all zeros), correlation is undefined
+    if np.var(x_valid) == 0 or np.var(y_valid) == 0:
+        return 0.0
+
+    # Fast correlation using numpy
+    try:
+        correlation_matrix = np.corrcoef(x_valid, y_valid)
+        correlation = correlation_matrix[0, 1]
+
+        # Handle NaN result
+        if np.isnan(correlation):
+            return 0.0
+
+        return correlation
+
+    except Exception:
+        return 0.0
masster/study/study.py
CHANGED
@@ -108,7 +108,7 @@ from masster.study.parameters import get_parameters_property
 from masster.study.parameters import set_parameters_property
 from masster.study.save import save, save_consensus, save_samples
 from masster.study.export import export_mgf, export_mztab, export_xlsx, export_parquet
-from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset
+from masster.study.id import lib_load, identify, get_id, id_reset, lib_reset, _get_adducts
 
 from masster.logger import MassterLogger
 from masster.study.defaults.study_def import study_defaults
@@ -380,6 +380,7 @@ class Study:
     get_orphans = get_orphans
     get_sample_stats = get_sample_stats
     get_consensus_stats = get_consensus_stats
+    _get_adducts = _get_adducts
 
     # === Data Selection and Filtering ===
     samples_select = samples_select
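Assigning the imported _get_adducts as a class attribute is what makes the study._get_adducts(...) calls in the merge.py hunks above work: a function stored on a class becomes a bound method, and the instance is passed as its first parameter (named study here instead of self). A minimal sketch of the mechanism, using toy classes rather than masster code:

    def _get_adducts(study, adducts_list=None, **kwargs):
        return f"adducts for {study.name}"

    class Study:
        name = "demo"
        _get_adducts = _get_adducts  # instance is passed as `study`

    print(Study()._get_adducts())    # -> "adducts for demo"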
{masster-0.5.6.dist-info → masster-0.5.8.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 masster/__init__.py,sha256=ueZ224WPNRRjQEYTaQUol818nwQgJwB93HbEfmtPRmg,1041
-masster/_version.py,sha256=
+masster/_version.py,sha256=BXXXNsuN4ipe6lTSVTBWB-FcZgzIiyQ_OToEKfd6hos,256
 masster/chromatogram.py,sha256=iYpdv8C17zVnlWvOFgAn9ns2uFGiF-GgoYf5QVVAbHs,19319
 masster/logger.py,sha256=tR65N23zfrNpcZNbZm2ot_Aual9XrGB1MWjLrovZkMs,16749
 masster/spectrum.py,sha256=TWIgDcl0lveG40cLVZTWGp8-FxMolu-P8EjZyRBtXL4,49850
@@ -25,7 +25,7 @@ masster/sample/helpers.py,sha256=27eZFFidr02-DlSi4-eF4bpSk_y-qU3eoFCAOshRO20,421
 masster/sample/lib.py,sha256=E-j9c3Wd8f9a-H8xj7CAOwlA8KcyXPoFyYm3c8r7LtI,33755
 masster/sample/load.py,sha256=swjRBCoFGni9iPztHIKPVB5ru_xDMVryB_inPXdujTw,51819
 masster/sample/parameters.py,sha256=Gg2KcuNbV_wZ_Wwv93QlM5J19ji0oSIvZLPV1NoBmq0,4456
-masster/sample/plot.py,sha256
+masster/sample/plot.py,sha256=-rHqdi6q7jqjS8ENpTlxjwJBMZAwo-6OsNmE_d1JVQk,86617
 masster/sample/processing.py,sha256=CjaLCElDKECeCvYWqzT5EH_-rPQ0Y4A30zKjZfqmS5s,55915
 masster/sample/quant.py,sha256=tHNjvUFTdehKR31BXBZnVsBxMD9XJHgaltITOjr71uE,7562
 masster/sample/sample.py,sha256=VhQik_ev1liRqGUtbZvV1NOjfFzgfZI1orfQT87gai4,20643
@@ -42,15 +42,15 @@ masster/study/__init__.py,sha256=55axdFuqRX4aXtJ8ocnhcLB32fNtmmJpCi58moO0r4g,237
 masster/study/analysis.py,sha256=L-wXBnGZCLB5UUDrjIdOiMG9zdej3Tw_SftcEmmTukM,84264
 masster/study/export.py,sha256=joFK9jip2UM4lVAvhkdKVeUdNdM4D8uP2WE49IaVJgw,60172
 masster/study/h5.py,sha256=84plxM7gYFdn_mNbcg8XxE_NRZmiIBqs_XhfHMiXshk,95364
-masster/study/helpers.py,sha256=
-masster/study/id.py,sha256=
+masster/study/helpers.py,sha256=dOj7rJlVx7uKCRt1iMOsZHuz4b9Kch5d68biUyIK1mE,190834
+masster/study/id.py,sha256=Tiw_i2jDxUWaPnzd5PzgSnLSRDDDJkwLYbjzA0XcBwQ,80082
 masster/study/load.py,sha256=7d11294YYEGrSKox3cwvetv2vqcstYT1SnyAhHH5V_Q,107706
-masster/study/merge.py,sha256
+masster/study/merge.py,sha256=-SNlroqQVuOyzsJimvgf9c6T9V3yt-mx_2lW3L2kE-g,169501
 masster/study/parameters.py,sha256=bTvmcwX9INxzcrEAmTiFH8qeWVhwkvMTZjuP394pz5o,3279
 masster/study/plot.py,sha256=LEIzoYiUyq1aswh-sw8S-ESvN2DaQKN5l22yLW8gZe8,107647
 masster/study/processing.py,sha256=n5208v-JQGq3bBP-ncgl2__hHWSQQYHx2fl4Mm0THdI,58538
 masster/study/save.py,sha256=47AP518epJJ9TjaGGyrLKsMsyjIk8_J4ka7bmsnRtFQ,9268
-masster/study/study.py,sha256=
+masster/study/study.py,sha256=gudugPJk3LOtZh-YsszSRCBDrBG78cexoG0CSM86EPs,38701
 masster/study/study5_schema.json,sha256=0IZxM9VVI0TUlx74BPzJDT44kySi6NZZ6iLR0j8bU_s,7736
 masster/study/defaults/__init__.py,sha256=m3Z5KXGqsTdh7GjYzZoENERt39yRg0ceVRV1DeCt1P0,610
 masster/study/defaults/align_def.py,sha256=Du0F592ej2einT8kOx8EUs610axSvur8_-6N19O-uJY,10209
@@ -61,14 +61,14 @@ masster/study/defaults/find_ms2_def.py,sha256=RL0DFG41wQ05U8UQKUGr3vzSl3mU0m0knQ
 masster/study/defaults/identify_def.py,sha256=96rxoCAPQj_yX-3mRoD2LTkTLJgG27eJQqwarLv5jL0,10580
 masster/study/defaults/integrate_chrom_def.py,sha256=0MNIWGTjty-Zu-NTQsIweuj3UVqEY3x1x8pK0mPwYak,7264
 masster/study/defaults/integrate_def.py,sha256=Vf4SAzdBfnsSZ3IRaF0qZvWu3gMDPHdgPfMYoPKeWv8,7246
-masster/study/defaults/merge_def.py,sha256=
+masster/study/defaults/merge_def.py,sha256=krR099IkENLlJVxpSjdje3E6h-_qtlc3Ep6Hpy6inrU,12978
 masster/study/defaults/study_def.py,sha256=h8dYbi9xv0sesCSQik49Z53IkskMmNtW6ixl7it5pL0,16033
 masster/wizard/README.md,sha256=mL1A3YWJZOefpJ6D0-HqGLkVRmUlOpwyVFdvJBeeoZM,14149
 masster/wizard/__init__.py,sha256=a2hcZnHASjfuw1lqZhZnvTR58rc33rRnoGAY_JfvGhI,683
 masster/wizard/example.py,sha256=xEZFTH9UZ8HKOm6s3JL8Js0Uw5ChnISWBHSZCL32vsM,7983
 masster/wizard/wizard.py,sha256=UobIGFZtp1s_9WJlpl6DQ2-pp7flPQ6dlYZJqYE92OM,38131
-masster-0.5.
-masster-0.5.
-masster-0.5.
-masster-0.5.
-masster-0.5.
+masster-0.5.8.dist-info/METADATA,sha256=Y_1eR5BbxbKoJmfrJE2W_gShiIa7ba2bw8vAPD6hMD4,45113
+masster-0.5.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+masster-0.5.8.dist-info/entry_points.txt,sha256=ZHguQ_vPmdbpqq2uGtmEOLJfgP-DQ1T0c07Lxh30wc8,58
+masster-0.5.8.dist-info/licenses/LICENSE,sha256=bx5iLIKjgAdYQ7sISn7DsfHRKkoCUm1154sJJKhgqnU,35184
+masster-0.5.8.dist-info/RECORD,,
{masster-0.5.6.dist-info → masster-0.5.8.dist-info}/WHEEL
File without changes

{masster-0.5.6.dist-info → masster-0.5.8.dist-info}/entry_points.txt
File without changes

{masster-0.5.6.dist-info → masster-0.5.8.dist-info}/licenses/LICENSE
File without changes