masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
CHANGED
|
@@ -19,196 +19,195 @@ from masster.study.defaults import merge_defaults
|
|
|
19
19
|
def _process_kd_chunk_parallel(chunk_data):
|
|
20
20
|
"""
|
|
21
21
|
Process a single KD chunk in parallel by reconstructing FeatureMaps from features_df slice.
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
Args:
|
|
24
24
|
chunk_data: Dictionary containing chunk processing parameters
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
Returns:
|
|
27
27
|
Tuple of (chunk_start_idx, serialized_consensus_features)
|
|
28
28
|
"""
|
|
29
29
|
import pyopenms as oms
|
|
30
|
-
|
|
31
|
-
chunk_start_idx = chunk_data[
|
|
32
|
-
chunk_features_data = chunk_data[
|
|
33
|
-
chunk_samples_data = chunk_data[
|
|
34
|
-
params_dict = chunk_data[
|
|
35
|
-
|
|
30
|
+
|
|
31
|
+
chunk_start_idx = chunk_data["chunk_start_idx"]
|
|
32
|
+
chunk_features_data = chunk_data["chunk_features_data"] # List of feature dicts
|
|
33
|
+
chunk_samples_data = chunk_data["chunk_samples_data"] # List of sample dicts
|
|
34
|
+
params_dict = chunk_data["params"]
|
|
35
|
+
|
|
36
36
|
# Reconstruct FeatureMaps from features data for each sample in the chunk
|
|
37
37
|
chunk_maps = []
|
|
38
|
-
|
|
38
|
+
|
|
39
39
|
for sample_data in chunk_samples_data:
|
|
40
|
-
sample_uid = sample_data[
|
|
41
|
-
|
|
40
|
+
sample_uid = sample_data["sample_uid"]
|
|
41
|
+
|
|
42
42
|
# Filter features for this specific sample
|
|
43
|
-
sample_features = [f for f in chunk_features_data if f[
|
|
44
|
-
|
|
43
|
+
sample_features = [f for f in chunk_features_data if f["sample_uid"] == sample_uid]
|
|
44
|
+
|
|
45
45
|
# Create FeatureMap for this sample
|
|
46
46
|
feature_map = oms.FeatureMap()
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
# Add each feature to the map
|
|
49
49
|
for feature_dict in sample_features:
|
|
50
50
|
feature = oms.Feature()
|
|
51
|
-
feature.setRT(float(feature_dict[
|
|
52
|
-
feature.setMZ(float(feature_dict[
|
|
53
|
-
feature.setIntensity(float(feature_dict[
|
|
54
|
-
feature.setCharge(int(feature_dict.get(
|
|
55
|
-
|
|
51
|
+
feature.setRT(float(feature_dict["rt"]))
|
|
52
|
+
feature.setMZ(float(feature_dict["mz"]))
|
|
53
|
+
feature.setIntensity(float(feature_dict["inty"]))
|
|
54
|
+
feature.setCharge(int(feature_dict.get("charge", 0)))
|
|
55
|
+
|
|
56
56
|
# Set unique ID using feature_id for mapping back
|
|
57
|
-
feature.setUniqueId(int(feature_dict[
|
|
58
|
-
|
|
57
|
+
feature.setUniqueId(int(feature_dict["feature_id"]))
|
|
58
|
+
|
|
59
59
|
feature_map.push_back(feature)
|
|
60
|
-
|
|
60
|
+
|
|
61
61
|
chunk_maps.append(feature_map)
|
|
62
|
-
|
|
62
|
+
|
|
63
63
|
# Create the chunk consensus map
|
|
64
64
|
chunk_consensus_map = oms.ConsensusMap()
|
|
65
|
-
|
|
65
|
+
|
|
66
66
|
# Set up file descriptions for chunk
|
|
67
67
|
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
68
68
|
for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
|
|
69
69
|
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
70
|
-
file_description.filename = sample_data[
|
|
70
|
+
file_description.filename = sample_data["sample_name"]
|
|
71
71
|
file_description.size = feature_map.size()
|
|
72
72
|
file_description.unique_id = feature_map.getUniqueId()
|
|
73
73
|
file_descriptions[j] = file_description
|
|
74
|
-
|
|
74
|
+
|
|
75
75
|
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
76
|
-
|
|
76
|
+
|
|
77
77
|
# Use KD algorithm for chunk
|
|
78
78
|
grouper = oms.FeatureGroupingAlgorithmKD()
|
|
79
79
|
chunk_params = grouper.getParameters()
|
|
80
80
|
chunk_params.setValue("mz_unit", "Da")
|
|
81
|
-
chunk_params.setValue("nr_partitions", params_dict[
|
|
81
|
+
chunk_params.setValue("nr_partitions", params_dict["nr_partitions"])
|
|
82
82
|
chunk_params.setValue("warp:enabled", "true")
|
|
83
|
-
chunk_params.setValue("warp:rt_tol", params_dict[
|
|
84
|
-
chunk_params.setValue("warp:mz_tol", params_dict[
|
|
85
|
-
chunk_params.setValue("link:rt_tol", params_dict[
|
|
86
|
-
chunk_params.setValue("link:mz_tol", params_dict[
|
|
87
|
-
chunk_params.setValue("link:min_rel_cc_size", params_dict[
|
|
88
|
-
chunk_params.setValue("link:max_pairwise_log_fc", params_dict[
|
|
89
|
-
chunk_params.setValue("link:max_nr_conflicts", params_dict[
|
|
90
|
-
|
|
83
|
+
chunk_params.setValue("warp:rt_tol", params_dict["rt_tol"])
|
|
84
|
+
chunk_params.setValue("warp:mz_tol", params_dict["mz_tol"])
|
|
85
|
+
chunk_params.setValue("link:rt_tol", params_dict["rt_tol"])
|
|
86
|
+
chunk_params.setValue("link:mz_tol", params_dict["mz_tol"])
|
|
87
|
+
chunk_params.setValue("link:min_rel_cc_size", params_dict["min_rel_cc_size"])
|
|
88
|
+
chunk_params.setValue("link:max_pairwise_log_fc", params_dict["max_pairwise_log_fc"])
|
|
89
|
+
chunk_params.setValue("link:max_nr_conflicts", params_dict["max_nr_conflicts"])
|
|
90
|
+
|
|
91
91
|
grouper.setParameters(chunk_params)
|
|
92
92
|
grouper.group(chunk_maps, chunk_consensus_map)
|
|
93
|
-
|
|
93
|
+
|
|
94
94
|
# Serialize the consensus map result for cross-process communication
|
|
95
95
|
consensus_features = []
|
|
96
96
|
for consensus_feature in chunk_consensus_map:
|
|
97
97
|
feature_data = {
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
98
|
+
"rt": consensus_feature.getRT(),
|
|
99
|
+
"mz": consensus_feature.getMZ(),
|
|
100
|
+
"intensity": consensus_feature.getIntensity(),
|
|
101
|
+
"quality": consensus_feature.getQuality(),
|
|
102
|
+
"unique_id": str(consensus_feature.getUniqueId()),
|
|
103
|
+
"features": [],
|
|
104
104
|
}
|
|
105
|
-
|
|
105
|
+
|
|
106
106
|
# Get constituent features
|
|
107
107
|
for feature_handle in consensus_feature.getFeatureList():
|
|
108
108
|
feature_handle_data = {
|
|
109
|
-
|
|
110
|
-
|
|
109
|
+
"unique_id": str(feature_handle.getUniqueId()),
|
|
110
|
+
"map_index": feature_handle.getMapIndex(),
|
|
111
111
|
}
|
|
112
|
-
feature_data[
|
|
113
|
-
|
|
112
|
+
feature_data["features"].append(feature_handle_data)
|
|
113
|
+
|
|
114
114
|
consensus_features.append(feature_data)
|
|
115
|
-
|
|
115
|
+
|
|
116
116
|
return chunk_start_idx, consensus_features
|
|
117
117
|
|
|
118
118
|
|
|
119
119
|
def _process_qt_chunk_parallel(chunk_data):
|
|
120
120
|
"""
|
|
121
121
|
Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
|
|
122
|
-
|
|
122
|
+
|
|
123
123
|
Args:
|
|
124
124
|
chunk_data: Dictionary containing chunk processing parameters
|
|
125
|
-
|
|
125
|
+
|
|
126
126
|
Returns:
|
|
127
127
|
Tuple of (chunk_start_idx, serialized_consensus_features)
|
|
128
128
|
"""
|
|
129
129
|
import pyopenms as oms
|
|
130
|
-
|
|
131
|
-
chunk_start_idx = chunk_data[
|
|
132
|
-
chunk_features_data = chunk_data[
|
|
133
|
-
chunk_samples_data = chunk_data[
|
|
134
|
-
params_dict = chunk_data[
|
|
135
|
-
|
|
130
|
+
|
|
131
|
+
chunk_start_idx = chunk_data["chunk_start_idx"]
|
|
132
|
+
chunk_features_data = chunk_data["chunk_features_data"] # List of feature dicts
|
|
133
|
+
chunk_samples_data = chunk_data["chunk_samples_data"] # List of sample dicts
|
|
134
|
+
params_dict = chunk_data["params"]
|
|
135
|
+
|
|
136
136
|
# Reconstruct FeatureMaps from features data for each sample in the chunk
|
|
137
137
|
chunk_maps = []
|
|
138
|
-
|
|
138
|
+
|
|
139
139
|
for sample_data in chunk_samples_data:
|
|
140
|
-
sample_uid = sample_data[
|
|
141
|
-
|
|
140
|
+
sample_uid = sample_data["sample_uid"]
|
|
141
|
+
|
|
142
142
|
# Filter features for this specific sample
|
|
143
|
-
sample_features = [f for f in chunk_features_data if f[
|
|
144
|
-
|
|
143
|
+
sample_features = [f for f in chunk_features_data if f["sample_uid"] == sample_uid]
|
|
144
|
+
|
|
145
145
|
# Create FeatureMap for this sample
|
|
146
146
|
feature_map = oms.FeatureMap()
|
|
147
|
-
|
|
147
|
+
|
|
148
148
|
# Add each feature to the map
|
|
149
149
|
for feature_dict in sample_features:
|
|
150
150
|
feature = oms.Feature()
|
|
151
|
-
feature.setRT(float(feature_dict[
|
|
152
|
-
feature.setMZ(float(feature_dict[
|
|
153
|
-
feature.setIntensity(float(feature_dict[
|
|
154
|
-
feature.setCharge(int(feature_dict.get(
|
|
155
|
-
|
|
151
|
+
feature.setRT(float(feature_dict["rt"]))
|
|
152
|
+
feature.setMZ(float(feature_dict["mz"]))
|
|
153
|
+
feature.setIntensity(float(feature_dict["inty"]))
|
|
154
|
+
feature.setCharge(int(feature_dict.get("charge", 0)))
|
|
155
|
+
|
|
156
156
|
# Set unique ID using feature_id for mapping back
|
|
157
|
-
feature.setUniqueId(int(feature_dict[
|
|
158
|
-
|
|
157
|
+
feature.setUniqueId(int(feature_dict["feature_id"]))
|
|
158
|
+
|
|
159
159
|
feature_map.push_back(feature)
|
|
160
|
-
|
|
160
|
+
|
|
161
161
|
chunk_maps.append(feature_map)
|
|
162
|
-
|
|
162
|
+
|
|
163
163
|
# Create the chunk consensus map
|
|
164
164
|
chunk_consensus_map = oms.ConsensusMap()
|
|
165
|
-
|
|
165
|
+
|
|
166
166
|
# Set up file descriptions for chunk
|
|
167
167
|
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
168
168
|
for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
|
|
169
169
|
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
170
|
-
file_description.filename = sample_data[
|
|
170
|
+
file_description.filename = sample_data["sample_name"]
|
|
171
171
|
file_description.size = feature_map.size()
|
|
172
172
|
file_description.unique_id = feature_map.getUniqueId()
|
|
173
173
|
file_descriptions[j] = file_description
|
|
174
|
-
|
|
174
|
+
|
|
175
175
|
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
176
|
-
|
|
176
|
+
|
|
177
177
|
# Use QT algorithm for chunk
|
|
178
178
|
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
179
179
|
chunk_params = grouper.getParameters()
|
|
180
|
-
chunk_params.setValue("distance_RT:max_difference", params_dict[
|
|
181
|
-
chunk_params.setValue("distance_MZ:max_difference", params_dict[
|
|
180
|
+
chunk_params.setValue("distance_RT:max_difference", params_dict["rt_tol"])
|
|
181
|
+
chunk_params.setValue("distance_MZ:max_difference", params_dict["mz_tol"])
|
|
182
182
|
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
183
183
|
chunk_params.setValue("ignore_charge", "true")
|
|
184
|
-
chunk_params.setValue("nr_partitions", params_dict[
|
|
185
|
-
|
|
184
|
+
chunk_params.setValue("nr_partitions", params_dict["nr_partitions"])
|
|
186
185
|
|
|
187
186
|
grouper.setParameters(chunk_params)
|
|
188
187
|
grouper.group(chunk_maps, chunk_consensus_map)
|
|
189
|
-
|
|
188
|
+
|
|
190
189
|
# Serialize the consensus map result for cross-process communication
|
|
191
190
|
consensus_features = []
|
|
192
191
|
for consensus_feature in chunk_consensus_map:
|
|
193
192
|
feature_data = {
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
193
|
+
"rt": consensus_feature.getRT(),
|
|
194
|
+
"mz": consensus_feature.getMZ(),
|
|
195
|
+
"intensity": consensus_feature.getIntensity(),
|
|
196
|
+
"quality": consensus_feature.getQuality(),
|
|
197
|
+
"unique_id": str(consensus_feature.getUniqueId()),
|
|
198
|
+
"features": [],
|
|
200
199
|
}
|
|
201
|
-
|
|
200
|
+
|
|
202
201
|
# Get constituent features
|
|
203
202
|
for feature_handle in consensus_feature.getFeatureList():
|
|
204
203
|
feature_handle_data = {
|
|
205
|
-
|
|
206
|
-
|
|
204
|
+
"unique_id": str(feature_handle.getUniqueId()),
|
|
205
|
+
"map_index": feature_handle.getMapIndex(),
|
|
207
206
|
}
|
|
208
|
-
feature_data[
|
|
209
|
-
|
|
207
|
+
feature_data["features"].append(feature_handle_data)
|
|
208
|
+
|
|
210
209
|
consensus_features.append(feature_data)
|
|
211
|
-
|
|
210
|
+
|
|
212
211
|
return chunk_start_idx, consensus_features
|
|
213
212
|
|
|
214
213
|
|
|
@@ -225,7 +224,7 @@ def merge(study, **kwargs) -> None:
|
|
|
225
224
|
Parameters from merge_defaults class:
|
|
226
225
|
- method : str, default 'kd'
|
|
227
226
|
Merge algorithm: 'kd', 'qt', 'kd_chunked', 'qt_chunked'
|
|
228
|
-
- min_samples : int, default 2
|
|
227
|
+
- min_samples : int, default 2
|
|
229
228
|
Minimum number of samples for consensus feature
|
|
230
229
|
- rt_tol : float, default 5.0
|
|
231
230
|
RT tolerance in seconds
|
|
@@ -261,7 +260,7 @@ def merge(study, **kwargs) -> None:
|
|
|
261
260
|
- Uses spatial partitioning for efficient feature matching
|
|
262
261
|
|
|
263
262
|
**QT (Quality Threshold)**
|
|
264
|
-
- Thorough O(n²) clustering algorithm
|
|
263
|
+
- Thorough O(n²) clustering algorithm
|
|
265
264
|
- Most accurate but slowest method
|
|
266
265
|
- Recommended for small datasets (<1,000 samples)
|
|
267
266
|
- Guarantees quality threshold constraints
|
|
@@ -326,7 +325,7 @@ def merge(study, **kwargs) -> None:
|
|
|
326
325
|
study.merge(method='qt', rt_tol=2.0, mz_tol=0.005, min_samples=5)
|
|
327
326
|
|
|
328
327
|
Large dataset with parallel processing:
|
|
329
|
-
study.merge(method='kd_chunked', threads=8, chunk_size=500,
|
|
328
|
+
study.merge(method='kd_chunked', threads=8, chunk_size=500,
|
|
330
329
|
dechunking='hierarchical')
|
|
331
330
|
|
|
332
331
|
Custom tolerances for specific instrument:
|
|
@@ -341,11 +340,11 @@ def merge(study, **kwargs) -> None:
|
|
|
341
340
|
- Adduct relationships are identified and stored after merging
|
|
342
341
|
"""
|
|
343
342
|
# Initialize with defaults and override with kwargs
|
|
344
|
-
params = merge_defaults()
|
|
345
|
-
|
|
343
|
+
params = merge_defaults()
|
|
344
|
+
|
|
346
345
|
# Handle 'params' keyword argument specifically (like merge does)
|
|
347
|
-
if
|
|
348
|
-
provided_params = kwargs.pop(
|
|
346
|
+
if "params" in kwargs:
|
|
347
|
+
provided_params = kwargs.pop("params")
|
|
349
348
|
if isinstance(provided_params, merge_defaults):
|
|
350
349
|
params = provided_params
|
|
351
350
|
study.logger.debug("Using provided merge_defaults parameters from 'params' argument")
|
|
@@ -370,71 +369,69 @@ def merge(study, **kwargs) -> None:
|
|
|
370
369
|
|
|
371
370
|
# Backward compatibility: Map old method names to new names
|
|
372
371
|
method_mapping = {
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
372
|
+
"qtchunked": "qt_chunked", # QT chunked variants
|
|
373
|
+
"qt-chunked": "qt_chunked",
|
|
374
|
+
"kdchunked": "kd_chunked", # KD chunked variants
|
|
375
|
+
"kd-chunked": "kd_chunked",
|
|
377
376
|
}
|
|
378
|
-
|
|
377
|
+
|
|
379
378
|
if params.method in method_mapping:
|
|
380
379
|
old_method = params.method
|
|
381
380
|
params.method = method_mapping[old_method]
|
|
382
381
|
study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
|
|
383
|
-
|
|
382
|
+
|
|
384
383
|
# Validate method
|
|
385
|
-
if params.method not in [
|
|
384
|
+
if params.method not in ["kd", "qt", "kd_chunked", "qt_chunked"]:
|
|
386
385
|
raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['kd', 'qt', 'kd_chunked', 'qt_chunked']")
|
|
387
|
-
|
|
386
|
+
|
|
388
387
|
# Check if chunked method is advisable for large datasets
|
|
389
|
-
num_samples = len(study.samples_df) if hasattr(study,
|
|
388
|
+
num_samples = len(study.samples_df) if hasattr(study, "samples_df") and study.samples_df is not None else 0
|
|
390
389
|
if num_samples == 0:
|
|
391
390
|
raise ValueError("No samples loaded in study. Load features before merging.")
|
|
392
|
-
if params.method ==
|
|
393
|
-
params.method =
|
|
391
|
+
if params.method == "kd" and num_samples > params.chunk_size:
|
|
392
|
+
params.method = "kd_chunked"
|
|
394
393
|
study.logger.info(
|
|
395
394
|
f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
|
|
396
395
|
)
|
|
397
|
-
if params.method ==
|
|
398
|
-
params.method =
|
|
396
|
+
if params.method == "qt" and num_samples > params.chunk_size:
|
|
397
|
+
params.method = "qt_chunked"
|
|
399
398
|
study.logger.info(
|
|
400
399
|
f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
|
|
401
400
|
)
|
|
402
401
|
|
|
403
402
|
if num_samples > 500:
|
|
404
|
-
if params.method not in {
|
|
403
|
+
if params.method not in {"kd_chunked", "qt_chunked"}:
|
|
405
404
|
study.logger.warning(
|
|
406
405
|
f"Large dataset detected ({num_samples} samples > 500). Consider dropping chunk_size to 500 to use chunked methods."
|
|
407
406
|
)
|
|
408
|
-
|
|
407
|
+
|
|
409
408
|
# Persist last used params for diagnostics
|
|
410
409
|
try:
|
|
411
410
|
study._merge_params_last = params.to_dict()
|
|
412
411
|
except Exception:
|
|
413
412
|
study._merge_params_last = {}
|
|
414
|
-
|
|
413
|
+
|
|
415
414
|
# Store merge parameters in history
|
|
416
415
|
try:
|
|
417
|
-
if hasattr(study,
|
|
418
|
-
study.update_history([
|
|
416
|
+
if hasattr(study, "store_history"):
|
|
417
|
+
study.update_history(["merge"], params.to_dict())
|
|
419
418
|
else:
|
|
420
419
|
study.logger.warning("History storage not available - parameters not saved to history")
|
|
421
420
|
except Exception as e:
|
|
422
421
|
study.logger.warning(f"Failed to store merge parameters in history: {e}")
|
|
423
|
-
|
|
422
|
+
|
|
424
423
|
# Ensure feature maps are available for merging (regenerate if needed)
|
|
425
424
|
if len(study.features_maps) < len(study.samples_df):
|
|
426
425
|
study.features_maps = []
|
|
427
426
|
# Feature maps will be generated on-demand within each merge method
|
|
428
|
-
|
|
427
|
+
|
|
429
428
|
study.logger.info(
|
|
430
|
-
|
|
431
|
-
|
|
429
|
+
f"Merging samples using {params.method}, min_samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
|
|
430
|
+
)
|
|
432
431
|
if "chunked" in params.method:
|
|
433
|
-
study.logger.info(
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
# Initialize
|
|
432
|
+
study.logger.info(f"threads={params.threads}, chunk_size={params.chunk_size}, dechunking='{params.dechunking}'")
|
|
433
|
+
|
|
434
|
+
# Initialize
|
|
438
435
|
study.consensus_df = pl.DataFrame()
|
|
439
436
|
study.consensus_ms2 = pl.DataFrame()
|
|
440
437
|
study.consensus_mapping_df = pl.DataFrame()
|
|
@@ -451,67 +448,67 @@ def merge(study, **kwargs) -> None:
|
|
|
451
448
|
except Exception as e:
|
|
452
449
|
study.logger.warning(f"Could not retrieve study adducts: {e}")
|
|
453
450
|
cached_valid_adducts = set()
|
|
454
|
-
|
|
451
|
+
|
|
455
452
|
# Always allow '?' adducts
|
|
456
453
|
cached_valid_adducts.add("?")
|
|
457
|
-
|
|
458
|
-
# Route to algorithm implementation
|
|
459
|
-
if params.method ==
|
|
454
|
+
|
|
455
|
+
# Route to algorithm implementation
|
|
456
|
+
if params.method == "kd":
|
|
460
457
|
consensus_map = _merge_kd(study, params)
|
|
461
458
|
# Extract consensus features
|
|
462
459
|
_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
|
|
463
|
-
elif params.method ==
|
|
460
|
+
elif params.method == "qt":
|
|
464
461
|
consensus_map = _merge_qt(study, params)
|
|
465
462
|
# Extract consensus features
|
|
466
463
|
_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
|
|
467
|
-
elif params.method ==
|
|
464
|
+
elif params.method == "kd_chunked":
|
|
468
465
|
consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
|
|
469
466
|
# Note: _merge_kd_chunked populates consensus_df directly, no need to extract
|
|
470
|
-
elif params.method ==
|
|
467
|
+
elif params.method == "qt_chunked":
|
|
471
468
|
consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
|
|
472
469
|
# Note: _merge_qt_chunked populates consensus_df directly, no need to extract
|
|
473
|
-
|
|
470
|
+
|
|
474
471
|
# Enhanced post-clustering to merge over-segmented features (for non-chunked methods)
|
|
475
472
|
# Chunked methods already perform their own cross-chunk consensus building
|
|
476
|
-
if params.method in [
|
|
473
|
+
if params.method in ["qt", "kd"]:
|
|
477
474
|
__consensus_cleanup(study, params.rt_tol, params.mz_tol)
|
|
478
|
-
|
|
475
|
+
|
|
479
476
|
# Perform adduct grouping
|
|
480
477
|
_perform_adduct_grouping(study, params.rt_tol, params.mz_tol)
|
|
481
|
-
|
|
478
|
+
|
|
482
479
|
# Identify coeluting consensus features by mass shifts and update adduct information
|
|
483
480
|
__identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
|
|
484
|
-
|
|
481
|
+
|
|
485
482
|
# Post-processing for chunked methods: merge partial consensus features
|
|
486
|
-
if params.method in [
|
|
483
|
+
if params.method in ["qt_chunked", "kd_chunked"]:
|
|
487
484
|
_merge_partial_consensus_features(study, params.rt_tol, params.mz_tol)
|
|
488
|
-
|
|
485
|
+
|
|
489
486
|
# Finalize merge: filter by min_samples and add isotope/MS2 data
|
|
490
487
|
__finalize_merge(study, params.link_ms2, params.extract_ms1, params.min_samples)
|
|
491
488
|
|
|
492
489
|
|
|
493
490
|
def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
|
|
494
491
|
"""KD-tree based merge (fast, recommended)"""
|
|
495
|
-
|
|
492
|
+
|
|
496
493
|
# Generate temporary feature maps on-demand from features_df
|
|
497
494
|
temp_feature_maps = _generate_feature_maps_on_demand(study)
|
|
498
|
-
|
|
495
|
+
|
|
499
496
|
consensus_map = oms.ConsensusMap()
|
|
500
497
|
file_descriptions = consensus_map.getColumnHeaders()
|
|
501
|
-
|
|
498
|
+
|
|
502
499
|
for i, feature_map in enumerate(temp_feature_maps):
|
|
503
500
|
file_description = file_descriptions.get(i, oms.ColumnHeader())
|
|
504
501
|
file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
|
|
505
502
|
file_description.size = feature_map.size()
|
|
506
503
|
file_description.unique_id = feature_map.getUniqueId()
|
|
507
504
|
file_descriptions[i] = file_description
|
|
508
|
-
|
|
505
|
+
|
|
509
506
|
consensus_map.setColumnHeaders(file_descriptions)
|
|
510
|
-
|
|
507
|
+
|
|
511
508
|
# Configure KD algorithm
|
|
512
509
|
grouper = oms.FeatureGroupingAlgorithmKD()
|
|
513
510
|
params_oms = grouper.getParameters()
|
|
514
|
-
|
|
511
|
+
|
|
515
512
|
params_oms.setValue("mz_unit", "Da")
|
|
516
513
|
params_oms.setValue("nr_partitions", params.nr_partitions)
|
|
517
514
|
params_oms.setValue("warp:enabled", "true")
|
|
@@ -519,10 +516,10 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
519
516
|
params_oms.setValue("warp:mz_tol", params.mz_tol)
|
|
520
517
|
params_oms.setValue("link:rt_tol", params.rt_tol)
|
|
521
518
|
params_oms.setValue("link:mz_tol", params.mz_tol)
|
|
522
|
-
|
|
519
|
+
|
|
523
520
|
grouper.setParameters(params_oms)
|
|
524
521
|
grouper.group(temp_feature_maps, consensus_map)
|
|
525
|
-
|
|
522
|
+
|
|
526
523
|
return consensus_map
|
|
527
524
|
|
|
528
525
|
|
|
@@ -530,49 +527,49 @@ def _generate_feature_maps_on_demand(study):
|
|
|
530
527
|
"""
|
|
531
528
|
Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
|
|
532
529
|
Returns temporary feature maps that are not cached in the study.
|
|
533
|
-
|
|
530
|
+
|
|
534
531
|
Args:
|
|
535
532
|
study: Study object containing samples
|
|
536
|
-
|
|
533
|
+
|
|
537
534
|
Returns:
|
|
538
535
|
list: List of temporary FeatureMap objects
|
|
539
536
|
"""
|
|
540
537
|
import polars as pl
|
|
541
538
|
import pyopenms as oms
|
|
542
539
|
import numpy as np
|
|
543
|
-
|
|
540
|
+
|
|
544
541
|
# Check if we should use Sample-level loading instead of features_df
|
|
545
542
|
use_sample_loading = True # Default to Sample-level loading as requested
|
|
546
|
-
|
|
543
|
+
|
|
547
544
|
# Use Sample-level loading if requested and samples_df is available
|
|
548
|
-
#if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
|
|
545
|
+
# if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
|
|
549
546
|
# study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
|
|
550
547
|
# return _generate_feature_maps_from_samples(study)
|
|
551
|
-
|
|
548
|
+
|
|
552
549
|
# Fallback to original features_df approach
|
|
553
550
|
if study.features_df is None or len(study.features_df) == 0:
|
|
554
551
|
study.logger.error("No features_df available for generating feature maps")
|
|
555
552
|
return []
|
|
556
|
-
|
|
553
|
+
|
|
557
554
|
temp_feature_maps = []
|
|
558
555
|
n_samples = len(study.samples_df)
|
|
559
556
|
n_features = len(study.features_df)
|
|
560
|
-
|
|
557
|
+
|
|
561
558
|
# Performance optimization: use efficient polars groupby for large datasets
|
|
562
559
|
use_groupby_optimization = n_features > 5000
|
|
563
560
|
if use_groupby_optimization:
|
|
564
561
|
study.logger.debug(f"Using polars groupby optimization for {n_features} features across {n_samples} samples")
|
|
565
|
-
|
|
562
|
+
|
|
566
563
|
# Pre-group features by sample_uid - this is much more efficient than repeated filtering
|
|
567
564
|
features_by_sample = study.features_df.group_by("sample_uid").agg([
|
|
568
565
|
pl.col("feature_id"),
|
|
569
|
-
pl.col("mz"),
|
|
566
|
+
pl.col("mz"),
|
|
570
567
|
pl.col("rt"),
|
|
571
568
|
pl.col("inty"),
|
|
572
569
|
pl.col("quality").fill_null(1.0),
|
|
573
|
-
pl.col("charge").fill_null(0)
|
|
570
|
+
pl.col("charge").fill_null(0),
|
|
574
571
|
])
|
|
575
|
-
|
|
572
|
+
|
|
576
573
|
# Convert to dictionary for fast lookups
|
|
577
574
|
sample_feature_dict = {}
|
|
578
575
|
for row in features_by_sample.iter_rows(named=True):
|
|
@@ -584,31 +581,31 @@ def _generate_feature_maps_on_demand(study):
|
|
|
584
581
|
"rt": np.array(row["rt"]),
|
|
585
582
|
"inty": np.array(row["inty"]),
|
|
586
583
|
"quality": np.array(row["quality"]),
|
|
587
|
-
"charge": np.array(row["charge"])
|
|
584
|
+
"charge": np.array(row["charge"]),
|
|
588
585
|
}
|
|
589
|
-
|
|
586
|
+
|
|
590
587
|
# Process each sample in order
|
|
591
588
|
for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
|
|
592
589
|
sample_uid = row_dict["sample_uid"]
|
|
593
|
-
|
|
590
|
+
|
|
594
591
|
if use_groupby_optimization:
|
|
595
592
|
# Use pre-grouped data with vectorized operations
|
|
596
593
|
if sample_uid not in sample_feature_dict:
|
|
597
594
|
feature_map = oms.FeatureMap()
|
|
598
595
|
temp_feature_maps.append(feature_map)
|
|
599
596
|
continue
|
|
600
|
-
|
|
597
|
+
|
|
601
598
|
sample_data = sample_feature_dict[sample_uid]
|
|
602
599
|
n_sample_features = len(sample_data["feature_id"])
|
|
603
|
-
|
|
600
|
+
|
|
604
601
|
if n_sample_features == 0:
|
|
605
602
|
feature_map = oms.FeatureMap()
|
|
606
603
|
temp_feature_maps.append(feature_map)
|
|
607
604
|
continue
|
|
608
|
-
|
|
605
|
+
|
|
609
606
|
# Create new FeatureMap
|
|
610
607
|
feature_map = oms.FeatureMap()
|
|
611
|
-
|
|
608
|
+
|
|
612
609
|
# Use vectorized data directly (no conversion needed)
|
|
613
610
|
for i in range(n_sample_features):
|
|
614
611
|
try:
|
|
@@ -626,14 +623,14 @@ def _generate_feature_maps_on_demand(study):
|
|
|
626
623
|
else:
|
|
627
624
|
# Use original polars-based approach for smaller datasets
|
|
628
625
|
sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
|
|
629
|
-
|
|
626
|
+
|
|
630
627
|
# Create new FeatureMap
|
|
631
628
|
feature_map = oms.FeatureMap()
|
|
632
|
-
|
|
629
|
+
|
|
633
630
|
# Convert DataFrame features to OpenMS Features
|
|
634
631
|
for feature_row in sample_features.iter_rows(named=True):
|
|
635
632
|
feature = oms.Feature()
|
|
636
|
-
|
|
633
|
+
|
|
637
634
|
# Set properties from DataFrame (handle missing values gracefully)
|
|
638
635
|
try:
|
|
639
636
|
feature.setUniqueId(int(feature_row["feature_id"]))
|
|
@@ -642,45 +639,45 @@ def _generate_feature_maps_on_demand(study):
|
|
|
642
639
|
feature.setIntensity(float(feature_row["inty"]))
|
|
643
640
|
feature.setOverallQuality(float(feature_row["quality"]))
|
|
644
641
|
feature.setCharge(int(feature_row["charge"]))
|
|
645
|
-
|
|
642
|
+
|
|
646
643
|
# Add to feature map
|
|
647
644
|
feature_map.push_back(feature)
|
|
648
645
|
except (ValueError, TypeError) as e:
|
|
649
646
|
study.logger.warning(f"Skipping feature due to conversion error: {e}")
|
|
650
647
|
continue
|
|
651
|
-
|
|
648
|
+
|
|
652
649
|
temp_feature_maps.append(feature_map)
|
|
653
|
-
|
|
650
|
+
|
|
654
651
|
study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df")
|
|
655
652
|
return temp_feature_maps
|
|
656
653
|
|
|
657
654
|
|
|
658
655
|
def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
|
|
659
656
|
"""QT (Quality Threshold) based merge"""
|
|
660
|
-
|
|
657
|
+
|
|
661
658
|
# Generate temporary feature maps on-demand from features_df
|
|
662
659
|
temp_feature_maps = _generate_feature_maps_on_demand(study)
|
|
663
|
-
|
|
660
|
+
|
|
664
661
|
n_samples = len(temp_feature_maps)
|
|
665
662
|
if n_samples > 1000:
|
|
666
663
|
study.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
|
|
667
|
-
|
|
664
|
+
|
|
668
665
|
consensus_map = oms.ConsensusMap()
|
|
669
666
|
file_descriptions = consensus_map.getColumnHeaders()
|
|
670
|
-
|
|
667
|
+
|
|
671
668
|
for i, feature_map in enumerate(temp_feature_maps):
|
|
672
669
|
file_description = file_descriptions.get(i, oms.ColumnHeader())
|
|
673
670
|
file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
|
|
674
671
|
file_description.size = feature_map.size()
|
|
675
672
|
file_description.unique_id = feature_map.getUniqueId()
|
|
676
673
|
file_descriptions[i] = file_description
|
|
677
|
-
|
|
674
|
+
|
|
678
675
|
consensus_map.setColumnHeaders(file_descriptions)
|
|
679
|
-
|
|
676
|
+
|
|
680
677
|
# Configure QT algorithm
|
|
681
678
|
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
682
679
|
params_oms = grouper.getParameters()
|
|
683
|
-
|
|
680
|
+
|
|
684
681
|
params_oms.setValue("distance_RT:max_difference", params.rt_tol)
|
|
685
682
|
params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
|
|
686
683
|
params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
|
|
@@ -689,16 +686,18 @@ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
689
686
|
|
|
690
687
|
grouper.setParameters(params_oms)
|
|
691
688
|
grouper.group(temp_feature_maps, consensus_map)
|
|
692
|
-
|
|
689
|
+
|
|
693
690
|
return consensus_map
|
|
694
691
|
|
|
695
692
|
|
|
696
|
-
def _merge_kd_chunked(
|
|
693
|
+
def _merge_kd_chunked(
|
|
694
|
+
study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None
|
|
695
|
+
) -> oms.ConsensusMap:
|
|
697
696
|
"""KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
698
|
-
|
|
697
|
+
|
|
699
698
|
# Generate temporary feature maps on-demand from features_df
|
|
700
699
|
temp_feature_maps = _generate_feature_maps_on_demand(study)
|
|
701
|
-
|
|
700
|
+
|
|
702
701
|
n_samples = len(temp_feature_maps)
|
|
703
702
|
if n_samples <= params.chunk_size:
|
|
704
703
|
study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
|
|
@@ -706,23 +705,31 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
706
705
|
# Extract consensus features to populate consensus_df for chunked method consistency
|
|
707
706
|
_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
|
|
708
707
|
return consensus_map
|
|
709
|
-
|
|
708
|
+
|
|
710
709
|
# Process in chunks
|
|
711
710
|
chunks = []
|
|
712
711
|
for i in range(0, n_samples, params.chunk_size):
|
|
713
712
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
714
713
|
chunks.append((i, temp_feature_maps[i:chunk_end]))
|
|
715
|
-
|
|
716
|
-
study.logger.debug(
|
|
717
|
-
|
|
714
|
+
|
|
715
|
+
study.logger.debug(
|
|
716
|
+
f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)"
|
|
717
|
+
)
|
|
718
|
+
|
|
718
719
|
# Process each chunk to create chunk consensus maps
|
|
719
720
|
chunk_consensus_maps = []
|
|
720
|
-
|
|
721
|
+
|
|
721
722
|
if params.threads is None:
|
|
722
723
|
# Sequential processing (original behavior)
|
|
723
|
-
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(
|
|
724
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(
|
|
725
|
+
tqdm(
|
|
726
|
+
chunks,
|
|
727
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk",
|
|
728
|
+
disable=study.log_level not in ["TRACE", "DEBUG", "INFO"],
|
|
729
|
+
)
|
|
730
|
+
):
|
|
724
731
|
chunk_consensus_map = oms.ConsensusMap()
|
|
725
|
-
|
|
732
|
+
|
|
726
733
|
# Set up file descriptions for chunk
|
|
727
734
|
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
728
735
|
for j, feature_map in enumerate(chunk_maps):
|
|
@@ -731,9 +738,9 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
731
738
|
file_description.size = feature_map.size()
|
|
732
739
|
file_description.unique_id = feature_map.getUniqueId()
|
|
733
740
|
file_descriptions[j] = file_description
|
|
734
|
-
|
|
741
|
+
|
|
735
742
|
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
736
|
-
|
|
743
|
+
|
|
737
744
|
# Use KD algorithm for chunk
|
|
738
745
|
grouper = oms.FeatureGroupingAlgorithmKD()
|
|
739
746
|
chunk_params = grouper.getParameters()
|
|
@@ -747,16 +754,16 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
747
754
|
chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
|
|
748
755
|
chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
749
756
|
chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
|
|
750
|
-
|
|
757
|
+
|
|
751
758
|
grouper.setParameters(chunk_params)
|
|
752
759
|
grouper.group(chunk_maps, chunk_consensus_map)
|
|
753
|
-
|
|
760
|
+
|
|
754
761
|
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
755
|
-
|
|
762
|
+
|
|
756
763
|
else:
|
|
757
764
|
# Parallel processing
|
|
758
|
-
#study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
759
|
-
|
|
765
|
+
# study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
766
|
+
|
|
760
767
|
# Prepare chunk data for parallel processing using features_df slices
|
|
761
768
|
chunk_data_list = []
|
|
762
769
|
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
|
|
@@ -765,58 +772,65 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
765
772
|
chunk_samples_df_rows = []
|
|
766
773
|
for j in range(len(chunk_maps)):
|
|
767
774
|
sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
|
|
768
|
-
chunk_sample_uids.append(sample_row[
|
|
775
|
+
chunk_sample_uids.append(sample_row["sample_uid"])
|
|
769
776
|
chunk_samples_df_rows.append(sample_row)
|
|
770
|
-
|
|
777
|
+
|
|
771
778
|
# Create a DataFrame for this chunk's samples
|
|
772
779
|
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
773
|
-
|
|
780
|
+
|
|
774
781
|
# Filter features_df for this chunk's samples and select only necessary columns
|
|
775
|
-
chunk_features_df = study.features_df.filter(
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
782
|
+
chunk_features_df = study.features_df.filter(pl.col("sample_uid").is_in(chunk_sample_uids)).select([
|
|
783
|
+
"sample_uid",
|
|
784
|
+
"rt",
|
|
785
|
+
"mz",
|
|
786
|
+
"inty",
|
|
787
|
+
"charge",
|
|
788
|
+
"feature_id",
|
|
779
789
|
])
|
|
780
|
-
|
|
790
|
+
|
|
781
791
|
# Convert DataFrames to serializable format (lists of dicts)
|
|
782
792
|
chunk_features_data = chunk_features_df.to_dicts()
|
|
783
793
|
chunk_samples_data = chunk_samples_df.to_dicts()
|
|
784
|
-
|
|
794
|
+
|
|
785
795
|
chunk_data = {
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
}
|
|
796
|
+
"chunk_start_idx": chunk_start_idx,
|
|
797
|
+
"chunk_features_data": chunk_features_data, # List of dicts instead of DataFrame
|
|
798
|
+
"chunk_samples_data": chunk_samples_data, # List of dicts instead of DataFrame
|
|
799
|
+
"params": {
|
|
800
|
+
"nr_partitions": params.nr_partitions,
|
|
801
|
+
"rt_tol": params.rt_tol,
|
|
802
|
+
"mz_tol": params.mz_tol,
|
|
803
|
+
"min_rel_cc_size": params.min_rel_cc_size,
|
|
804
|
+
"max_pairwise_log_fc": params.max_pairwise_log_fc,
|
|
805
|
+
"max_nr_conflicts": params.max_nr_conflicts,
|
|
806
|
+
},
|
|
797
807
|
}
|
|
798
808
|
chunk_data_list.append(chunk_data)
|
|
799
|
-
|
|
809
|
+
|
|
800
810
|
# Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
|
|
801
811
|
try:
|
|
802
812
|
with ProcessPoolExecutor(max_workers=params.threads) as executor:
|
|
803
813
|
# Submit all chunk processing tasks
|
|
804
|
-
future_to_chunk = {
|
|
805
|
-
|
|
806
|
-
|
|
814
|
+
future_to_chunk = {
|
|
815
|
+
executor.submit(_process_kd_chunk_parallel, chunk_data): i
|
|
816
|
+
for i, chunk_data in enumerate(chunk_data_list)
|
|
817
|
+
}
|
|
818
|
+
|
|
807
819
|
# Collect results with progress tracking
|
|
808
820
|
completed_chunks = 0
|
|
809
821
|
total_chunks = len(chunk_data_list)
|
|
810
822
|
serialized_chunk_results = []
|
|
811
|
-
|
|
823
|
+
|
|
812
824
|
for future in as_completed(future_to_chunk):
|
|
813
825
|
chunk_idx = future_to_chunk[future]
|
|
814
826
|
try:
|
|
815
827
|
chunk_start_idx, consensus_features = future.result()
|
|
816
828
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
817
829
|
completed_chunks += 1
|
|
818
|
-
n_samples_in_chunk = len(chunk_data_list[chunk_idx][
|
|
819
|
-
study.logger.info(
|
|
830
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
|
|
831
|
+
study.logger.info(
|
|
832
|
+
f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
|
|
833
|
+
)
|
|
820
834
|
except Exception as exc:
|
|
821
835
|
# Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
|
|
822
836
|
if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
|
|
@@ -825,60 +839,71 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
825
839
|
else:
|
|
826
840
|
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
827
841
|
raise exc
|
|
828
|
-
|
|
842
|
+
|
|
829
843
|
except (RuntimeError, OSError, BrokenProcessPool) as e:
|
|
830
844
|
# Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
|
|
831
|
-
if (
|
|
832
|
-
"
|
|
845
|
+
if (
|
|
846
|
+
"freeze_support" in str(e)
|
|
847
|
+
or "spawn" in str(e)
|
|
848
|
+
or "bootstrapping" in str(e)
|
|
849
|
+
or "process pool" in str(e).lower()
|
|
850
|
+
or "Windows multiprocessing failure" in str(e)
|
|
851
|
+
):
|
|
833
852
|
study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
|
|
834
853
|
study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
|
|
835
|
-
|
|
854
|
+
|
|
836
855
|
with ThreadPoolExecutor(max_workers=params.threads) as executor:
|
|
837
856
|
# Submit all chunk processing tasks
|
|
838
|
-
future_to_chunk = {
|
|
839
|
-
|
|
840
|
-
|
|
857
|
+
future_to_chunk = {
|
|
858
|
+
executor.submit(_process_kd_chunk_parallel, chunk_data): i
|
|
859
|
+
for i, chunk_data in enumerate(chunk_data_list)
|
|
860
|
+
}
|
|
861
|
+
|
|
841
862
|
# Collect results with progress tracking
|
|
842
863
|
completed_chunks = 0
|
|
843
864
|
total_chunks = len(chunk_data_list)
|
|
844
865
|
serialized_chunk_results = []
|
|
845
|
-
|
|
866
|
+
|
|
846
867
|
for future in as_completed(future_to_chunk):
|
|
847
868
|
chunk_idx = future_to_chunk[future]
|
|
848
869
|
try:
|
|
849
870
|
chunk_start_idx, consensus_features = future.result()
|
|
850
871
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
851
872
|
completed_chunks += 1
|
|
852
|
-
n_samples_in_chunk = len(chunk_data_list[chunk_idx][
|
|
853
|
-
study.logger.info(
|
|
873
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
|
|
874
|
+
study.logger.info(
|
|
875
|
+
f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
|
|
876
|
+
)
|
|
854
877
|
except Exception as exc:
|
|
855
878
|
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
856
879
|
raise exc
|
|
857
880
|
else:
|
|
858
881
|
# Re-raise other exceptions
|
|
859
882
|
raise
|
|
860
|
-
|
|
861
|
-
# Store serialized results for _merge_chunk_results to handle directly
|
|
883
|
+
|
|
884
|
+
# Store serialized results for _merge_chunk_results to handle directly
|
|
862
885
|
chunk_consensus_maps = []
|
|
863
886
|
for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
|
|
864
887
|
# Store serialized data directly for _merge_chunk_results to handle
|
|
865
888
|
chunk_consensus_maps.append((chunk_start_idx, consensus_features))
|
|
866
|
-
|
|
867
|
-
# Merge chunk results with proper cross-chunk consensus building
|
|
889
|
+
|
|
890
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
868
891
|
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
869
892
|
_dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
870
|
-
|
|
893
|
+
|
|
871
894
|
# Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
|
|
872
895
|
consensus_map = oms.ConsensusMap()
|
|
873
896
|
return consensus_map
|
|
874
897
|
|
|
875
898
|
|
|
876
|
-
def _merge_qt_chunked(
|
|
899
|
+
def _merge_qt_chunked(
|
|
900
|
+
study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None
|
|
901
|
+
) -> oms.ConsensusMap:
|
|
877
902
|
"""QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
878
|
-
|
|
903
|
+
|
|
879
904
|
# Generate temporary feature maps on-demand from features_df
|
|
880
905
|
temp_feature_maps = _generate_feature_maps_on_demand(study)
|
|
881
|
-
|
|
906
|
+
|
|
882
907
|
n_samples = len(temp_feature_maps)
|
|
883
908
|
if n_samples <= params.chunk_size:
|
|
884
909
|
study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
|
|
@@ -886,23 +911,31 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
886
911
|
# Extract consensus features to populate consensus_df for chunked method consistency
|
|
887
912
|
_extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
|
|
888
913
|
return consensus_map
|
|
889
|
-
|
|
914
|
+
|
|
890
915
|
# Process in chunks
|
|
891
916
|
chunks = []
|
|
892
917
|
for i in range(0, n_samples, params.chunk_size):
|
|
893
918
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
894
919
|
chunks.append((i, temp_feature_maps[i:chunk_end]))
|
|
895
|
-
|
|
896
|
-
study.logger.debug(
|
|
897
|
-
|
|
920
|
+
|
|
921
|
+
study.logger.debug(
|
|
922
|
+
f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)"
|
|
923
|
+
)
|
|
924
|
+
|
|
898
925
|
# Process each chunk to create chunk consensus maps
|
|
899
926
|
chunk_consensus_maps = []
|
|
900
|
-
|
|
927
|
+
|
|
901
928
|
if params.threads is None:
|
|
902
929
|
# Sequential processing (original behavior)
|
|
903
|
-
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(
|
|
930
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(
|
|
931
|
+
tqdm(
|
|
932
|
+
chunks,
|
|
933
|
+
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk",
|
|
934
|
+
disable=study.log_level not in ["TRACE", "DEBUG", "INFO"],
|
|
935
|
+
)
|
|
936
|
+
):
|
|
904
937
|
chunk_consensus_map = oms.ConsensusMap()
|
|
905
|
-
|
|
938
|
+
|
|
906
939
|
# Set up file descriptions for chunk
|
|
907
940
|
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
908
941
|
for j, feature_map in enumerate(chunk_maps):
|
|
@@ -911,9 +944,9 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
911
944
|
file_description.size = feature_map.size()
|
|
912
945
|
file_description.unique_id = feature_map.getUniqueId()
|
|
913
946
|
file_descriptions[j] = file_description
|
|
914
|
-
|
|
947
|
+
|
|
915
948
|
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
916
|
-
|
|
949
|
+
|
|
917
950
|
# Use QT algorithm for chunk (main difference from KD chunked)
|
|
918
951
|
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
919
952
|
chunk_params = grouper.getParameters()
|
|
@@ -922,16 +955,16 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
922
955
|
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
923
956
|
chunk_params.setValue("ignore_charge", "true")
|
|
924
957
|
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
925
|
-
|
|
958
|
+
|
|
926
959
|
grouper.setParameters(chunk_params)
|
|
927
960
|
grouper.group(chunk_maps, chunk_consensus_map)
|
|
928
|
-
|
|
961
|
+
|
|
929
962
|
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
930
|
-
|
|
963
|
+
|
|
931
964
|
else:
|
|
932
965
|
# Parallel processing
|
|
933
|
-
#study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
934
|
-
|
|
966
|
+
# study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
967
|
+
|
|
935
968
|
# Prepare chunk data for parallel processing using features_df slices
|
|
936
969
|
chunk_data_list = []
|
|
937
970
|
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
|
|
@@ -940,58 +973,65 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
940
973
|
chunk_samples_df_rows = []
|
|
941
974
|
for j in range(len(chunk_maps)):
|
|
942
975
|
sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
|
|
943
|
-
chunk_sample_uids.append(sample_row[
|
|
976
|
+
chunk_sample_uids.append(sample_row["sample_uid"])
|
|
944
977
|
chunk_samples_df_rows.append(sample_row)
|
|
945
|
-
|
|
978
|
+
|
|
946
979
|
# Create a DataFrame for this chunk's samples
|
|
947
980
|
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
948
|
-
|
|
981
|
+
|
|
949
982
|
# Filter features_df for this chunk's samples and select only necessary columns
|
|
950
|
-
chunk_features_df = study.features_df.filter(
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
983
|
+
chunk_features_df = study.features_df.filter(pl.col("sample_uid").is_in(chunk_sample_uids)).select([
|
|
984
|
+
"sample_uid",
|
|
985
|
+
"rt",
|
|
986
|
+
"mz",
|
|
987
|
+
"inty",
|
|
988
|
+
"charge",
|
|
989
|
+
"feature_id",
|
|
954
990
|
])
|
|
955
|
-
|
|
991
|
+
|
|
956
992
|
# Convert DataFrames to serializable format (lists of dicts)
|
|
957
993
|
chunk_features_data = chunk_features_df.to_dicts()
|
|
958
994
|
chunk_samples_data = chunk_samples_df.to_dicts()
|
|
959
|
-
|
|
995
|
+
|
|
960
996
|
chunk_data = {
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
}
|
|
997
|
+
"chunk_start_idx": chunk_start_idx,
|
|
998
|
+
"chunk_features_data": chunk_features_data, # List of dicts instead of DataFrame
|
|
999
|
+
"chunk_samples_data": chunk_samples_data, # List of dicts instead of DataFrame
|
|
1000
|
+
"params": {
|
|
1001
|
+
"nr_partitions": params.nr_partitions,
|
|
1002
|
+
"rt_tol": params.rt_tol,
|
|
1003
|
+
"mz_tol": params.mz_tol,
|
|
1004
|
+
},
|
|
969
1005
|
}
|
|
970
1006
|
chunk_data_list.append(chunk_data)
|
|
971
|
-
|
|
1007
|
+
|
|
972
1008
|
# Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
|
|
973
1009
|
executor_class = ProcessPoolExecutor
|
|
974
1010
|
executor_name = "processes"
|
|
975
|
-
|
|
1011
|
+
|
|
976
1012
|
try:
|
|
977
1013
|
with ProcessPoolExecutor(max_workers=params.threads) as executor:
|
|
978
1014
|
# Submit all chunk processing tasks
|
|
979
|
-
future_to_chunk = {
|
|
980
|
-
|
|
981
|
-
|
|
1015
|
+
future_to_chunk = {
|
|
1016
|
+
executor.submit(_process_qt_chunk_parallel, chunk_data): i
|
|
1017
|
+
for i, chunk_data in enumerate(chunk_data_list)
|
|
1018
|
+
}
|
|
1019
|
+
|
|
982
1020
|
# Collect results with progress tracking
|
|
983
1021
|
completed_chunks = 0
|
|
984
1022
|
total_chunks = len(chunk_data_list)
|
|
985
1023
|
serialized_chunk_results = []
|
|
986
|
-
|
|
1024
|
+
|
|
987
1025
|
for future in as_completed(future_to_chunk):
|
|
988
1026
|
chunk_idx = future_to_chunk[future]
|
|
989
1027
|
try:
|
|
990
1028
|
chunk_start_idx, consensus_features = future.result()
|
|
991
1029
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
992
1030
|
completed_chunks += 1
|
|
993
|
-
n_samples_in_chunk = len(chunk_data_list[chunk_idx][
|
|
994
|
-
study.logger.info(
|
|
1031
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
|
|
1032
|
+
study.logger.info(
|
|
1033
|
+
f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
|
|
1034
|
+
)
|
|
995
1035
|
except Exception as exc:
|
|
996
1036
|
# Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
|
|
997
1037
|
if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
|
|
@@ -1000,64 +1040,75 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
|
|
|
1000
1040
|
else:
|
|
1001
1041
|
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1002
1042
|
raise exc
|
|
1003
|
-
|
|
1043
|
+
|
|
1004
1044
|
except (RuntimeError, OSError, BrokenProcessPool) as e:
|
|
1005
1045
|
# Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
|
|
1006
|
-
if (
|
|
1007
|
-
"
|
|
1046
|
+
if (
|
|
1047
|
+
"freeze_support" in str(e)
|
|
1048
|
+
or "spawn" in str(e)
|
|
1049
|
+
or "bootstrapping" in str(e)
|
|
1050
|
+
or "process pool" in str(e).lower()
|
|
1051
|
+
or "Windows multiprocessing failure" in str(e)
|
|
1052
|
+
):
|
|
1008
1053
|
study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
|
|
1009
1054
|
study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
|
|
1010
|
-
|
|
1055
|
+
|
|
1011
1056
|
with ThreadPoolExecutor(max_workers=params.threads) as executor:
|
|
1012
1057
|
# Submit all chunk processing tasks
|
|
1013
|
-
future_to_chunk = {
|
|
1014
|
-
|
|
1015
|
-
|
|
1058
|
+
future_to_chunk = {
|
|
1059
|
+
executor.submit(_process_qt_chunk_parallel, chunk_data): i
|
|
1060
|
+
for i, chunk_data in enumerate(chunk_data_list)
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1016
1063
|
# Collect results with progress tracking
|
|
1017
1064
|
completed_chunks = 0
|
|
1018
1065
|
total_chunks = len(chunk_data_list)
|
|
1019
1066
|
serialized_chunk_results = []
|
|
1020
|
-
|
|
1067
|
+
|
|
1021
1068
|
for future in as_completed(future_to_chunk):
|
|
1022
1069
|
chunk_idx = future_to_chunk[future]
|
|
1023
1070
|
try:
|
|
1024
1071
|
chunk_start_idx, consensus_features = future.result()
|
|
1025
1072
|
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1026
1073
|
completed_chunks += 1
|
|
1027
|
-
n_samples_in_chunk = len(chunk_data_list[chunk_idx][
|
|
1028
|
-
study.logger.info(
|
|
1074
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
|
|
1075
|
+
study.logger.info(
|
|
1076
|
+
f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
|
|
1077
|
+
)
|
|
1029
1078
|
except Exception as exc:
|
|
1030
1079
|
study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1031
1080
|
raise exc
|
|
1032
1081
|
else:
|
|
1033
1082
|
# Re-raise other exceptions
|
|
1034
1083
|
raise
|
|
1035
|
-
|
|
1036
|
-
# Store serialized results for _merge_chunk_results to handle directly
|
|
1084
|
+
|
|
1085
|
+
# Store serialized results for _merge_chunk_results to handle directly
|
|
1037
1086
|
chunk_consensus_maps = []
|
|
1038
1087
|
for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
|
|
1039
1088
|
# Store serialized data directly for _merge_chunk_results to handle
|
|
1040
1089
|
chunk_consensus_maps.append((chunk_start_idx, consensus_features))
|
|
1041
|
-
|
|
1042
|
-
# Merge chunk results with proper cross-chunk consensus building
|
|
1090
|
+
|
|
1091
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
1043
1092
|
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
1044
1093
|
_dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
1045
|
-
|
|
1094
|
+
|
|
1046
1095
|
# Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
|
|
1047
1096
|
consensus_map = oms.ConsensusMap()
|
|
1048
1097
|
return consensus_map
|
|
1049
1098
|
|
|
1050
1099
|
|
|
1051
|
-
def _dechunk_results(
|
|
1100
|
+
def _dechunk_results(
|
|
1101
|
+
study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None
|
|
1102
|
+
) -> None:
|
|
1052
1103
|
"""
|
|
1053
1104
|
Scalable aggregation of chunk consensus maps into final consensus_df.
|
|
1054
|
-
|
|
1105
|
+
|
|
1055
1106
|
This function implements cross-chunk consensus building by:
|
|
1056
1107
|
1. Extracting feature_uids from each chunk consensus map
|
|
1057
1108
|
2. Aggregating features close in RT/m/z across chunks
|
|
1058
1109
|
3. Building consensus_df and consensus_mapping_df directly
|
|
1059
1110
|
"""
|
|
1060
|
-
|
|
1111
|
+
|
|
1061
1112
|
if len(chunk_consensus_maps) == 1:
|
|
1062
1113
|
# Single chunk case - just extract using the true global min_samples.
|
|
1063
1114
|
# No need for permissive threshold because we are not discarding singletons pre-aggregation.
|
|
@@ -1069,19 +1120,16 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1069
1120
|
cached_valid_adducts,
|
|
1070
1121
|
)
|
|
1071
1122
|
return
|
|
1072      | -
     1123 | +
1073 1124 | # Build feature_uid to feature_data lookup for fast access
1074      | - feature_uid_map = {
1075      | -
1076      | - for row in study.features_df.iter_rows(named=True)
1077      | - }
1078      | -
     1125 | + feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in study.features_df.iter_rows(named=True)}
     1126 | +
1079 1127 | features_lookup = __merge_feature_lookup(study, study.features_df)
|
|
1080
|
-
|
|
1128
|
+
|
|
1081
1129
|
# Extract all consensus features from chunks with their feature_uids
|
|
1082
1130
|
all_chunk_consensus = []
|
|
1083
1131
|
consensus_id_counter = 0
|
|
1084
|
-
|
|
1132
|
+
|
|
1085
1133
|
for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
|
|
1086
1134
|
# Handle both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
1087
1135
|
if isinstance(chunk_data, list):
|
|
@@ -1091,45 +1139,45 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1091
1139
|
# Sequential processing: chunk_data is a ConsensusMap object
|
|
1092
1140
|
chunk_consensus_map = chunk_data
|
|
1093
1141
|
consensus_features_data = []
|
|
1094
|
-
|
|
1142
|
+
|
|
1095
1143
|
# Extract data from ConsensusMap and convert to serialized format
|
|
1096
1144
|
for consensus_feature in chunk_consensus_map:
|
|
1097
1145
|
# Extract feature_uids from this consensus feature
|
|
1098
1146
|
feature_uids = []
|
|
1099
1147
|
feature_data_list = []
|
|
1100
1148
|
sample_uids = []
|
|
1101
|
-
|
|
1149
|
+
|
|
1102
1150
|
for feature_handle in consensus_feature.getFeatureList():
|
|
1103
1151
|
fuid = str(feature_handle.getUniqueId())
|
|
1104
1152
|
if fuid not in feature_uid_map:
|
|
1105
1153
|
continue
|
|
1106
|
-
|
|
1154
|
+
|
|
1107
1155
|
feature_uid = feature_uid_map[fuid]
|
|
1108
1156
|
feature_data = features_lookup.get(feature_uid)
|
|
1109
1157
|
if feature_data:
|
|
1110
1158
|
feature_uids.append(feature_uid)
|
|
1111
1159
|
feature_data_list.append(feature_data)
|
|
1112
|
-
|
|
1160
|
+
|
|
1113
1161
|
# Use feature_uid to lookup actual sample_uid instead of chunk position
|
|
1114
|
-
actual_sample_uid = feature_data[
|
|
1162
|
+
actual_sample_uid = feature_data["sample_uid"]
|
|
1115
1163
|
sample_uids.append(actual_sample_uid)
|
|
1116
1164
|
|
|
1117
1165
|
if not feature_data_list:
|
|
1118
1166
|
# No retrievable feature metadata (possible stale map reference) -> skip
|
|
1119
1167
|
continue
|
|
1120
|
-
|
|
1168
|
+
|
|
1121
1169
|
# Convert ConsensusFeature to serialized format
|
|
1122
1170
|
consensus_feature_data = {
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1171
|
+
"rt": consensus_feature.getRT(),
|
|
1172
|
+
"mz": consensus_feature.getMZ(),
|
|
1173
|
+
"intensity": consensus_feature.getIntensity(),
|
|
1174
|
+
"quality": consensus_feature.getQuality(),
|
|
1175
|
+
"feature_uids": feature_uids,
|
|
1176
|
+
"feature_data_list": feature_data_list,
|
|
1177
|
+
"sample_uids": sample_uids,
|
|
1130
1178
|
}
|
|
1131
1179
|
consensus_features_data.append(consensus_feature_data)
|
|
1132
|
-
|
|
1180
|
+
|
|
1133
1181
|
# Process the consensus features (now all in serialized format)
|
|
1134
1182
|
for consensus_feature_data in consensus_features_data:
|
|
1135
1183
|
# For parallel processing, feature data is already extracted
|
|
@@ -1138,44 +1186,44 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1138
1186
|
feature_uids = []
|
|
1139
1187
|
feature_data_list = []
|
|
1140
1188
|
sample_uids = []
|
|
1141
|
-
|
|
1142
|
-
for handle_data in consensus_feature_data[
|
|
1143
|
-
fuid = str(handle_data[
|
|
1189
|
+
|
|
1190
|
+
for handle_data in consensus_feature_data["features"]:
|
|
1191
|
+
fuid = str(handle_data["unique_id"])
|
|
1144
1192
|
if fuid not in feature_uid_map:
|
|
1145
1193
|
continue
|
|
1146
|
-
|
|
1194
|
+
|
|
1147
1195
|
feature_uid = feature_uid_map[fuid]
|
|
1148
1196
|
feature_data = features_lookup.get(feature_uid)
|
|
1149
1197
|
if feature_data:
|
|
1150
1198
|
feature_uids.append(feature_uid)
|
|
1151
1199
|
feature_data_list.append(feature_data)
|
|
1152
|
-
|
|
1200
|
+
|
|
1153
1201
|
# Use feature_uid to lookup actual sample_uid instead of chunk position
|
|
1154
|
-
actual_sample_uid = feature_data[
|
|
1202
|
+
actual_sample_uid = feature_data["sample_uid"]
|
|
1155
1203
|
sample_uids.append(actual_sample_uid)
|
|
1156
|
-
|
|
1204
|
+
|
|
1157
1205
|
if not feature_data_list:
|
|
1158
1206
|
continue
|
|
1159
|
-
|
|
1207
|
+
|
|
1160
1208
|
# Get RT/MZ from consensus feature data
|
|
1161
|
-
consensus_rt = consensus_feature_data[
|
|
1162
|
-
consensus_mz = consensus_feature_data[
|
|
1163
|
-
consensus_intensity = consensus_feature_data[
|
|
1164
|
-
consensus_quality = consensus_feature_data[
|
|
1209
|
+
consensus_rt = consensus_feature_data["rt"]
|
|
1210
|
+
consensus_mz = consensus_feature_data["mz"]
|
|
1211
|
+
consensus_intensity = consensus_feature_data["intensity"]
|
|
1212
|
+
consensus_quality = consensus_feature_data["quality"]
|
|
1165
1213
|
else:
|
|
1166
1214
|
# Sequential processing: data is already extracted above
|
|
1167
|
-
feature_uids = consensus_feature_data[
|
|
1168
|
-
feature_data_list = consensus_feature_data[
|
|
1169
|
-
sample_uids = consensus_feature_data[
|
|
1170
|
-
consensus_rt = consensus_feature_data[
|
|
1171
|
-
consensus_mz = consensus_feature_data[
|
|
1172
|
-
consensus_intensity = consensus_feature_data[
|
|
1173
|
-
consensus_quality = consensus_feature_data[
|
|
1215
|
+
feature_uids = consensus_feature_data["feature_uids"]
|
|
1216
|
+
feature_data_list = consensus_feature_data["feature_data_list"]
|
|
1217
|
+
sample_uids = consensus_feature_data["sample_uids"]
|
|
1218
|
+
consensus_rt = consensus_feature_data["rt"]
|
|
1219
|
+
consensus_mz = consensus_feature_data["mz"]
|
|
1220
|
+
consensus_intensity = consensus_feature_data["intensity"]
|
|
1221
|
+
consensus_quality = consensus_feature_data["quality"]
|
|
1174
1222
|
|
|
1175
1223
|
if not feature_data_list:
|
|
1176
1224
|
# No retrievable feature metadata (possible stale map reference) -> skip
|
|
1177
1225
|
continue
|
|
1178
|
-
|
|
1226
|
+
|
|
1179
1227
|
# Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
|
|
1180
1228
|
rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
|
|
1181
1229
|
mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
|
|
@@ -1189,30 +1237,31 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1189
1237
|
mz_max_local = max(mz_vals_local)
|
|
1190
1238
|
else:
|
|
1191
1239
|
mz_min_local = mz_max_local = consensus_mz
|
|
1192
|
-
|
|
1240
|
+
|
|
1193
1241
|
# Store chunk consensus with feature tracking
|
|
1194
1242
|
# Generate unique 16-character consensus_id string
|
|
1195
1243
|
import uuid
|
|
1196
|
-
|
|
1197
|
-
|
|
1244
|
+
|
|
1245
|
+
consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
|
|
1246
|
+
|
|
1198
1247
|
chunk_consensus_data = {
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1248
|
+
"consensus_id": consensus_id_str,
|
|
1249
|
+
"chunk_idx": chunk_idx,
|
|
1250
|
+
"chunk_start_idx": chunk_start_idx,
|
|
1251
|
+
"mz": consensus_mz,
|
|
1252
|
+
"rt": consensus_rt,
|
|
1253
|
+
"mz_min": mz_min_local,
|
|
1254
|
+
"mz_max": mz_max_local,
|
|
1255
|
+
"rt_min": rt_min_local,
|
|
1256
|
+
"rt_max": rt_max_local,
|
|
1257
|
+
"intensity": consensus_intensity,
|
|
1258
|
+
"quality": consensus_quality,
|
|
1259
|
+
"feature_uids": feature_uids,
|
|
1260
|
+
"feature_data_list": feature_data_list,
|
|
1261
|
+
"sample_uids": sample_uids,
|
|
1262
|
+
"sample_count": len(feature_data_list),
|
|
1214
1263
|
}
|
|
1215
|
-
|
|
1264
|
+
|
|
1216
1265
|
all_chunk_consensus.append(chunk_consensus_data)
|
|
1217
1266
|
|
|
1218
1267
|
if not all_chunk_consensus:
|
|
@@ -1220,37 +1269,38 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1220
1269
|
study.consensus_df = pl.DataFrame()
|
|
1221
1270
|
study.consensus_mapping_df = pl.DataFrame()
|
|
1222
1271
|
return
|
|
1223
|
-
|
|
1272
|
+
|
|
1224
1273
|
# CROSS-CHUNK DECHUNKING ALGORITHMS
|
|
1225
1274
|
# Multiple algorithms available for combining chunk results
|
|
1226
|
-
|
|
1275
|
+
|
|
1227
1276
|
class HierarchicalAnchorMerger:
|
|
1228
1277
|
"""
|
|
1229
1278
|
Hierarchical Anchor Merger: Comprehensive cross-chunk feature preservation.
|
|
1230
1279
|
Uses Union-Find clustering for transitive matching across multiple chunks.
|
|
1231
1280
|
"""
|
|
1281
|
+
|
|
1232
1282
|
def __init__(self, rt_tol: float, mz_tol: float):
|
|
1233
1283
|
self.rt_tol = rt_tol
|
|
1234
1284
|
self.mz_tol = mz_tol
|
|
1235
|
-
|
|
1285
|
+
|
|
1236
1286
|
def merge(self, chunk_consensus_list: list) -> list:
|
|
1237
1287
|
"""Fixed hierarchical merging with union-find clustering for complete feature preservation"""
|
|
1238
1288
|
if not chunk_consensus_list:
|
|
1239
1289
|
return []
|
|
1240
|
-
|
|
1290
|
+
|
|
1241
1291
|
study.logger.debug(f"FIXED HierarchicalAnchorMerger: processing {len(chunk_consensus_list)} chunk features")
|
|
1242
|
-
|
|
1292
|
+
|
|
1243
1293
|
# Union-Find data structure for transitive clustering
|
|
1244
1294
|
class UnionFind:
|
|
1245
1295
|
def __init__(self, n):
|
|
1246
1296
|
self.parent = list(range(n))
|
|
1247
1297
|
self.rank = [0] * n
|
|
1248
|
-
|
|
1298
|
+
|
|
1249
1299
|
def find(self, x):
|
|
1250
1300
|
if self.parent[x] != x:
|
|
1251
1301
|
self.parent[x] = self.find(self.parent[x]) # Path compression
|
|
1252
1302
|
return self.parent[x]
|
|
1253
|
-
|
|
1303
|
+
|
|
1254
1304
|
def union(self, x, y):
|
|
1255
1305
|
px, py = self.find(x), self.find(y)
|
|
1256
1306
|
if px == py:
|
|
@@ -1262,55 +1312,55 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1262
1312
|
if self.rank[px] == self.rank[py]:
|
|
1263
1313
|
self.rank[px] += 1
|
|
1264
1314
|
return True # Union was performed
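The UnionFind helper added above uses path compression in find() and union by rank in union(), which is what makes the cross-chunk matching transitive (if A matches B and B matches C, all three end up in one cluster). A self-contained sketch of the same structure, for reference; the example usage at the bottom is illustrative, not from the package:

```python
class UnionFind:
    """Disjoint-set with path compression and union by rank, as used for transitive clustering."""
    def __init__(self, n: int):
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x: int) -> int:
        # Path compression: point x directly at its root.
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x: int, y: int) -> bool:
        px, py = self.find(x), self.find(y)
        if px == py:
            return False  # already in the same cluster
        # Union by rank keeps the trees shallow.
        if self.rank[px] < self.rank[py]:
            px, py = py, px
        self.parent[py] = px
        if self.rank[px] == self.rank[py]:
            self.rank[px] += 1
        return True  # union was performed

uf = UnionFind(4)
uf.union(0, 1)
uf.union(1, 2)
assert uf.find(0) == uf.find(2)  # transitivity: 0 and 2 share a root
```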
|
|
1265
|
-
|
|
1315
|
+
|
|
1266
1316
|
n_features = len(chunk_consensus_list)
|
|
1267
1317
|
uf = UnionFind(n_features)
|
|
1268
1318
|
merges_made = 0
|
|
1269
|
-
|
|
1319
|
+
|
|
1270
1320
|
# Optimized cross-chunk feature matching using KD-tree spatial indexing
|
|
1271
|
-
|
|
1321
|
+
|
|
1272
1322
|
# Proper dimensional scaling for RT vs m/z
|
|
1273
|
-
rt_scale = 1.0
|
|
1323
|
+
rt_scale = 1.0 # RT in seconds (1-30 min range)
|
|
1274
1324
|
mz_scale = 100.0 # m/z in Da (100-1000 range) - scale to match RT magnitude
|
|
1275
|
-
|
|
1325
|
+
|
|
1276
1326
|
# Build spatial index with scaled coordinates
|
|
1277
|
-
points = np.array([[f[
|
|
1327
|
+
points = np.array([[f["rt"] * rt_scale, f["mz"] * mz_scale] for f in chunk_consensus_list])
|
|
1278
1328
|
tree = cKDTree(points, balanced_tree=True, compact_nodes=True)
|
|
1279
|
-
|
|
1329
|
+
|
|
1280
1330
|
# Calculate proper Euclidean radius in scaled space
|
|
1281
1331
|
scaled_rt_tol = self.rt_tol * rt_scale
|
|
1282
|
-
scaled_mz_tol = self.mz_tol * mz_scale
|
|
1332
|
+
scaled_mz_tol = self.mz_tol * mz_scale
|
|
1283
1333
|
radius = np.sqrt(scaled_rt_tol**2 + scaled_mz_tol**2)
|
|
1284
|
-
|
|
1334
|
+
|
|
1285
1335
|
# Efficient neighbor search for feature matching
|
|
1286
1336
|
for i in range(n_features):
|
|
1287
1337
|
feature_i = chunk_consensus_list[i]
|
|
1288
|
-
chunk_i = feature_i.get(
|
|
1289
|
-
|
|
1338
|
+
chunk_i = feature_i.get("chunk_idx", -1)
|
|
1339
|
+
|
|
1290
1340
|
# Query spatial index for nearby features
|
|
1291
1341
|
neighbor_indices = tree.query_ball_point(points[i], r=radius, p=2)
|
|
1292
|
-
|
|
1342
|
+
|
|
1293
1343
|
for j in neighbor_indices:
|
|
1294
1344
|
if i >= j: # Skip duplicates and self
|
|
1295
1345
|
continue
|
|
1296
|
-
|
|
1346
|
+
|
|
1297
1347
|
feature_j = chunk_consensus_list[j]
|
|
1298
|
-
chunk_j = feature_j.get(
|
|
1299
|
-
|
|
1348
|
+
chunk_j = feature_j.get("chunk_idx", -1)
|
|
1349
|
+
|
|
1300
1350
|
# Skip features from same chunk (already clustered within chunk)
|
|
1301
1351
|
if chunk_i == chunk_j:
|
|
1302
1352
|
continue
|
|
1303
|
-
|
|
1353
|
+
|
|
1304
1354
|
# Verify with precise original tolerances (more accurate than scaled)
|
|
1305
|
-
rt_diff = abs(feature_i[
|
|
1306
|
-
mz_diff = abs(feature_i[
|
|
1307
|
-
|
|
1355
|
+
rt_diff = abs(feature_i["rt"] - feature_j["rt"])
|
|
1356
|
+
mz_diff = abs(feature_i["mz"] - feature_j["mz"])
|
|
1357
|
+
|
|
1308
1358
|
if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
|
|
1309
1359
|
if uf.union(i, j): # Merge if not already connected
|
|
1310
1360
|
merges_made += 1
|
|
1311
|
-
|
|
1361
|
+
|
|
1312
1362
|
study.logger.debug(f"FIXED HierarchicalAnchorMerger: made {merges_made} cross-chunk merges")
|
|
1313
|
-
|
|
1363
|
+
|
|
1314
1364
|
# Group features by their connected component
|
|
1315
1365
|
clusters = {}
|
|
1316
1366
|
for i in range(n_features):
|
|
@@ -1318,190 +1368,196 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1318
1368
|
if root not in clusters:
|
|
1319
1369
|
clusters[root] = []
|
|
1320
1370
|
clusters[root].append(chunk_consensus_list[i])
|
|
1321
|
-
|
|
1371
|
+
|
|
1322
1372
|
# Merge each cluster into a single consensus feature
|
|
1323
1373
|
result = []
|
|
1324
1374
|
for cluster_features in clusters.values():
|
|
1325
1375
|
merged = self._merge_cluster(cluster_features)
|
|
1326
1376
|
result.append(merged)
|
|
1327
|
-
|
|
1328
|
-
study.logger.debug(
|
|
1329
|
-
|
|
1377
|
+
|
|
1378
|
+
study.logger.debug(
|
|
1379
|
+
f"FIXED HierarchicalAnchorMerger: output {len(result)} merged features (from {n_features} inputs)"
|
|
1380
|
+
)
|
|
1381
|
+
|
|
1330
1382
|
# VERIFICATION: Ensure we haven't lost features
|
|
1331
1383
|
if len(result) > len(chunk_consensus_list):
|
|
1332
|
-
study.logger.warning(
|
|
1333
|
-
|
|
1384
|
+
study.logger.warning(
|
|
1385
|
+
f"FIXED HierarchicalAnchorMerger: More outputs than inputs ({len(result)} > {n_features})"
|
|
1386
|
+
)
|
|
1387
|
+
|
|
1334
1388
|
return result
|
|
1335
|
-
|
|
1389
|
+
|
|
1336
1390
|
def _merge_cluster(self, cluster: list) -> dict:
|
|
1337
1391
|
"""Merge cluster using sample-weighted consensus with robust error handling"""
|
|
1338
1392
|
if len(cluster) == 1:
|
|
1339
1393
|
return cluster[0] # No merging needed for single feature
|
|
1340
|
-
|
|
1394
|
+
|
|
1341
1395
|
# Calculate weights robustly to prevent division by zero
|
|
1342
1396
|
weights = []
|
|
1343
1397
|
for c in cluster:
|
|
1344
|
-
sample_count = c.get(
|
|
1398
|
+
sample_count = c.get("sample_count", 0)
|
|
1345
1399
|
# Use minimum weight of 1 to prevent zero weights
|
|
1346
1400
|
weights.append(max(sample_count, 1))
|
|
1347
|
-
|
|
1401
|
+
|
|
1348
1402
|
total_weight = sum(weights)
|
|
1349
1403
|
# Fallback for edge cases
|
|
1350
1404
|
if total_weight == 0:
|
|
1351
1405
|
total_weight = len(cluster)
|
|
1352
1406
|
weights = [1] * len(cluster)
|
|
1353
|
-
|
|
1407
|
+
|
|
1354
1408
|
# Weighted consensus for RT/mz coordinates
|
|
1355
1409
|
merged = {
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1410
|
+
"consensus_id": cluster[0]["consensus_id"], # Use first feature's ID
|
|
1411
|
+
"chunk_indices": [c.get("chunk_idx", 0) for c in cluster],
|
|
1412
|
+
"mz": sum(c["mz"] * w for c, w in zip(cluster, weights)) / total_weight,
|
|
1413
|
+
"rt": sum(c["rt"] * w for c, w in zip(cluster, weights)) / total_weight,
|
|
1414
|
+
"intensity": sum(c.get("intensity", 0) for c in cluster),
|
|
1415
|
+
"quality": sum(c.get("quality", 1) * w for c, w in zip(cluster, weights)) / total_weight,
|
|
1416
|
+
"feature_uids": [],
|
|
1417
|
+
"feature_data_list": [],
|
|
1418
|
+
"sample_uids": [],
|
|
1419
|
+
"sample_count": 0,
|
|
1366
1420
|
}
|
|
1367
|
-
|
|
1421
|
+
|
|
1368
1422
|
# Aggregate all features and samples from all chunks
|
|
1369
1423
|
all_feature_uids = []
|
|
1370
1424
|
all_feature_data = []
|
|
1371
1425
|
all_sample_uids = []
|
|
1372
|
-
|
|
1426
|
+
|
|
1373
1427
|
for chunk in cluster:
|
|
1374
1428
|
# Collect feature UIDs
|
|
1375
|
-
chunk_feature_uids = chunk.get(
|
|
1429
|
+
chunk_feature_uids = chunk.get("feature_uids", [])
|
|
1376
1430
|
all_feature_uids.extend(chunk_feature_uids)
|
|
1377
|
-
|
|
1431
|
+
|
|
1378
1432
|
# Collect feature data
|
|
1379
|
-
chunk_feature_data = chunk.get(
|
|
1433
|
+
chunk_feature_data = chunk.get("feature_data_list", [])
|
|
1380
1434
|
all_feature_data.extend(chunk_feature_data)
|
|
1381
|
-
|
|
1435
|
+
|
|
1382
1436
|
# Collect sample UIDs
|
|
1383
|
-
chunk_sample_uids = chunk.get(
|
|
1437
|
+
chunk_sample_uids = chunk.get("sample_uids", [])
|
|
1384
1438
|
all_sample_uids.extend(chunk_sample_uids)
|
|
1385
|
-
|
|
1439
|
+
|
|
1386
1440
|
# Remove duplicates properly and count unique samples
|
|
1387
|
-
merged[
|
|
1388
|
-
merged[
|
|
1389
|
-
merged[
|
|
1390
|
-
merged[
|
|
1391
|
-
|
|
1441
|
+
merged["feature_uids"] = list(set(all_feature_uids))
|
|
1442
|
+
merged["feature_data_list"] = all_feature_data # Keep all feature data
|
|
1443
|
+
merged["sample_uids"] = list(set(all_sample_uids)) # Unique sample UIDs only
|
|
1444
|
+
merged["sample_count"] = len(merged["sample_uids"]) # Count of unique samples
|
|
1445
|
+
|
|
1392
1446
|
return merged
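_merge_cluster above averages RT/m/z with sample-count weights (floored at 1 to avoid zero weights), sums intensities, and deduplicates feature and sample UIDs. A small hedged example of the weighting step only, with made-up numbers:

```python
def weighted_center(cluster):
    """Sample-count-weighted RT/mz for a cluster of chunk consensus dicts (minimum weight 1)."""
    weights = [max(c.get("sample_count", 0), 1) for c in cluster]
    total = sum(weights) or len(cluster)
    rt = sum(c["rt"] * w for c, w in zip(cluster, weights)) / total
    mz = sum(c["mz"] * w for c, w in zip(cluster, weights)) / total
    return rt, mz

# A chunk feature seen in 9 samples pulls the merged coordinates toward itself:
rt, mz = weighted_center([
    {"rt": 100.0, "mz": 300.10, "sample_count": 9},
    {"rt": 102.0, "mz": 300.12, "sample_count": 1},
])
# rt ~= 100.2, mz ~= 300.102
```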
|
|
1393
|
-
|
|
1447
|
+
|
|
1394
1448
|
class KDTreeSpatialMerger:
|
|
1395
1449
|
"""
|
|
1396
1450
|
KD-Tree Spatial Merger: Optimized for high-sample features.
|
|
1397
1451
|
"""
|
|
1452
|
+
|
|
1398
1453
|
def __init__(self, rt_tol: float, mz_tol: float):
|
|
1399
1454
|
self.rt_tol = rt_tol
|
|
1400
1455
|
self.mz_tol = mz_tol
|
|
1401
|
-
|
|
1456
|
+
|
|
1402
1457
|
def merge(self, chunk_consensus_list: list) -> list:
|
|
1403
1458
|
"""KD-tree based spatial merging"""
|
|
1404
1459
|
if not chunk_consensus_list:
|
|
1405
1460
|
return []
|
|
1406
|
-
|
|
1461
|
+
|
|
1407
1462
|
try:
|
|
1408
1463
|
from scipy.spatial import cKDTree
|
|
1409
1464
|
import numpy as np
|
|
1410
1465
|
except ImportError:
|
|
1411
1466
|
# Fallback to simple clustering if scipy not available
|
|
1412
1467
|
return self._fallback_merge(chunk_consensus_list)
|
|
1413
|
-
|
|
1468
|
+
|
|
1414
1469
|
# Build spatial index
|
|
1415
|
-
points = np.array([[c[
|
|
1470
|
+
points = np.array([[c["rt"], c["mz"]] for c in chunk_consensus_list])
|
|
1416
1471
|
tree = cKDTree(points)
|
|
1417
|
-
|
|
1472
|
+
|
|
1418
1473
|
# Scale tolerances for KD-tree query
|
|
1419
1474
|
rt_scale = 1.0 / self.rt_tol if self.rt_tol > 0 else 1.0
|
|
1420
1475
|
mz_scale = 1.0 / self.mz_tol if self.mz_tol > 0 else 1.0
|
|
1421
1476
|
scaled_points = points * np.array([rt_scale, mz_scale])
|
|
1422
1477
|
scaled_tree = cKDTree(scaled_points)
|
|
1423
|
-
|
|
1478
|
+
|
|
1424
1479
|
clusters = []
|
|
1425
1480
|
used = set()
|
|
1426
|
-
|
|
1481
|
+
|
|
1427
1482
|
# Priority processing for high-sample features
|
|
1428
|
-
high_sample_indices = [i for i, c in enumerate(chunk_consensus_list) if c[
|
|
1483
|
+
high_sample_indices = [i for i, c in enumerate(chunk_consensus_list) if c["sample_count"] >= 100]
|
|
1429
1484
|
remaining_indices = [i for i in range(len(chunk_consensus_list)) if i not in high_sample_indices]
|
|
1430
|
-
|
|
1485
|
+
|
|
1431
1486
|
for idx in high_sample_indices + remaining_indices:
|
|
1432
1487
|
if idx in used:
|
|
1433
1488
|
continue
|
|
1434
|
-
|
|
1489
|
+
|
|
1435
1490
|
# Find neighbors in scaled space
|
|
1436
1491
|
neighbors = scaled_tree.query_ball_point(scaled_points[idx], r=1.0)
|
|
1437
1492
|
cluster_indices = [i for i in neighbors if i not in used and i != idx]
|
|
1438
1493
|
cluster_indices.append(idx)
|
|
1439
|
-
|
|
1494
|
+
|
|
1440
1495
|
if cluster_indices:
|
|
1441
1496
|
cluster = [chunk_consensus_list[i] for i in cluster_indices]
|
|
1442
1497
|
clusters.append(self._merge_cluster(cluster))
|
|
1443
1498
|
used.update(cluster_indices)
|
|
1444
|
-
|
|
1499
|
+
|
|
1445
1500
|
return clusters
|
|
1446
|
-
|
|
1501
|
+
|
|
1447
1502
|
def _fallback_merge(self, chunk_consensus_list: list) -> list:
|
|
1448
1503
|
"""Simple distance-based fallback when scipy unavailable"""
|
|
1449
1504
|
clusters = []
|
|
1450
1505
|
used = set()
|
|
1451
|
-
|
|
1506
|
+
|
|
1452
1507
|
for i, anchor in enumerate(chunk_consensus_list):
|
|
1453
1508
|
if i in used:
|
|
1454
1509
|
continue
|
|
1455
|
-
|
|
1510
|
+
|
|
1456
1511
|
cluster = [anchor]
|
|
1457
1512
|
used.add(i)
|
|
1458
|
-
|
|
1513
|
+
|
|
1459
1514
|
for j, candidate in enumerate(chunk_consensus_list):
|
|
1460
1515
|
if j in used or j == i:
|
|
1461
1516
|
continue
|
|
1462
|
-
|
|
1463
|
-
rt_diff = abs(candidate[
|
|
1464
|
-
mz_diff = abs(candidate[
|
|
1465
|
-
|
|
1517
|
+
|
|
1518
|
+
rt_diff = abs(candidate["rt"] - anchor["rt"])
|
|
1519
|
+
mz_diff = abs(candidate["mz"] - anchor["mz"])
|
|
1520
|
+
|
|
1466
1521
|
if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
|
|
1467
1522
|
cluster.append(candidate)
|
|
1468
1523
|
used.add(j)
|
|
1469
|
-
|
|
1524
|
+
|
|
1470
1525
|
clusters.append(self._merge_cluster(cluster))
|
|
1471
|
-
|
|
1526
|
+
|
|
1472
1527
|
return clusters
|
|
1473
|
-
|
|
1528
|
+
|
|
1474
1529
|
def _merge_cluster(self, cluster: list) -> dict:
|
|
1475
1530
|
"""Merge cluster with intensity-weighted consensus"""
|
|
1476
1531
|
if len(cluster) == 1:
|
|
1477
1532
|
return cluster[0]
|
|
1478
|
-
|
|
1533
|
+
|
|
1479
1534
|
# Weight by intensity for spatial accuracy
|
|
1480
|
-
total_intensity = sum(c[
|
|
1481
|
-
|
|
1535
|
+
total_intensity = sum(c["intensity"] for c in cluster)
|
|
1536
|
+
|
|
1482
1537
|
merged = {
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1538
|
+
"consensus_id": cluster[0]["consensus_id"],
|
|
1539
|
+
"chunk_indices": [c["chunk_idx"] for c in cluster],
|
|
1540
|
+
"mz": sum(c["mz"] * c["intensity"] for c in cluster) / total_intensity,
|
|
1541
|
+
"rt": sum(c["rt"] * c["intensity"] for c in cluster) / total_intensity,
|
|
1542
|
+
"intensity": total_intensity,
|
|
1543
|
+
"quality": sum(c["quality"] for c in cluster) / len(cluster),
|
|
1544
|
+
"feature_uids": [],
|
|
1545
|
+
"feature_data_list": [],
|
|
1546
|
+
"sample_uids": [],
|
|
1547
|
+
"sample_count": 0,
|
|
1493
1548
|
}
|
|
1494
|
-
|
|
1549
|
+
|
|
1495
1550
|
# Aggregate features
|
|
1496
1551
|
for chunk in cluster:
|
|
1497
|
-
merged[
|
|
1498
|
-
merged[
|
|
1499
|
-
merged[
|
|
1500
|
-
|
|
1501
|
-
merged[
|
|
1502
|
-
merged[
|
|
1503
|
-
|
|
1552
|
+
merged["feature_uids"].extend(chunk["feature_uids"])
|
|
1553
|
+
merged["feature_data_list"].extend(chunk["feature_data_list"])
|
|
1554
|
+
merged["sample_uids"].extend(chunk["sample_uids"])
|
|
1555
|
+
|
|
1556
|
+
merged["feature_uids"] = list(set(merged["feature_uids"]))
|
|
1557
|
+
merged["sample_count"] = len(set(merged["sample_uids"]))
|
|
1558
|
+
|
|
1504
1559
|
return merged
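Unlike the hierarchical merger's sample-count weighting, the KD-tree merger above weights coordinates by intensity, so the brightest chunk feature dominates the merged RT/m/z. A hedged sketch of that choice (values illustrative; like the original, it assumes the cluster has non-zero total intensity):

```python
def intensity_weighted_center(cluster):
    """Intensity-weighted RT/mz, mirroring the KD-tree merger's choice of weights."""
    total_intensity = sum(c["intensity"] for c in cluster)
    rt = sum(c["rt"] * c["intensity"] for c in cluster) / total_intensity
    mz = sum(c["mz"] * c["intensity"] for c in cluster) / total_intensity
    return rt, mz, total_intensity

rt, mz, inten = intensity_weighted_center([
    {"rt": 100.0, "mz": 300.10, "intensity": 9e5},
    {"rt": 102.0, "mz": 300.12, "intensity": 1e5},
])
# rt ~= 100.2: the more intense feature dominates the merged coordinates
```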
|
|
1560
|
+
|
|
1505
1561
|
# SELECT DECHUNKING ALGORITHM BASED ON PARAMETER
|
|
1506
1562
|
if params.dechunking == "hierarchical":
|
|
1507
1563
|
merger = HierarchicalAnchorMerger(params.rt_tol, params.mz_tol)
|
|
@@ -1523,7 +1579,7 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1523
1579
|
for group in refined_groups:
|
|
1524
1580
|
if not group:
|
|
1525
1581
|
continue
|
|
1526
|
-
|
|
1582
|
+
|
|
1527
1583
|
# Aggregate underlying feature data (deduplicated by feature_uid)
|
|
1528
1584
|
feature_data_acc = {}
|
|
1529
1585
|
sample_uids_acc = set()
|
|
@@ -1533,25 +1589,25 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1533
1589
|
quality_values_chunk = []
|
|
1534
1590
|
|
|
1535
1591
|
for cf in group:
|
|
1536
|
-
rt_values_chunk.append(cf[
|
|
1537
|
-
mz_values_chunk.append(cf[
|
|
1538
|
-
intensity_values_chunk.append(cf.get(
|
|
1539
|
-
quality_values_chunk.append(cf.get(
|
|
1540
|
-
|
|
1541
|
-
for fd, samp_uid in zip(cf[
|
|
1542
|
-
fid = fd.get(
|
|
1592
|
+
rt_values_chunk.append(cf["rt"])
|
|
1593
|
+
mz_values_chunk.append(cf["mz"])
|
|
1594
|
+
intensity_values_chunk.append(cf.get("intensity", 0.0) or 0.0)
|
|
1595
|
+
quality_values_chunk.append(cf.get("quality", 1.0) or 1.0)
|
|
1596
|
+
|
|
1597
|
+
for fd, samp_uid in zip(cf["feature_data_list"], cf["sample_uids"]):
|
|
1598
|
+
fid = fd.get("feature_uid") or fd.get("uid") or fd.get("feature_id")
|
|
1543
1599
|
# feature_uid expected in fd under 'feature_uid'; fallback attempts just in case
|
|
1544
1600
|
if fid is None:
|
|
1545
1601
|
continue
|
|
1546
1602
|
if fid not in feature_data_acc:
|
|
1547
1603
|
feature_data_acc[fid] = fd
|
|
1548
1604
|
sample_uids_acc.add(samp_uid)
|
|
1549
|
-
|
|
1605
|
+
|
|
1550
1606
|
if not feature_data_acc:
|
|
1551
1607
|
continue
|
|
1552
1608
|
|
|
1553
1609
|
number_samples = len(sample_uids_acc)
|
|
1554
|
-
|
|
1610
|
+
|
|
1555
1611
|
# This allows proper cross-chunk consensus building before final filtering
|
|
1556
1612
|
|
|
1557
1613
|
metadata = _calculate_consensus_statistics(
|
|
@@ -1567,46 +1623,46 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1567
1623
|
cached_adducts_df=cached_adducts_df,
|
|
1568
1624
|
cached_valid_adducts=cached_valid_adducts,
|
|
1569
1625
|
)
|
|
1570
|
-
|
|
1626
|
+
|
|
1571
1627
|
# Validate RT and m/z spread don't exceed tolerance limits
|
|
1572
|
-
rt_spread = metadata.get(
|
|
1573
|
-
mz_spread = metadata.get(
|
|
1628
|
+
rt_spread = metadata.get("rt_max", 0) - metadata.get("rt_min", 0)
|
|
1629
|
+
mz_spread = metadata.get("mz_max", 0) - metadata.get("mz_min", 0)
|
|
1574
1630
|
max_allowed_rt_spread = params.rt_tol * 2 # Allow 2x tolerance for chunked method
|
|
1575
1631
|
max_allowed_mz_spread = params.mz_tol * 2 # Enforce strict m/z spread limit
|
|
1576
|
-
|
|
1632
|
+
|
|
1577
1633
|
skip_feature = False
|
|
1578
1634
|
skip_reason = ""
|
|
1579
|
-
|
|
1635
|
+
|
|
1580
1636
|
if rt_spread > max_allowed_rt_spread:
|
|
1581
1637
|
skip_feature = True
|
|
1582
1638
|
skip_reason = f"RT spread {rt_spread:.3f}s > {max_allowed_rt_spread:.3f}s"
|
|
1583
|
-
|
|
1639
|
+
|
|
1584
1640
|
if mz_spread > max_allowed_mz_spread:
|
|
1585
1641
|
skip_feature = True
|
|
1586
1642
|
if skip_reason:
|
|
1587
1643
|
skip_reason += f" AND m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
|
|
1588
1644
|
else:
|
|
1589
1645
|
skip_reason = f"m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
|
|
1590
|
-
|
|
1646
|
+
|
|
1591
1647
|
if skip_feature:
|
|
1592
1648
|
# Skip consensus features with excessive spread
|
|
1593
1649
|
study.logger.debug(f"Skipping consensus feature {consensus_uid_counter}: {skip_reason}")
|
|
1594
1650
|
consensus_uid_counter += 1
|
|
1595
1651
|
continue
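The block above drops any aggregated consensus feature whose RT or m/z spread exceeds twice the merge tolerance. A hedged helper capturing the same check, useful for seeing which groups would be rejected; the thresholds in the example are assumptions:

```python
def exceeds_spread_limits(metadata, rt_tol, mz_tol, factor=2.0):
    """Return a skip reason if the RT/mz spread exceeds factor * tolerance, else None."""
    rt_spread = metadata.get("rt_max", 0) - metadata.get("rt_min", 0)
    mz_spread = metadata.get("mz_max", 0) - metadata.get("mz_min", 0)
    reasons = []
    if rt_spread > factor * rt_tol:
        reasons.append(f"RT spread {rt_spread:.3f}s > {factor * rt_tol:.3f}s")
    if mz_spread > factor * mz_tol:
        reasons.append(f"m/z spread {mz_spread:.4f} Da > {factor * mz_tol:.4f} Da")
    return " AND ".join(reasons) or None

# With rt_tol=5 s and mz_tol=0.01 Da, a group 0.05 Da wide is rejected:
print(exceeds_spread_limits({"rt_min": 100, "rt_max": 103, "mz_min": 300.10, "mz_max": 300.15}, 5.0, 0.01))
```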
|
|
1596
|
-
|
|
1652
|
+
|
|
1597
1653
|
consensus_metadata.append(metadata)
|
|
1598
1654
|
|
|
1599
1655
|
# Build mapping rows (deduplicated)
|
|
1600
1656
|
for fid, fd in feature_data_acc.items():
|
|
1601
|
-
samp_uid = fd.get(
|
|
1602
|
-
|
|
1657
|
+
samp_uid = fd.get("sample_uid") or fd.get("sample_id") or fd.get("sample")
|
|
1658
|
+
|
|
1603
1659
|
# If absent we attempt to derive from original group sample_uids pairing
|
|
1604
1660
|
# but most feature_data rows should include sample_uid already.
|
|
1605
1661
|
if samp_uid is None:
|
|
1606
1662
|
# fallback: search for cf containing this fid
|
|
1607
1663
|
for cf in group:
|
|
1608
|
-
for fd2, samp2 in zip(cf[
|
|
1609
|
-
f2id = fd2.get(
|
|
1664
|
+
for fd2, samp2 in zip(cf["feature_data_list"], cf["sample_uids"]):
|
|
1665
|
+
f2id = fd2.get("feature_uid") or fd2.get("uid") or fd2.get("feature_id")
|
|
1610
1666
|
if f2id == fid:
|
|
1611
1667
|
samp_uid = samp2
|
|
1612
1668
|
break
|
|
@@ -1615,9 +1671,9 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1615
1671
|
if samp_uid is None:
|
|
1616
1672
|
continue
|
|
1617
1673
|
consensus_mapping_list.append({
|
|
1618
|
-
|
|
1619
|
-
|
|
1620
|
-
|
|
1674
|
+
"consensus_uid": consensus_uid_counter,
|
|
1675
|
+
"sample_uid": samp_uid,
|
|
1676
|
+
"feature_uid": fid,
|
|
1621
1677
|
})
|
|
1622
1678
|
|
|
1623
1679
|
consensus_uid_counter += 1
|
|
@@ -1628,9 +1684,9 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1628
1684
|
|
|
1629
1685
|
# Ensure mapping only contains features from retained consensus_df
|
|
1630
1686
|
if len(study.consensus_df) > 0:
|
|
1631
|
-
valid_consensus_ids = set(study.consensus_df[
|
|
1687
|
+
valid_consensus_ids = set(study.consensus_df["consensus_uid"].to_list())
|
|
1632
1688
|
study.consensus_mapping_df = study.consensus_mapping_df.filter(
|
|
1633
|
-
pl.col(
|
|
1689
|
+
pl.col("consensus_uid").is_in(list(valid_consensus_ids))
|
|
1634
1690
|
)
|
|
1635
1691
|
else:
|
|
1636
1692
|
study.consensus_mapping_df = pl.DataFrame()
|
|
@@ -1640,28 +1696,36 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
|
|
|
1640
1696
|
return
|
|
1641
1697
|
|
|
1642
1698
|
|
|
1643
|
-
def _calculate_consensus_statistics(
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1699
|
+
def _calculate_consensus_statistics(
|
|
1700
|
+
study_obj,
|
|
1701
|
+
consensus_uid: int,
|
|
1702
|
+
feature_data_list: list,
|
|
1703
|
+
rt_values: list,
|
|
1704
|
+
mz_values: list,
|
|
1705
|
+
intensity_values: list,
|
|
1706
|
+
quality_values: list,
|
|
1707
|
+
number_features: int | None = None,
|
|
1708
|
+
number_samples: int | None = None,
|
|
1709
|
+
cached_adducts_df=None,
|
|
1710
|
+
cached_valid_adducts=None,
|
|
1711
|
+
) -> dict:
|
|
1648
1712
|
"""
|
|
1649
1713
|
Calculate comprehensive statistics for a consensus feature from aggregated feature data.
|
|
1650
|
-
|
|
1714
|
+
|
|
1651
1715
|
Args:
|
|
1652
1716
|
consensus_uid: Unique ID for this consensus feature
|
|
1653
1717
|
feature_data_list: List of individual feature dictionaries
|
|
1654
1718
|
rt_values: RT values from chunk consensus features
|
|
1655
|
-
mz_values: m/z values from chunk consensus features
|
|
1719
|
+
mz_values: m/z values from chunk consensus features
|
|
1656
1720
|
intensity_values: Intensity values from chunk consensus features
|
|
1657
1721
|
quality_values: Quality values from chunk consensus features
|
|
1658
|
-
|
|
1722
|
+
|
|
1659
1723
|
Returns:
|
|
1660
1724
|
Dictionary with consensus feature metadata
|
|
1661
1725
|
"""
|
|
1662
1726
|
if not feature_data_list:
|
|
1663
1727
|
return {}
|
|
1664
|
-
|
|
1728
|
+
|
|
1665
1729
|
# Convert feature data to numpy arrays for vectorized computation
|
|
1666
1730
|
rt_feat_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
|
|
1667
1731
|
mz_feat_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
|
|
@@ -1671,41 +1735,51 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
1671
1735
|
mz_start_values = np.array([fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None])
|
|
1672
1736
|
mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
|
|
1673
1737
|
inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
|
|
1674
|
-
coherence_values = np.array([
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1738
|
+
coherence_values = np.array([
|
|
1739
|
+
fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
|
|
1740
|
+
])
|
|
1741
|
+
prominence_values = np.array([
|
|
1742
|
+
fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
|
|
1743
|
+
])
|
|
1744
|
+
prominence_scaled_values = np.array([
|
|
1745
|
+
fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
|
|
1746
|
+
])
|
|
1747
|
+
height_scaled_values = np.array([
|
|
1748
|
+
fd.get("chrom_prominence_scaled", 0)
|
|
1749
|
+
for fd in feature_data_list
|
|
1750
|
+
if fd.get("chrom_prominence_scaled") is not None
|
|
1751
|
+
])
|
|
1678
1752
|
iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
|
|
1679
1753
|
charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
|
|
1680
|
-
|
|
1754
|
+
|
|
1681
1755
|
# Process adducts with cached validation
|
|
1682
1756
|
all_adducts = []
|
|
1683
1757
|
valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
|
|
1684
1758
|
valid_adducts.add("?") # Always allow '?' adducts
|
|
1685
|
-
|
|
1759
|
+
|
|
1686
1760
|
for fd in feature_data_list:
|
|
1687
1761
|
adduct = fd.get("adduct")
|
|
1688
1762
|
if adduct is not None:
|
|
1689
1763
|
# Only include adducts that are valid (from cached study adducts or contain '?')
|
|
1690
1764
|
if adduct in valid_adducts or "?" in adduct:
|
|
1691
1765
|
all_adducts.append(adduct)
|
|
1692
|
-
|
|
1766
|
+
|
|
1693
1767
|
# Calculate adduct consensus
|
|
1694
1768
|
adduct_values = []
|
|
1695
1769
|
adduct_top = None
|
|
1696
1770
|
adduct_charge_top = None
|
|
1697
1771
|
adduct_mass_neutral_top = None
|
|
1698
1772
|
adduct_mass_shift_top = None
|
|
1699
|
-
|
|
1773
|
+
|
|
1700
1774
|
if all_adducts:
|
|
1701
1775
|
adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
|
|
1702
1776
|
total_count = sum(adduct_counts.values())
|
|
1703
1777
|
for adduct, count in adduct_counts.items():
|
|
1704
1778
|
percentage = (count / total_count) * 100 if total_count > 0 else 0
|
|
1705
1779
|
adduct_values.append([str(adduct), int(count), float(round(percentage, 2))])
|
|
1706
|
-
|
|
1780
|
+
|
|
1707
1781
|
adduct_values.sort(key=lambda x: x[1], reverse=True)
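The adduct consensus above is a simple vote: count each observed adduct, convert counts to percentages, and sort so the most frequent adduct becomes adduct_top. A hedged equivalent using collections.Counter (the list-of-[adduct, count, percent] shape matches the diff; the input values are made up):

```python
from collections import Counter

def rank_adducts(all_adducts):
    """Count observed adducts and rank them as [adduct, count, percent], most frequent first."""
    counts = Counter(all_adducts)
    total = sum(counts.values())
    ranked = [[a, int(n), round(100.0 * n / total, 2)] for a, n in counts.items()]
    ranked.sort(key=lambda x: x[1], reverse=True)
    return ranked

# e.g. three features voting for [M+H]1+ and one for [M+Na]1+:
print(rank_adducts(["[M+H]1+", "[M+H]1+", "[M+Na]1+", "[M+H]1+"]))
# [['[M+H]1+', 3, 75.0], ['[M+Na]1+', 1, 25.0]]
```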
|
|
1708
|
-
|
|
1782
|
+
|
|
1709
1783
|
if adduct_values:
|
|
1710
1784
|
adduct_top = adduct_values[0][0]
|
|
1711
1785
|
# Try to get charge and mass shift from cached study adducts
|
|
@@ -1719,7 +1793,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
1719
1793
|
adduct_charge_top = adduct_row["charge"]
|
|
1720
1794
|
adduct_mass_shift_top = adduct_row["mass_shift"]
|
|
1721
1795
|
adduct_found = True
|
|
1722
|
-
|
|
1796
|
+
|
|
1723
1797
|
if not adduct_found:
|
|
1724
1798
|
# Set default charge and mass shift for top adduct
|
|
1725
1799
|
adduct_charge_top = 1
|
|
@@ -1735,26 +1809,27 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
1735
1809
|
adduct_top = "[M+?]1+"
|
|
1736
1810
|
adduct_charge_top = 1
|
|
1737
1811
|
adduct_mass_shift_top = 1.007825
|
|
1738
|
-
|
|
1812
|
+
|
|
1739
1813
|
adduct_values = [[adduct_top, 1, 100.0]]
|
|
1740
|
-
|
|
1814
|
+
|
|
1741
1815
|
# Calculate neutral mass
|
|
1742
1816
|
consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
|
|
1743
1817
|
if adduct_charge_top and adduct_mass_shift_top is not None:
|
|
1744
1818
|
adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
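The neutral-mass formula above is mz * |charge| - mass_shift. A quick numeric check with an illustrative m/z and the 1.007825 Da default shift used elsewhere in this module:

```python
consensus_mz = 301.1411           # illustrative consensus m/z
adduct_charge_top = 1
adduct_mass_shift_top = 1.007825  # default H shift used in this module

adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
print(round(adduct_mass_neutral_top, 6))  # 300.133275
```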
|
|
1745
|
-
|
|
1819
|
+
|
|
1746
1820
|
# Calculate MS2 count
|
|
1747
1821
|
ms2_count = 0
|
|
1748
1822
|
for fd in feature_data_list:
|
|
1749
1823
|
ms2_scans = fd.get("ms2_scans")
|
|
1750
1824
|
if ms2_scans is not None:
|
|
1751
1825
|
ms2_count += len(ms2_scans)
|
|
1752
|
-
|
|
1826
|
+
|
|
1753
1827
|
# Build consensus metadata
|
|
1754
1828
|
# Generate unique 16-character consensus_id string
|
|
1755
1829
|
import uuid
|
|
1756
|
-
|
|
1757
|
-
|
|
1830
|
+
|
|
1831
|
+
consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
|
|
1832
|
+
|
|
1758
1833
|
return {
|
|
1759
1834
|
"consensus_uid": int(consensus_uid),
|
|
1760
1835
|
"consensus_id": consensus_id_str, # Use unique 16-char string ID
|
|
@@ -1777,8 +1852,12 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
1777
1852
|
"bl": -1.0,
|
|
1778
1853
|
"chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
|
|
1779
1854
|
"chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
|
|
1780
|
-
"chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3)
|
|
1781
|
-
|
|
1855
|
+
"chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3)
|
|
1856
|
+
if len(prominence_scaled_values) > 0
|
|
1857
|
+
else 0.0,
|
|
1858
|
+
"chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
|
|
1859
|
+
if len(height_scaled_values) > 0
|
|
1860
|
+
else 0.0,
|
|
1782
1861
|
"iso": None, # Will be filled by find_iso() function
|
|
1783
1862
|
"iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
|
|
1784
1863
|
"charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
|
|
@@ -1799,10 +1878,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
|
|
|
1799
1878
|
def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
|
|
1800
1879
|
"""Extract consensus features and build metadata."""
|
|
1801
1880
|
# create a dict to map uid to feature_uid using study.features_df
|
|
1802
|
-
feature_uid_map = {
|
|
1803
|
-
row["feature_id"]: row["feature_uid"]
|
|
1804
|
-
for row in study.features_df.iter_rows(named=True)
|
|
1805
|
-
}
|
|
1881
|
+
feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in study.features_df.iter_rows(named=True)}
|
|
1806
1882
|
imax = consensus_map.size()
|
|
1807
1883
|
|
|
1808
1884
|
study.logger.debug(f"Found {imax} feature groups by clustering.")
|
|
@@ -1862,67 +1938,31 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
1862
1938
|
[fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
|
|
1863
1939
|
)
|
|
1864
1940
|
rt_start_values = np.array(
|
|
1865
|
-
[
|
|
1866
|
-
fd.get("rt_start", 0)
|
|
1867
|
-
for fd in feature_data_list
|
|
1868
|
-
if fd.get("rt_start") is not None
|
|
1869
|
-
],
|
|
1941
|
+
[fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None],
|
|
1870
1942
|
)
|
|
1871
1943
|
rt_end_values = np.array(
|
|
1872
|
-
[
|
|
1873
|
-
fd.get("rt_end", 0)
|
|
1874
|
-
for fd in feature_data_list
|
|
1875
|
-
if fd.get("rt_end") is not None
|
|
1876
|
-
],
|
|
1944
|
+
[fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None],
|
|
1877
1945
|
)
|
|
1878
1946
|
rt_delta_values = np.array(
|
|
1879
|
-
[
|
|
1880
|
-
fd.get("rt_delta", 0)
|
|
1881
|
-
for fd in feature_data_list
|
|
1882
|
-
if fd.get("rt_delta") is not None
|
|
1883
|
-
],
|
|
1947
|
+
[fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None],
|
|
1884
1948
|
)
|
|
1885
1949
|
mz_start_values = np.array(
|
|
1886
|
-
[
|
|
1887
|
-
fd.get("mz_start", 0)
|
|
1888
|
-
for fd in feature_data_list
|
|
1889
|
-
if fd.get("mz_start") is not None
|
|
1890
|
-
],
|
|
1950
|
+
[fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None],
|
|
1891
1951
|
)
|
|
1892
1952
|
mz_end_values = np.array(
|
|
1893
|
-
[
|
|
1894
|
-
fd.get("mz_end", 0)
|
|
1895
|
-
for fd in feature_data_list
|
|
1896
|
-
if fd.get("mz_end") is not None
|
|
1897
|
-
],
|
|
1953
|
+
[fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None],
|
|
1898
1954
|
)
|
|
1899
1955
|
inty_values = np.array(
|
|
1900
|
-
[
|
|
1901
|
-
fd.get("inty", 0)
|
|
1902
|
-
for fd in feature_data_list
|
|
1903
|
-
if fd.get("inty") is not None
|
|
1904
|
-
],
|
|
1956
|
+
[fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None],
|
|
1905
1957
|
)
|
|
1906
1958
|
coherence_values = np.array(
|
|
1907
|
-
[
|
|
1908
|
-
fd.get("chrom_coherence", 0)
|
|
1909
|
-
for fd in feature_data_list
|
|
1910
|
-
if fd.get("chrom_coherence") is not None
|
|
1911
|
-
],
|
|
1959
|
+
[fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None],
|
|
1912
1960
|
)
|
|
1913
1961
|
prominence_values = np.array(
|
|
1914
|
-
[
|
|
1915
|
-
fd.get("chrom_prominence", 0)
|
|
1916
|
-
for fd in feature_data_list
|
|
1917
|
-
if fd.get("chrom_prominence") is not None
|
|
1918
|
-
],
|
|
1962
|
+
[fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None],
|
|
1919
1963
|
)
|
|
1920
1964
|
prominence_scaled_values = np.array(
|
|
1921
|
-
[
|
|
1922
|
-
fd.get("chrom_height_scaled", 0)
|
|
1923
|
-
for fd in feature_data_list
|
|
1924
|
-
if fd.get("chrom_height_scaled") is not None
|
|
1925
|
-
],
|
|
1965
|
+
[fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None],
|
|
1926
1966
|
)
|
|
1927
1967
|
height_scaled_values = np.array(
|
|
1928
1968
|
[
|
|
@@ -1935,11 +1975,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
1935
1975
|
[fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
|
|
1936
1976
|
)
|
|
1937
1977
|
charge_values = np.array(
|
|
1938
|
-
[
|
|
1939
|
-
fd.get("charge", 0)
|
|
1940
|
-
for fd in feature_data_list
|
|
1941
|
-
if fd.get("charge") is not None
|
|
1942
|
-
],
|
|
1978
|
+
[fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None],
|
|
1943
1979
|
)
|
|
1944
1980
|
|
|
1945
1981
|
# adduct_values
|
|
@@ -1967,9 +2003,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
1967
2003
|
# Calculate adduct_values for the consensus feature
|
|
1968
2004
|
adduct_values = []
|
|
1969
2005
|
if all_adducts:
|
|
1970
|
-
adduct_counts = {
|
|
1971
|
-
adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
|
|
1972
|
-
}
|
|
2006
|
+
adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
|
|
1973
2007
|
total_count = sum(adduct_counts.values())
|
|
1974
2008
|
for adduct, count in adduct_counts.items():
|
|
1975
2009
|
percentage = (count / total_count) * 100 if total_count > 0 else 0
|
|
@@ -2055,11 +2089,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
2055
2089
|
element,
|
|
2056
2090
|
1.007825,
|
|
2057
2091
|
) # Default to H if unknown
|
|
2058
|
-
mass_shift =
|
|
2059
|
-
base_mass * multiplier
|
|
2060
|
-
if sign == "+"
|
|
2061
|
-
else -base_mass * multiplier
|
|
2062
|
-
)
|
|
2092
|
+
mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
|
|
2063
2093
|
adduct_mass_shift_top = mass_shift
|
|
2064
2094
|
else:
|
|
2065
2095
|
# Default fallback
|
|
@@ -2083,13 +2113,9 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
2083
2113
|
consensus_adduct_values = [[adduct_top, 1, 100.0]]
|
|
2084
2114
|
|
|
2085
2115
|
# Calculate neutral mass from consensus mz (for both cases)
|
|
2086
|
-
consensus_mz = (
|
|
2087
|
-
round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
|
|
2088
|
-
)
|
|
2116
|
+
consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
|
|
2089
2117
|
if adduct_charge_top and adduct_mass_shift_top is not None:
|
|
2090
|
-
adduct_mass_neutral_top = (
|
|
2091
|
-
consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
|
|
2092
|
-
)
|
|
2118
|
+
adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
|
|
2093
2119
|
|
|
2094
2120
|
# Calculate number of MS2 spectra
|
|
2095
2121
|
ms2_count = 0
|
|
@@ -2100,7 +2126,8 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
2100
2126
|
|
|
2101
2127
|
# Generate unique 16-character consensus_id string (UUID-based)
|
|
2102
2128
|
import uuid
|
|
2103
|
-
|
|
2129
|
+
|
|
2130
|
+
consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
|
|
2104
2131
|
|
|
2105
2132
|
metadata_list.append(
|
|
2106
2133
|
{
|
|
@@ -2109,48 +2136,20 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
2109
2136
|
"quality": round(float(feature.getQuality()), 3),
|
|
2110
2137
|
"number_samples": len(feature_data_list),
|
|
2111
2138
|
# "number_ext": int(len(features_list)),
|
|
2112
|
-
"rt": round(float(np.mean(rt_values)), 4)
|
|
2113
|
-
if len(
|
|
2114
|
-
else 0.0,
|
|
2115
|
-
"
|
|
2116
|
-
if len(
|
|
2117
|
-
else 0.0,
|
|
2118
|
-
"
|
|
2119
|
-
if len(
|
|
2120
|
-
else 0.0,
|
|
2121
|
-
"
|
|
2122
|
-
if len(
|
|
2123
|
-
else 0.0,
|
|
2124
|
-
"
|
|
2125
|
-
if len(
|
|
2126
|
-
else 0.0,
|
|
2127
|
-
"rt_start_mean": round(float(np.mean(rt_start_values)), 3)
|
|
2128
|
-
if len(rt_start_values) > 0
|
|
2129
|
-
else 0.0,
|
|
2130
|
-
"rt_end_mean": round(float(np.mean(rt_end_values)), 3)
|
|
2131
|
-
if len(rt_end_values) > 0
|
|
2132
|
-
else 0.0,
|
|
2133
|
-
"rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
|
|
2134
|
-
if len(rt_delta_values) > 0
|
|
2135
|
-
else 0.0,
|
|
2136
|
-
"mz_min": round(float(np.min(mz_values)), 4)
|
|
2137
|
-
if len(mz_values) > 0
|
|
2138
|
-
else 0.0,
|
|
2139
|
-
"mz_max": round(float(np.max(mz_values)), 4)
|
|
2140
|
-
if len(mz_values) > 0
|
|
2141
|
-
else 0.0,
|
|
2142
|
-
"mz_mean": round(float(np.mean(mz_values)), 4)
|
|
2143
|
-
if len(mz_values) > 0
|
|
2144
|
-
else 0.0,
|
|
2145
|
-
"mz_start_mean": round(float(np.mean(mz_start_values)), 4)
|
|
2146
|
-
if len(mz_start_values) > 0
|
|
2147
|
-
else 0.0,
|
|
2148
|
-
"mz_end_mean": round(float(np.mean(mz_end_values)), 4)
|
|
2149
|
-
if len(mz_end_values) > 0
|
|
2150
|
-
else 0.0,
|
|
2151
|
-
"inty_mean": round(float(np.mean(inty_values)), 0)
|
|
2152
|
-
if len(inty_values) > 0
|
|
2153
|
-
else 0.0,
|
|
2139
|
+
"rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
|
|
2140
|
+
"mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
2141
|
+
"rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
|
|
2142
|
+
"rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
|
|
2143
|
+
"rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
|
|
2144
|
+
"rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
|
|
2145
|
+
"rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
|
|
2146
|
+
"rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
|
|
2147
|
+
"mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
2148
|
+
"mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
2149
|
+
"mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
|
|
2150
|
+
"mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
|
|
2151
|
+
"mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
|
|
2152
|
+
"inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
|
|
2154
2153
|
"bl": -1.0,
|
|
2155
2154
|
"chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
|
|
2156
2155
|
if len(coherence_values) > 0
|
|
@@ -2171,25 +2170,17 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
2171
2170
|
if len(height_scaled_values) > 0
|
|
2172
2171
|
else 0.0,
|
|
2173
2172
|
"iso": None, # Will be filled by find_iso() function
|
|
2174
|
-
"iso_mean": round(float(np.mean(iso_values)), 2)
|
|
2175
|
-
if len(
|
|
2176
|
-
else 0.0,
|
|
2177
|
-
"charge_mean": round(float(np.mean(charge_values)), 2)
|
|
2178
|
-
if len(charge_values) > 0
|
|
2179
|
-
else 0.0,
|
|
2173
|
+
"iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
|
|
2174
|
+
"charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
|
|
2180
2175
|
"number_ms2": int(ms2_count),
|
|
2181
|
-
"adducts": consensus_adduct_values
|
|
2182
|
-
if consensus_adduct_values
|
|
2183
|
-
else [], # Ensure it's always a list
|
|
2176
|
+
"adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
|
|
2184
2177
|
# New columns for top-ranked adduct information
|
|
2185
2178
|
"adduct_top": adduct_top,
|
|
2186
2179
|
"adduct_charge_top": adduct_charge_top,
|
|
2187
2180
|
"adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
|
|
2188
2181
|
if adduct_mass_neutral_top is not None
|
|
2189
2182
|
else None,
|
|
2190
|
-
"adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
|
|
2191
|
-
if adduct_mass_shift_top is not None
|
|
2192
|
-
else None,
|
|
2183
|
+
"adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
|
|
2193
2184
|
# New columns for top-scoring identification results
|
|
2194
2185
|
"id_top_name": None,
|
|
2195
2186
|
"id_top_class": None,
|
|
@@ -2238,16 +2229,13 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
|
|
|
2238
2229
|
)
|
|
2239
2230
|
|
|
2240
2231
|
# Log final counts
|
|
2241
|
-
study.logger.info(
|
|
2242
|
-
f"Extracted {len(study.consensus_df)} consensus features with "
|
|
2243
|
-
f"at least {min_samples} samples."
|
|
2244
|
-
)
|
|
2232
|
+
study.logger.info(f"Extracted {len(study.consensus_df)} consensus features with at least {min_samples} samples.")
|
|
2245
2233
|
|
|
2246
2234
|
|
|
2247
2235
|
def _perform_adduct_grouping(study, rt_tol, mz_tol):
|
|
2248
2236
|
"""Perform adduct grouping on consensus features."""
|
|
2249
2237
|
import polars as pl
|
|
2250
|
-
|
|
2238
|
+
|
|
2251
2239
|
# Add adduct grouping and adduct_of assignment
|
|
2252
2240
|
if len(study.consensus_df) > 0:
|
|
2253
2241
|
# Get relevant columns for grouping
|
|
@@ -2264,9 +2252,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
|
|
|
2264
2252
|
},
|
|
2265
2253
|
)
|
|
2266
2254
|
|
|
2267
|
-
adduct_group_list, adduct_of_list = __merge_adduct_grouping(
|
|
2268
|
-
study, consensus_data, rt_tol/3, mz_tol
|
|
2269
|
-
)
|
|
2255
|
+
adduct_group_list, adduct_of_list = __merge_adduct_grouping(study, consensus_data, rt_tol / 3, mz_tol)
|
|
2270
2256
|
|
|
2271
2257
|
# Add the new columns to consensus_df
|
|
2272
2258
|
study.consensus_df = study.consensus_df.with_columns(
|
|
@@ -2280,52 +2266,48 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
|
|
|
2280
2266
|
def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
|
|
2281
2267
|
"""
|
|
2282
2268
|
Count consensus features grouped in tight clusters.
|
|
2283
|
-
|
|
2269
|
+
|
|
2284
2270
|
Args:
|
|
2285
2271
|
mz_tol: m/z tolerance in Daltons for cluster detection
|
|
2286
2272
|
rt_tol: RT tolerance in seconds for cluster detection
|
|
2287
|
-
|
|
2273
|
+
|
|
2288
2274
|
Returns:
|
|
2289
2275
|
Number of tight clusters found
|
|
2290
2276
|
"""
|
|
2291
2277
|
if len(study.consensus_df) < 2:
|
|
2292
2278
|
return 0
|
|
2293
|
-
|
|
2279
|
+
|
|
2294
2280
|
# Extract consensus feature coordinates efficiently
|
|
2295
|
-
feature_coords = study.consensus_df.select([
|
|
2296
|
-
|
|
2297
|
-
pl.col("mz"),
|
|
2298
|
-
pl.col("rt")
|
|
2299
|
-
]).to_numpy()
|
|
2300
|
-
|
|
2281
|
+
feature_coords = study.consensus_df.select([pl.col("consensus_uid"), pl.col("mz"), pl.col("rt")]).to_numpy()
|
|
2282
|
+
|
|
2301
2283
|
n_features = len(feature_coords)
|
|
2302
2284
|
processed = [False] * n_features
|
|
2303
2285
|
tight_clusters_count = 0
|
|
2304
|
-
|
|
2286
|
+
|
|
2305
2287
|
# Use vectorized distance calculations for efficiency
|
|
2306
2288
|
for i in range(n_features):
|
|
2307
2289
|
if processed[i]:
|
|
2308
2290
|
continue
|
|
2309
|
-
|
|
2291
|
+
|
|
2310
2292
|
# Find all features within tolerance of feature i
|
|
2311
2293
|
cluster_members = [i]
|
|
2312
2294
|
rt_i, mz_i = feature_coords[i][2], feature_coords[i][1]
|
|
2313
|
-
|
|
2295
|
+
|
|
2314
2296
|
for j in range(i + 1, n_features):
|
|
2315
2297
|
if processed[j]:
|
|
2316
2298
|
continue
|
|
2317
|
-
|
|
2299
|
+
|
|
2318
2300
|
rt_j, mz_j = feature_coords[j][2], feature_coords[j][1]
|
|
2319
|
-
|
|
2301
|
+
|
|
2320
2302
|
if abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol:
|
|
2321
2303
|
cluster_members.append(j)
|
|
2322
|
-
|
|
2304
|
+
|
|
2323
2305
|
# Mark cluster as tight if it has 2+ members
|
|
2324
2306
|
if len(cluster_members) >= 2:
|
|
2325
2307
|
tight_clusters_count += 1
|
|
2326
2308
|
for idx in cluster_members:
|
|
2327
2309
|
processed[idx] = True
|
|
2328
|
-
|
|
2310
|
+
|
|
2329
2311
|
return tight_clusters_count
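_count_tight_clusters above does a greedy O(n^2) pass: each unprocessed feature collects every later feature within rt_tol/mz_tol, and only groups with two or more members count as a tight cluster. A hedged standalone sketch of the same pass over plain (mz, rt) tuples, with example data:

```python
def count_tight_clusters(coords, mz_tol=0.04, rt_tol=0.3):
    """Greedy O(n^2) count of clusters with >= 2 members; coords is a list of (mz, rt)."""
    n = len(coords)
    processed = [False] * n
    clusters = 0
    for i in range(n):
        if processed[i]:
            continue
        members = [i]
        for j in range(i + 1, n):
            if processed[j]:
                continue
            if abs(coords[i][1] - coords[j][1]) <= rt_tol and abs(coords[i][0] - coords[j][0]) <= mz_tol:
                members.append(j)
        if len(members) >= 2:
            clusters += 1
            for idx in members:
                processed[idx] = True  # only tight-cluster members are marked, as in the diff
    return clusters

print(count_tight_clusters([(300.10, 10.0), (300.12, 10.1), (450.20, 55.0)]))  # 1
```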
|
|
2330
2312
|
|
|
2331
2313
|
|
|
@@ -2336,52 +2318,54 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
|
|
|
2336
2318
|
"""
|
|
2337
2319
|
if len(study.consensus_df) == 0:
|
|
2338
2320
|
return
|
|
2339
|
-
|
|
2321
|
+
|
|
2340
2322
|
initial_count = len(study.consensus_df)
|
|
2341
|
-
study.logger.debug(
|
|
2342
|
-
|
|
2323
|
+
study.logger.debug(
|
|
2324
|
+
f"Post-processing chunked results: merging partial consensus features from {initial_count} features"
|
|
2325
|
+
)
|
|
2326
|
+
|
|
2343
2327
|
# Convert to list of dictionaries for easier processing
|
|
2344
2328
|
consensus_features = []
|
|
2345
2329
|
for row in study.consensus_df.iter_rows(named=True):
|
|
2346
2330
|
consensus_features.append({
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2351
|
-
|
|
2331
|
+
"consensus_uid": row["consensus_uid"],
|
|
2332
|
+
"rt": row["rt"],
|
|
2333
|
+
"mz": row["mz"],
|
|
2334
|
+
"number_samples": row.get("number_samples", 0),
|
|
2335
|
+
"inty_mean": row.get("inty_mean", 0.0),
|
|
2352
2336
|
})
|
|
2353
|
-
|
|
2337
|
+
|
|
2354
2338
|
# Use Union-Find to group features that should be merged
|
|
2355
2339
|
class UnionFind:
|
|
2356
2340
|
def __init__(self, n):
|
|
2357
2341
|
self.parent = list(range(n))
|
|
2358
|
-
|
|
2342
|
+
|
|
2359
2343
|
def find(self, x):
|
|
2360
2344
|
if self.parent[x] != x:
|
|
2361
2345
|
self.parent[x] = self.find(self.parent[x])
|
|
2362
2346
|
return self.parent[x]
|
|
2363
|
-
|
|
2347
|
+
|
|
2364
2348
|
def union(self, x, y):
|
|
2365
2349
|
px, py = self.find(x), self.find(y)
|
|
2366
2350
|
if px != py:
|
|
2367
2351
|
self.parent[py] = px
|
|
2368
|
-
|
|
2352
|
+
|
|
2369
2353
|
n_features = len(consensus_features)
|
|
2370
2354
|
uf = UnionFind(n_features)
|
|
2371
|
-
|
|
2355
|
+
|
|
2372
2356
|
# Find features that should be merged using original tolerances
|
|
2373
2357
|
for i in range(n_features):
|
|
2374
2358
|
for j in range(i + 1, n_features):
|
|
2375
2359
|
feature_a = consensus_features[i]
|
|
2376
2360
|
feature_b = consensus_features[j]
|
|
2377
|
-
|
|
2378
|
-
rt_diff = abs(feature_a[
|
|
2379
|
-
mz_diff = abs(feature_a[
|
|
2380
|
-
|
|
2361
|
+
|
|
2362
|
+
rt_diff = abs(feature_a["rt"] - feature_b["rt"])
|
|
2363
|
+
mz_diff = abs(feature_a["mz"] - feature_b["mz"])
|
|
2364
|
+
|
|
2381
2365
|
# Merge if within tolerance
|
|
2382
2366
|
if rt_diff <= rt_tol and mz_diff <= mz_tol:
|
|
2383
2367
|
uf.union(i, j)
|
|
2384
|
-
|
|
2368
|
+
|
|
2385
2369
|
# Group features by their root
|
|
2386
2370
|
groups = {}
|
|
2387
2371
|
for i, feature in enumerate(consensus_features):
|
|
@@ -2389,12 +2373,12 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
|
|
|
2389
2373
|
if root not in groups:
|
|
2390
2374
|
groups[root] = []
|
|
2391
2375
|
groups[root].append(consensus_features[i])
|
|
2392
|
-
|
|
2376
|
+
|
|
2393
2377
|
# Create merged features
|
|
2394
2378
|
merged_features = []
|
|
2395
2379
|
merged_mapping_data = []
|
|
2396
2380
|
uids_to_remove = set()
|
|
2397
|
-
|
|
2381
|
+
|
|
2398
2382
|
for group in groups.values():
|
|
2399
2383
|
if len(group) < 2:
|
|
2400
2384
|
# Single feature, keep as is
|
|
@@ -2402,70 +2386,77 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
        else:
            # Multiple features, merge them
            # Find best representative feature (highest sample count, then intensity)
            best_feature = max(group, key=lambda x: (x["number_samples"], x["inty_mean"]))

            # Calculate merged properties
            total_samples = sum(f["number_samples"] for f in group)
            weighted_rt = (
                sum(f["rt"] * f["number_samples"] for f in group) / total_samples
                if total_samples > 0
                else best_feature["rt"]
            )
            weighted_mz = (
                sum(f["mz"] * f["number_samples"] for f in group) / total_samples
                if total_samples > 0
                else best_feature["mz"]
            )
            mean_intensity = (
                sum(f["inty_mean"] * f["number_samples"] for f in group) / total_samples
                if total_samples > 0
                else best_feature["inty_mean"]
            )

            # Keep the best feature's UID but update its properties
            merged_features.append({
                "consensus_uid": best_feature["consensus_uid"],
                "rt": weighted_rt,
                "mz": weighted_mz,
                "number_samples": total_samples,
                "inty_mean": mean_intensity,
            })

            # Mark other features for removal
            for f in group:
                if f["consensus_uid"] != best_feature["consensus_uid"]:
                    uids_to_remove.add(f["consensus_uid"])

    if merged_features:
        study.logger.debug(f"Merging {len(merged_features)} groups of partial consensus features")

        # Update consensus_df with merged features
        for merged_feature in merged_features:
            study.consensus_df = study.consensus_df.with_columns([
                pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
                .then(pl.lit(merged_feature["rt"]))
                .otherwise(pl.col("rt"))
                .alias("rt"),
                pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
                .then(pl.lit(merged_feature["mz"]))
                .otherwise(pl.col("mz"))
                .alias("mz"),
                pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
                .then(pl.lit(merged_feature["number_samples"]))
                .otherwise(pl.col("number_samples"))
                .alias("number_samples"),
                pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
                .then(pl.lit(merged_feature["inty_mean"]))
                .otherwise(pl.col("inty_mean"))
                .alias("inty_mean"),
            ])

        # Remove duplicate features
        if uids_to_remove:
            study.consensus_df = study.consensus_df.filter(~pl.col("consensus_uid").is_in(list(uids_to_remove)))

            # Also update consensus_mapping_df - reassign mappings from removed UIDs
            if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
                study.consensus_mapping_df = study.consensus_mapping_df.with_columns(
                    pl.when(pl.col("consensus_uid").is_in(list(uids_to_remove)))
                    .then(pl.lit(None))  # Will be handled by subsequent operations
                    .otherwise(pl.col("consensus_uid"))
                    .alias("consensus_uid")
                )

        final_count = len(study.consensus_df)
        study.logger.debug(f"Partial consensus merging: {initial_count} → {final_count} features")
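The post-processing step above is essentially tolerance-based clustering followed by a sample-count-weighted centroid. A minimal, self-contained sketch of that idea, using hypothetical feature values rather than the package's dataframes:

# --- illustrative sketch, not part of masster ---
features = [
    {"uid": 1, "rt": 120.0, "mz": 301.1410, "n": 5, "inty": 2.0e5},
    {"uid": 2, "rt": 120.4, "mz": 301.1412, "n": 3, "inty": 1.2e5},
    {"uid": 3, "rt": 300.0, "mz": 450.2000, "n": 4, "inty": 9.0e4},
]
rt_tol, mz_tol = 1.0, 0.005  # hypothetical tolerances

parent = list(range(len(features)))

def find(x):
    # Iterative find with path halving
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def union(a, b):
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[rb] = ra

# Union every pair that falls inside both tolerances
for i in range(len(features)):
    for j in range(i + 1, len(features)):
        if (abs(features[i]["rt"] - features[j]["rt"]) <= rt_tol
                and abs(features[i]["mz"] - features[j]["mz"]) <= mz_tol):
            union(i, j)

# Collect groups by root and compute sample-count-weighted centroids
groups = {}
for i, f in enumerate(features):
    groups.setdefault(find(i), []).append(f)

for members in groups.values():
    n_total = sum(f["n"] for f in members)
    rt_w = sum(f["rt"] * f["n"] for f in members) / n_total
    mz_w = sum(f["mz"] * f["n"] for f in members) / n_total
    print(len(members), round(rt_w, 3), round(mz_w, 5))
# Features 1 and 2 merge (weighted rt ≈ 120.15); feature 3 stays alone.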
@@ -2473,57 +2464,57 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
def __consensus_cleanup(study, rt_tol, mz_tol):
    """
    Consensus cleanup to merge over-segmented consensus features and remove isotopic features.

    This function:
    1. Identifies and merges consensus features that are likely over-segmented
       (too many features in very tight m/z and RT windows)
    2. Performs deisotoping to remove +1 and +2 isotopic features
    """
    if len(study.consensus_df) == 0:
        return

    initial_count = len(study.consensus_df)

    # Only perform enhanced post-clustering if there are many features
    if initial_count < 50:
        return

    study.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")

    # Find tight clusters using spatial binning
    consensus_data = []
    for row in study.consensus_df.iter_rows(named=True):
        consensus_data.append({
            "consensus_uid": row["consensus_uid"],
            "mz": row["mz"],
            "rt": row["rt"],
            "inty_mean": row.get("inty_mean", 0),
            "number_samples": row.get("number_samples", 0),
        })

    # Parameters for tight clustering detection - more lenient for effective merging
    tight_rt_tol = min(0.5, rt_tol * 0.5)  # More lenient RT tolerance (max 0.5s)
    tight_mz_tol = min(0.05, max(0.03, mz_tol * 2.0))  # More lenient m/z tolerance (min 30 mDa, max 50 mDa)

    # Build spatial index using smaller RT and m/z bins for better coverage
    rt_bin_size = tight_rt_tol / 4  # Smaller bins to ensure nearby features are captured
    mz_bin_size = tight_mz_tol / 4  # Smaller bins to ensure nearby features are captured

    bins = defaultdict(list)
    for feature in consensus_data:
        rt_bin = int(feature["rt"] / rt_bin_size)
        mz_bin = int(feature["mz"] / mz_bin_size)
        bins[(rt_bin, mz_bin)].append(feature)

    # Find clusters that need merging
    merge_groups = []
    processed_uids = set()

    for bin_key, bin_features in bins.items():
        # Check current bin and extended neighboring bins for complete cluster
        rt_bin, mz_bin = bin_key
        cluster_features = list(bin_features)

        # Check a larger neighborhood (±2 bins) to ensure we capture all nearby features
        for dr in [-2, -1, 0, 1, 2]:
            for dm in [-2, -1, 0, 1, 2]:
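The spatial binning set up above (and continued in the next hunk) avoids comparing every feature pair: each feature is keyed by an integer (rt_bin, mz_bin) and only a ±2-bin neighbourhood is inspected. A small sketch with made-up tolerances:

# --- illustrative sketch, not part of masster ---
from collections import defaultdict

tight_rt_tol, tight_mz_tol = 0.5, 0.05  # hypothetical values
rt_bin_size, mz_bin_size = tight_rt_tol / 4, tight_mz_tol / 4

features = [{"uid": i, "rt": rt, "mz": mz} for i, (rt, mz) in
            enumerate([(60.01, 200.001), (60.02, 200.003), (95.5, 310.2)])]

bins = defaultdict(list)
for f in features:
    bins[(int(f["rt"] / rt_bin_size), int(f["mz"] / mz_bin_size))].append(f)

# Candidate cluster = a bin plus its ±2 neighbours in both dimensions
rt_bin, mz_bin = int(60.01 / rt_bin_size), int(200.001 / mz_bin_size)
cluster = []
for dr in range(-2, 3):
    for dm in range(-2, 3):
        cluster.extend(bins.get((rt_bin + dr, mz_bin + dm), []))
print([f["uid"] for f in cluster])  # [0, 1]: the two co-eluting features; the third is far away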
@@ -2532,192 +2523,194 @@ def __consensus_cleanup(study, rt_tol, mz_tol):
                neighbor_key = (rt_bin + dr, mz_bin + dm)
                if neighbor_key in bins:
                    cluster_features.extend(bins[neighbor_key])

        # Remove duplicates
        seen_uids = set()
        unique_features = []
        for f in cluster_features:
            if f["consensus_uid"] not in seen_uids:
                unique_features.append(f)
                seen_uids.add(f["consensus_uid"])

        # Only proceed if we have at least 2 features after including neighbors
        if len(unique_features) < 2:
            continue

        # Calculate cluster bounds
        mzs = [f["mz"] for f in unique_features]
        rts = [f["rt"] for f in unique_features]

        mz_spread = max(mzs) - min(mzs)
        rt_spread = max(rts) - min(rts)

        # Only merge if features are tightly clustered
        if mz_spread <= tight_mz_tol and rt_spread <= tight_rt_tol:
            # Filter out features that were already processed
            uids_in_cluster = {f["consensus_uid"] for f in unique_features}
            unprocessed_features = [f for f in unique_features if f["consensus_uid"] not in processed_uids]

            # Only proceed if we have at least 2 unprocessed features that still form a tight cluster
            if len(unprocessed_features) >= 2:
                # Recalculate bounds for unprocessed features only
                unprocessed_mzs = [f["mz"] for f in unprocessed_features]
                unprocessed_rts = [f["rt"] for f in unprocessed_features]

                unprocessed_mz_spread = max(unprocessed_mzs) - min(unprocessed_mzs)
                unprocessed_rt_spread = max(unprocessed_rts) - min(unprocessed_rts)

                # Check if unprocessed features still meet tight clustering criteria
                if unprocessed_mz_spread <= tight_mz_tol and unprocessed_rt_spread <= tight_rt_tol:
                    merge_groups.append(unprocessed_features)
                    processed_uids.update({f["consensus_uid"] for f in unprocessed_features})

    if not merge_groups:
        return

    study.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")

    # Merge clusters by keeping the most representative feature
    uids_to_remove = set()

    for group in merge_groups:
        if len(group) < 2:
            continue

        # Find the most representative feature (highest intensity and sample count)
        best_feature = max(group, key=lambda x: (x["number_samples"], x["inty_mean"]))

        # Mark other features for removal
        for f in group:
            if f["consensus_uid"] != best_feature["consensus_uid"]:
                uids_to_remove.add(f["consensus_uid"])

    if uids_to_remove:
        # Remove merged features from consensus_df
        study.consensus_df = study.consensus_df.filter(~pl.col("consensus_uid").is_in(list(uids_to_remove)))

        # Also update consensus_mapping_df if it exists
        if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
            study.consensus_mapping_df = study.consensus_mapping_df.filter(
                ~pl.col("consensus_uid").is_in(list(uids_to_remove))
            )

    final_count = len(study.consensus_df)
    reduction = initial_count - final_count
    reduction_pct = (reduction / initial_count) * 100

    if reduction > 0:
        study.logger.debug(
            f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)"
        )

    # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
    pre_deisotoping_count = len(study.consensus_df)
    isotope_uids_to_remove = set()

    # Use strict tolerances for deisotoping (same as declustering)
    deisotope_rt_tol = min(0.3, rt_tol * 0.3)  # Strict RT tolerance for isotope detection
    deisotope_mz_tol = min(0.01, mz_tol * 0.5)  # Strict m/z tolerance for isotope detection

    # Get current consensus data for isotope detection
    current_consensus_data = []
    for row in study.consensus_df.iter_rows(named=True):
        current_consensus_data.append({
            "consensus_uid": row["consensus_uid"],
            "mz": row["mz"],
            "rt": row["rt"],
            "number_samples": row.get("number_samples", 0),
        })

    # Sort by m/z for efficient searching
    current_consensus_data.sort(key=lambda x: x["mz"])
    n_current = len(current_consensus_data)

    for i in range(n_current):
        feature_i = current_consensus_data[i]

        # Skip if already marked for removal
        if feature_i["consensus_uid"] in isotope_uids_to_remove:
            continue

        # Look for potential +1 and +2 isotopes (higher m/z)
        for j in range(i + 1, n_current):
            feature_j = current_consensus_data[j]

            # Skip if already marked for removal
            if feature_j["consensus_uid"] in isotope_uids_to_remove:
                continue

            mz_diff = feature_j["mz"] - feature_i["mz"]

            # Break if m/z difference is too large (features are sorted by m/z)
            if mz_diff > 2.1:  # Beyond +2 isotope range
                break

            rt_diff = abs(feature_j["rt"] - feature_i["rt"])

            # Check for +1 isotope (C13 mass difference ≈ 1.003354 Da)
            if (0.995 <= mz_diff <= 1.011) and rt_diff <= deisotope_rt_tol:
                # Potential +1 isotope - should have fewer samples than main feature
                if feature_j["number_samples"] < feature_i["number_samples"]:
                    isotope_uids_to_remove.add(feature_j["consensus_uid"])
                    continue

            # Check for +2 isotope (2 * C13 mass difference ≈ 2.006708 Da)
            if (1.995 <= mz_diff <= 2.018) and rt_diff <= deisotope_rt_tol:
                # Potential +2 isotope - should have fewer samples than main feature
                if feature_j["number_samples"] < feature_i["number_samples"]:
                    isotope_uids_to_remove.add(feature_j["consensus_uid"])
                    continue

    # Remove isotopic features
    if isotope_uids_to_remove:
        study.consensus_df = study.consensus_df.filter(~pl.col("consensus_uid").is_in(list(isotope_uids_to_remove)))

        # Also update consensus_mapping_df if it exists
        if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
            study.consensus_mapping_df = study.consensus_mapping_df.filter(
                ~pl.col("consensus_uid").is_in(list(isotope_uids_to_remove))
            )

    post_deisotoping_count = len(study.consensus_df)
    isotope_reduction = pre_deisotoping_count - post_deisotoping_count

    if isotope_reduction > 0:
        study.logger.debug(
            f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)"
        )

    # Final summary
    final_count = len(study.consensus_df)
    total_reduction = initial_count - final_count
    if total_reduction > 0:
        total_reduction_pct = (total_reduction / initial_count) * 100
        study.logger.debug(
            f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)"
        )


def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
    """
    Identify coeluting consensus features by characteristic mass shifts between adducts
    and update their adduct information accordingly.

    This function:
    1. Generates a catalogue of mass shifts between adducts using _get_adducts()
    2. Searches for pairs of consensus features with same RT (within strict RT tolerance)
       and matching m/z shifts (±0.005 Da)
    3. Updates adduct_* columns based on identified relationships

    Args:
        rt_tol: RT tolerance in seconds (strict tolerance for coelution detection)
        cached_adducts_df: Pre-computed adducts DataFrame for performance
    """
    import polars as pl

    # Check if consensus_df exists and has features
    if len(study.consensus_df) == 0:
        study.logger.debug("No consensus features for adduct identification by mass shift")
        return

    # Get adducts DataFrame if not provided
    if cached_adducts_df is None or cached_adducts_df.is_empty():
        try:
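The deisotoping pass shown above reduces to two m/z windows around the 13C spacing plus a coelution and sample-count check. A worked check with hypothetical peaks:

# --- illustrative sketch, not part of masster ---
C13_DELTA = 1.003354  # Da, mass difference between 13C and 12C

# Hypothetical monoisotopic peak and its first isotope at almost the same RT
main = {"mz": 301.1410, "rt": 120.0, "n_samples": 12}
iso1 = {"mz": 302.1444, "rt": 120.1, "n_samples": 9}

mz_diff = iso1["mz"] - main["mz"]        # 1.0034 Da, close to C13_DELTA
rt_diff = abs(iso1["rt"] - main["rt"])   # 0.1 s

is_plus1 = (0.995 <= mz_diff <= 1.011) and rt_diff <= 0.3 and iso1["n_samples"] < main["n_samples"]
print(round(mz_diff, 4), is_plus1)  # 1.0034 True -> iso1 would be flagged for removal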
@@ -2726,145 +2719,148 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
        except Exception as e:
            study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
            return

    if cached_adducts_df.is_empty():
        study.logger.debug("No adducts available for mass shift identification")
        return

    # Build catalogue of mass shifts between adducts
    mass_shift_catalog = {}
    adduct_info = {}

    # Extract adduct information
    adducts_data = cached_adducts_df.select(["name", "charge", "mass_shift"]).to_dicts()

    for adduct in adducts_data:
        name = adduct["name"]
        charge = adduct["charge"]
        mass_shift = adduct["mass_shift"]

        adduct_info[name] = {"charge": charge, "mass_shift": mass_shift}

    # Generate pairwise mass differences for catalog
    for adduct1 in adducts_data:
        for adduct2 in adducts_data:
            if adduct1["name"] == adduct2["name"]:
                continue

            name1, charge1, ms1 = adduct1["name"], adduct1["charge"], adduct1["mass_shift"]
            name2, charge2, ms2 = adduct2["name"], adduct2["charge"], adduct2["mass_shift"]

            # Only consider shifts between adducts that have the same charge (same ionization state)
            if charge1 != charge2:
                continue

            # Calculate expected m/z difference
            if charge1 != 0 and charge2 != 0:
                mz_diff = (ms1 - ms2) / abs(charge1)
            else:
                continue  # Skip neutral adducts for this analysis

            # Store the mass shift relationship
            shift_key = round(mz_diff, 4)  # Round to 4 decimal places for matching
            if shift_key not in mass_shift_catalog:
                mass_shift_catalog[shift_key] = []
            mass_shift_catalog[shift_key].append({
                "from_adduct": name1,
                "to_adduct": name2,
                "mz_shift": mz_diff,
                "from_charge": charge1,
                "to_charge": charge2,
            })

    study.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")

    # Get consensus features data
    consensus_data = []
    for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
        consensus_data.append({
            "index": i,
            "consensus_uid": row["consensus_uid"],
            "rt": row["rt"],
            "mz": row["mz"],
            "adduct_top": row.get("adduct_top", "[M+?]1+"),
            "adduct_charge_top": row.get("adduct_charge_top", 1),
            "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
            "adduct_mass_shift_top": row.get("adduct_mass_shift_top"),
            "inty_mean": row.get("inty_mean", 0),
        })

    # Sort by RT for efficient searching
    consensus_data.sort(key=lambda x: x["rt"])
    n_features = len(consensus_data)

    # Track updates to make
    adduct_updates = {}  # consensus_uid -> new_adduct_info

    # Strict RT tolerance for coelution (convert to minutes)
    rt_tol_strict = rt_tol * 0.5  # Use half the merge tolerance for strict coelution
    mz_tol_shift = 0.005  # ±5 mDa tolerance for mass shift matching

    # Search for coeluting pairs with characteristic mass shifts
    updated_count = 0

    for i in range(n_features):
        feature1 = consensus_data[i]
        rt1 = feature1["rt"]
        mz1 = feature1["mz"]
        adduct1 = feature1["adduct_top"]

        # Conservative approach: Don't skip features here - let algorithm find pairs first
        # We'll check for inappropriate assignments later in the pair processing logic

        # Search for coeluting features within strict RT tolerance
        for j in range(i + 1, n_features):
            feature2 = consensus_data[j]
            rt2 = feature2["rt"]

            # Break if RT difference exceeds tolerance (sorted by RT)
            if abs(rt2 - rt1) > rt_tol_strict:
                break

            mz2 = feature2["mz"]
            adduct2 = feature2["adduct_top"]

            # Conservative approach: Don't skip feature2 here either - process all potential pairs

            # Calculate observed m/z difference
            mz_diff = mz2 - mz1
            shift_key = round(mz_diff, 4)

            # Check if this mass shift matches any known adduct relationships
            for catalog_shift, relationships in mass_shift_catalog.items():
                if abs(shift_key - catalog_shift) <= mz_tol_shift:
                    # Found a matching mass shift!

                    # Choose the best relationship based on common adducts
                    best_rel = None
                    best_score = 0

                    for rel in relationships:
                        # Prioritize common adducts ([M+H]+, [M+Na]+, [M+NH4]+)
                        score = 0
                        if "H]" in rel["from_adduct"]:
                            score += 3
                        if "Na]" in rel["from_adduct"]:
                            score += 2
                        if "NH4]" in rel["from_adduct"]:
                            score += 2
                        if "H]" in rel["to_adduct"]:
                            score += 3
                        if "Na]" in rel["to_adduct"]:
                            score += 2
                        if "NH4]" in rel["to_adduct"]:
                            score += 2

                        if score > best_score:
                            best_score = score
                            best_rel = rel

                    if best_rel:
                        # Determine which feature gets which adduct based on intensity
                        inty1 = feature1["inty_mean"]
                        inty2 = feature2["inty_mean"]

                        # Assign higher intensity to [M+H]+ if possible
                        if "H]" in best_rel["from_adduct"] and inty1 >= inty2:
                            # Feature 1 = from_adduct, Feature 2 = to_adduct
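For intuition about the mass-shift catalogue and pair matching above: two singly charged adducts of the same neutral molecule differ by a fixed m/z. A tiny sketch with approximate proton and sodium mass shifts (illustrative values, not the package's adduct table):

# --- illustrative sketch, not part of masster ---
adducts = {"[M+H]1+": {"charge": 1, "mass_shift": 1.00728},
           "[M+Na]1+": {"charge": 1, "mass_shift": 22.98922}}

catalog = {}
names = list(adducts)
for a in names:
    for b in names:
        if a == b or adducts[a]["charge"] != adducts[b]["charge"]:
            continue
        # Expected m/z difference between the two adduct forms
        mz_diff = (adducts[a]["mass_shift"] - adducts[b]["mass_shift"]) / abs(adducts[a]["charge"])
        catalog.setdefault(round(mz_diff, 4), []).append((a, b))

print(catalog)
# {-21.9819: [('[M+H]1+', '[M+Na]1+')], 21.9819: [('[M+Na]1+', '[M+H]1+')]}
# Two coeluting features whose observed m/z difference is ~21.982 Da (±0.005)
# would therefore be re-annotated as an [M+H]+/[M+Na]+ pair.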
@@ -2881,107 +2877,111 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
                        else:
                            # Assignment based on mass shift direction
                            # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
                            # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
                            # If catalog_shift < 0: from_adduct has lower mass shift than to_adduct
                            # observed mz_diff = mz2 - mz1 (always positive for mz2 > mz1)
                            #
                            # CRITICAL FIX: Correct assignment logic
                            # When mz_diff matches positive catalog_shift:
                            # - from_adduct is the heavier adduct (higher mass shift)
                            # - to_adduct is the lighter adduct (lower mass shift)
                            # - Higher m/z feature should get the heavier adduct (from_adduct)
                            # - Lower m/z feature should get the lighter adduct (to_adduct)

                            if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
                                # mz_diff matches catalog_shift direction
                                if catalog_shift > 0:
                                    # from_adduct is heavier, to_adduct is lighter
                                    from_feature = feature2  # Higher m/z gets heavier adduct
                                    to_feature = feature1  # Lower m/z gets lighter adduct
                                    from_adduct_name = best_rel["from_adduct"]  # Heavier adduct
                                    to_adduct_name = best_rel["to_adduct"]  # Lighter adduct
                                else:
                                    # from_adduct is lighter, to_adduct is heavier
                                    from_feature = feature1  # Lower m/z gets lighter adduct
                                    to_feature = feature2  # Higher m/z gets heavier adduct
                                    from_adduct_name = best_rel["from_adduct"]  # Lighter adduct
                                    to_adduct_name = best_rel["to_adduct"]  # Heavier adduct
                            else:
                                # mz_diff matches reverse direction of catalog_shift
                                if catalog_shift > 0:
                                    # Reverse: from_adduct becomes lighter, to_adduct becomes heavier
                                    from_feature = feature1  # Lower m/z gets lighter adduct
                                    to_feature = feature2  # Higher m/z gets heavier adduct
                                    from_adduct_name = best_rel["to_adduct"]  # Now lighter adduct
                                    to_adduct_name = best_rel["from_adduct"]  # Now heavier adduct
                                else:
                                    # Reverse: from_adduct becomes heavier, to_adduct becomes lighter
                                    from_feature = feature2  # Higher m/z gets heavier adduct
                                    to_feature = feature1  # Lower m/z gets lighter adduct
                                    from_adduct_name = best_rel["to_adduct"]  # Now heavier adduct
                                    to_adduct_name = best_rel["from_adduct"]  # Now lighter adduct

                        # Get adduct details from catalog
                        from_adduct_info = adduct_info.get(from_adduct_name, {})
                        to_adduct_info = adduct_info.get(to_adduct_name, {})

                        # Calculate neutral masses
                        from_charge = from_adduct_info.get("charge", 1)
                        to_charge = to_adduct_info.get("charge", 1)
                        from_mass_shift = from_adduct_info.get("mass_shift", 1.007825)
                        to_mass_shift = to_adduct_info.get("mass_shift", 1.007825)

                        from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
                        to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift

                        # Smart conservative check: prevent inappropriate assignments to isolated features
                        # Check if both features are isolated (single-member groups) with [M+?]1+ assignments
                        def is_isolated_unknown_feature(feature):
                            """Check if a feature is isolated with unknown adduct"""
                            if not feature["adduct_top"] or "[M+?]" not in feature["adduct_top"]:
                                return False  # Not unknown, safe to process

                            # Check group size
                            try:
                                feature_row = study.consensus_df.filter(
                                    study.consensus_df["consensus_uid"] == feature["consensus_uid"]
                                )
                                if len(feature_row) > 0:
                                    adduct_group = feature_row["adduct_group"].iloc[0]
                                    if adduct_group > 0:
                                        group_members = study.consensus_df.filter(
                                            study.consensus_df["adduct_group"] == adduct_group
                                        )
                                        return len(group_members) <= 1  # Isolated if group size <= 1
                            except Exception:
                                pass
                            return True  # Default to isolated if can't determine

                        from_isolated = is_isolated_unknown_feature(from_feature)
                        to_isolated = is_isolated_unknown_feature(to_feature)

                        # Only skip assignment if BOTH features are isolated AND would get the SAME adduct
                        # (This prevents inappropriate duplicate assignments to isolated features)
                        skip_assignment = from_isolated and to_isolated and from_adduct_name == to_adduct_name

                        if skip_assignment:
                            study.logger.debug(
                                f"Skipping inappropriate assignment: both isolated features would get {from_adduct_name} "
                                f"(UIDs {from_feature['consensus_uid']}, {to_feature['consensus_uid']})"
                            )
                            continue  # Skip this pair, continue to next relationship

                        # Store updates (legitimate pair or at least one feature already has specific adduct)
                        adduct_updates[from_feature["consensus_uid"]] = {
                            "adduct_top": from_adduct_name,
                            "adduct_charge_top": from_charge,
                            "adduct_mass_neutral_top": from_neutral_mass,
                            "adduct_mass_shift_top": from_mass_shift,
                        }

                        adduct_updates[to_feature["consensus_uid"]] = {
                            "adduct_top": to_adduct_name,
                            "adduct_charge_top": to_charge,
                            "adduct_mass_neutral_top": to_neutral_mass,
                            "adduct_mass_shift_top": to_mass_shift,
                        }

                        updated_count += 2
                        study.logger.debug(
                            f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
@@ -2989,17 +2989,17 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
                            f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
                        )
                        break  # Found match, no need to check other relationships

    # Apply updates to consensus_df
    if adduct_updates:
        # Prepare update data
        consensus_uids = study.consensus_df["consensus_uid"].to_list()

        new_adduct_top = []
        new_adduct_charge_top = []
        new_adduct_mass_neutral_top = []
        new_adduct_mass_shift_top = []

        for uid in consensus_uids:
            if uid in adduct_updates:
                update = adduct_updates[uid]
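The update pattern started here and completed in the next hunk is: walk the consensus UIDs in order, build one Python list per column, then overwrite the columns in a single with_columns call. A reduced sketch on a toy frame:

# --- illustrative sketch, not part of masster ---
import polars as pl

df = pl.DataFrame({"consensus_uid": [10, 11, 12], "adduct_top": ["[M+?]1+", "[M+?]1+", "[M+H]1+"]})
updates = {11: {"adduct_top": "[M+Na]1+"}}  # hypothetical per-UID updates

new_adduct_top = []
for row in df.iter_rows(named=True):
    uid = row["consensus_uid"]
    if uid in updates:
        new_adduct_top.append(updates[uid]["adduct_top"])
    else:
        new_adduct_top.append(row["adduct_top"])  # keep the existing value

df = df.with_columns(pl.Series("adduct_top", new_adduct_top))
print(df["adduct_top"].to_list())  # ['[M+?]1+', '[M+Na]1+', '[M+H]1+']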
@@ -3015,13 +3015,13 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
                new_adduct_charge_top.append(row.get("adduct_charge_top"))
                new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
                new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))

        # Update the DataFrame
        study.consensus_df = study.consensus_df.with_columns([
            pl.Series("adduct_top", new_adduct_top),
            pl.Series("adduct_charge_top", new_adduct_charge_top),
            pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
            pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top),
        ])
        study.logger.info(f"Adduct information updated for {updated_count} consensus features.")
    else:
@@ -3031,12 +3031,12 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
    """Complete the merge process with final calculations and cleanup."""
    import polars as pl

    # Check if consensus_df is empty or missing required columns
    if len(study.consensus_df) == 0 or "number_samples" not in study.consensus_df.columns:
        study.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
        return

    # Validate min_samples parameter
    if min_samples is None:
        min_samples = 1
@@ -3059,7 +3059,7 @@ def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
    study.logger.debug(
        f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
    )

    # Filter out consensus mapping with less than min_samples features
    study.consensus_mapping_df = study.consensus_mapping_df.filter(
        pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
@@ -3068,15 +3068,11 @@ def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
    # Calculate the completeness of the consensus map
    # Log completion with tight cluster metrics
    if len(study.consensus_df) > 0 and len(study.samples_df) > 0:
        c = len(study.consensus_mapping_df) / len(study.consensus_df) / len(study.samples_df)

        # Count tight clusters with specified thresholds
        tight_clusters = _count_tight_clusters(study, mz_tol=0.04, rt_tol=0.3)

        study.logger.success(
            f"Merging completed. Consensus features: {len(study.consensus_df)}. "
            f"Completeness: {c:.2f}. Tight clusters: {tight_clusters}.",
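The completeness figure logged above is the fill rate of the consensus matrix: mappings divided by features and by samples. A worked example with hypothetical counts:

# --- illustrative sketch, not part of masster ---
n_mappings, n_consensus, n_samples = 21600, 1200, 24  # made-up study sizes
completeness = n_mappings / n_consensus / n_samples
print(f"{completeness:.2f}")  # 0.75 -> on average each consensus feature was found in 75% of samples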
@@ -3100,27 +3096,42 @@ def __merge_feature_lookup(study_obj, features_df):
    """
    study_obj.logger.debug("Creating optimized feature lookup...")
    start_time = time.time()

    # Use Polars select for faster conversion
    feature_columns = [
        "feature_uid",
        "sample_uid",
        "rt",
        "mz",
        "rt_start",
        "rt_end",
        "rt_delta",
        "mz_start",
        "mz_end",
        "inty",
        "chrom_coherence",
        "chrom_prominence",
        "chrom_prominence_scaled",
        "chrom_height_scaled",
        "iso",
        "charge",
        "ms2_scans",
        "adduct",
        "adduct_mass",
    ]

    # Filter to only existing columns
    existing_columns = [col for col in feature_columns if col in features_df.columns]

    # Convert to dictionary more efficiently
    selected_df = features_df.select(existing_columns)

    features_lookup = {}
    for row in selected_df.iter_rows(named=True):
        feature_uid = row["feature_uid"]
        # Keep feature_uid in the dictionary for chunked merge compatibility
        features_lookup[feature_uid] = {k: v for k, v in row.items()}

    lookup_time = time.time() - start_time
    if len(features_lookup) > 50000:
        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
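A reduced sketch of the lookup built above, assuming a toy polars frame: selecting only the needed columns keeps the row dictionaries small, and keying by feature_uid gives O(1) access during chunked merging:

# --- illustrative sketch, not part of masster ---
import polars as pl

features_df = pl.DataFrame({
    "feature_uid": [101, 102],
    "sample_uid": [1, 2],
    "rt": [120.0, 121.3],
    "mz": [301.1410, 301.1415],
    "inty": [2.0e5, 1.5e5],
})

wanted = ["feature_uid", "sample_uid", "rt", "mz", "inty"]
existing = [c for c in wanted if c in features_df.columns]

features_lookup = {
    row["feature_uid"]: dict(row)
    for row in features_df.select(existing).iter_rows(named=True)
}
print(features_lookup[102]["mz"])  # 301.1415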
@@ -3130,188 +3141,187 @@ def __merge_feature_lookup(study_obj, features_df):
def _get_features_matrix(study, consensus_data, quant_col="inty"):
    """
    Create a local intensity matrix from features_df for correlation calculations.

    Args:
        study: Study object with features_df and samples_df
        consensus_data: List of consensus feature dictionaries
        quant_col: Column name to use for quantification (default: "inty")

    Returns:
        pandas.DataFrame: Matrix with consensus_uid as index, sample names as columns
    """
    import pandas as pd
    import numpy as np

    # Get all sample names
    sample_names = study.samples_df["sample_name"].to_list()
    consensus_uids = [int(f["consensus_uid"]) for f in consensus_data]

    # Initialize matrix with zeros
    matrix_data = pd.DataFrame(
        index=pd.Index(consensus_uids, name="consensus_uid"), columns=sample_names, data=0.0, dtype=float
    )

    study.logger.debug(f"Building local features matrix: {len(consensus_uids)} features x {len(sample_names)} samples")

    # Fill matrix with actual intensity values
    features_df_pandas = study.features_df.to_pandas()
    samples_df_pandas = study.samples_df.to_pandas()
    consensus_mapping_pandas = study.consensus_mapping_df.to_pandas()

    # Create sample_uid to sample_name mapping
    uid_to_name = dict(zip(samples_df_pandas["sample_uid"], samples_df_pandas["sample_name"]))

    # For each consensus feature, get intensities from all samples
    for consensus_uid in consensus_uids:
        # Get all feature_uids that map to this consensus_uid
        feature_mappings = consensus_mapping_pandas[consensus_mapping_pandas["consensus_uid"] == consensus_uid]

        for _, mapping in feature_mappings.iterrows():
            feature_uid = mapping["feature_uid"]
            sample_uid = mapping["sample_uid"]
            sample_name = uid_to_name.get(sample_uid, f"sample_{sample_uid}")

            # Get intensity for this feature
            feature_row = features_df_pandas[
                (features_df_pandas["feature_uid"] == feature_uid) & (features_df_pandas["sample_uid"] == sample_uid)
            ]

            if len(feature_row) > 0:
                intensity = feature_row[quant_col].iloc[0]
                if pd.notna(intensity):
                    matrix_data.loc[consensus_uid, sample_name] = float(intensity)

    # Convert any remaining NaN to 0
    matrix_data = matrix_data.fillna(0.0)

    study.logger.debug(f"Local matrix built successfully with shape {matrix_data.shape}")

    return matrix_data


def _get_adduct_deltas_with_likelihood(study):
    """
    Extract all pairwise mass differences between adducts with joint likelihood scoring.

    Args:
        study: Study object with _get_adducts method

    Returns:
        List of tuples: (mass_delta, joint_likelihood, adduct1_name, adduct2_name)
        Sorted by joint_likelihood descending (most likely pairs first)
    """
    try:
        adducts_df = study._get_adducts()

        if adducts_df is None or adducts_df.is_empty():
            study.logger.warning("No adducts dataframe available for study")
            return []

        # Convert to pandas for easier manipulation
        adducts_pd = adducts_df.to_pandas()

        # Check if we have likelihood/probability information
        likelihood_col = None
        for col in ["likelihood", "probability", "freq", "frequency", "score"]:
            if col in adducts_pd.columns:
                likelihood_col = col
                break

        # If no likelihood column, estimate based on adduct type
        if likelihood_col is None:
            adducts_pd["estimated_likelihood"] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
            likelihood_col = "estimated_likelihood"

        # Get mass column (try different possible column names)
        mass_col = None
        for col_name in ["mass_shift", "mass", "mass_shift_da", "mass_da"]:
            if col_name in adducts_pd.columns:
                mass_col = col_name
                break

        if mass_col is None:
            study.logger.warning(
                f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}"
            )
            return []

        # Calculate all pairwise differences with joint likelihoods
        adduct_pairs = []
        for i in range(len(adducts_pd)):
            for j in range(i + 1, len(adducts_pd)):
                row_i = adducts_pd.iloc[i]
                row_j = adducts_pd.iloc[j]

                # Skip if masses are NaN or invalid
                if (hasattr(row_i[mass_col], "__iter__") and not isinstance(row_i[mass_col], str)) or (
                    hasattr(row_j[mass_col], "__iter__") and not isinstance(row_j[mass_col], str)
                ):
                    continue

                mass_i = float(row_i[mass_col])
                mass_j = float(row_j[mass_col])
                delta = abs(mass_i - mass_j)

                if delta > 0.1:  # Only meaningful mass differences
                    # Joint likelihood is sum of individual likelihoods
                    joint_likelihood = float(row_i[likelihood_col]) + float(row_j[likelihood_col])

                    adduct1_name = row_i.get("adduct", row_i.get("name", f"adduct_{i}"))
                    adduct2_name = row_j.get("adduct", row_j.get("name", f"adduct_{j}"))

                    # CRITICAL FIX: Order adducts consistently from lower mass to higher mass
                    # This ensures consistent assignment: lower mass adduct = from_adduct, higher mass adduct = to_adduct
                    if mass_i <= mass_j:
                        # row_i has lower or equal mass shift -> from_adduct
                        # row_j has higher mass shift -> to_adduct
                        adduct_pairs.append((round(delta, 4), joint_likelihood, adduct1_name, adduct2_name))
                    else:
                        # row_j has lower mass shift -> from_adduct
                        # row_i has higher mass shift -> to_adduct
                        adduct_pairs.append((round(delta, 4), joint_likelihood, adduct2_name, adduct1_name))

        # Sort by joint likelihood descending (most likely pairs first)
        adduct_pairs.sort(key=lambda x: x[1], reverse=True)

        study.logger.debug(f"Extracted {len(adduct_pairs)} adduct pairs with likelihood scoring")
        return adduct_pairs

    except Exception as e:
        study.logger.warning(
            f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list."
        )
        return []


def _estimate_adduct_likelihood(adduct_row):
    """
    Estimate likelihood of an adduct based on common knowledge.

    Args:
        adduct_row: pandas Series with adduct information

    Returns:
        float: Estimated likelihood (0.0 to 1.0)
    """
    adduct_name = str(adduct_row.get("adduct", adduct_row.get("name", ""))).lower()

    # Common likelihood estimates based on adduct frequency in positive mode
    likelihood_map = {
        "[m+h]": 0.9,  # Most common
        "[m+na]": 0.7,  # Very common
        "[m+nh4]": 0.6,  # Common
        "[m+k]": 0.3,  # Less common
        "[m+2h]": 0.2,  # Doubly charged, less frequent
        "[m+3h]": 0.1,  # Triply charged, rare
        "[m+h-h2o]": 0.4,  # Loss adducts, moderately common
    }

    # Find best match
    for pattern, likelihood in likelihood_map.items():
        if pattern in adduct_name:
            return likelihood

    # Default for unknown adducts
    return 0.2
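A compact sketch of the delta-with-likelihood idea implemented above, using a hard-coded three-adduct table (the likelihood numbers mirror the illustrative defaults in _estimate_adduct_likelihood; the masses are approximate):

# --- illustrative sketch, not part of masster ---
# name -> (mass_shift in Da, estimated likelihood)
adducts = {"[M+H]+": (1.00728, 0.9), "[M+Na]+": (22.98922, 0.7), "[M+K]+": (38.96316, 0.3)}

pairs = []
names = list(adducts)
for i in range(len(names)):
    for j in range(i + 1, len(names)):
        (m1, l1), (m2, l2) = adducts[names[i]], adducts[names[j]]
        delta = abs(m1 - m2)
        if delta > 0.1:
            # Order consistently: lower mass shift first, higher mass shift second
            lo, hi = (names[i], names[j]) if m1 <= m2 else (names[j], names[i])
            pairs.append((round(delta, 4), l1 + l2, lo, hi))

pairs.sort(key=lambda p: p[1], reverse=True)
for p in pairs:
    print(p)
# (21.9819, 1.6, '[M+H]+', '[M+Na]+') comes first: the H/Na pair is the most likely explanation.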
@@ -3319,10 +3329,10 @@ def _estimate_adduct_likelihood(adduct_row):
def _get_adduct_deltas(study):
    """
    Extract all pairwise mass differences between adducts from study adducts data.

    Args:
        study: Study object with _get_adducts method

    Returns:
        List of mass differences (deltas) for adduct filtering
    """
@@ -3338,15 +3348,15 @@ def _fast_correlation(vec1, vec2):
    """
    if len(vec1) != len(vec2):
        return 0.0

    # Remove NaN values and corresponding positions
    mask = ~(np.isnan(vec1) | np.isnan(vec2))
    if np.sum(mask) < 2:  # Need at least 2 valid points
        return 0.0

    v1 = vec1[mask]
    v2 = vec2[mask]

    # Fast correlation using numpy built-in
    try:
        corr_matrix = np.corrcoef(v1, v2)
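The correlation helper above boils down to masking NaNs jointly and calling numpy's corrcoef. A minimal standalone version of the same computation:

# --- illustrative sketch, not part of masster ---
import numpy as np

v1 = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
v2 = np.array([2.0, 4.1, 6.0, np.nan, 9.8])

# Keep only positions where both vectors are finite
mask = ~(np.isnan(v1) | np.isnan(v2))
if np.sum(mask) >= 2:
    r = np.corrcoef(v1[mask], v2[mask])[0, 1]
else:
    r = 0.0
print(round(r, 3))  # 1.0: the three overlapping points are almost perfectly collinear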
@@ -3365,45 +3375,47 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
    4. Hierarchical boss structure (prevent transitivity)
    5. Correlation-based confirmation
    6. Intensity-based ranking for final selection

    Args:
        study: Study object
        consensus_data: List of consensus feature dictionaries
        rt_tol: Retention time tolerance (seconds)
        mz_tol: M/z tolerance (Da)

    Returns:
        Tuple of (adduct_group_list, adduct_of_list)
    """
    if not consensus_data:
        return [], []

    n_features = len(consensus_data)
    study.logger.info(f"Starting adduct grouping for {n_features} features")

    # Step 1: Build local intensity matrix ONCE
    try:
        intensity_matrix_pd = _get_features_matrix(study, consensus_data, quant_col="inty")

        if intensity_matrix_pd is None or len(intensity_matrix_pd) == 0:
            study.logger.warning("Could not build local intensity matrix - creating single-feature groups")
            adduct_group_list = list(range(1, len(consensus_data) + 1))
            adduct_of_list = [0] * len(consensus_data)
            return adduct_group_list, adduct_of_list

        study.logger.debug(
            f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples"
        )

    except Exception as e:
        study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
        adduct_group_list = list(range(1, len(consensus_data) + 1))
        adduct_of_list = [0] * len(consensus_data)
        return adduct_group_list, adduct_of_list

    # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
    adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
    study.logger.debug(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")

    # Build hash map for O(1) mass shift lookup
    mass_shift_map = {}  # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
    for mass_delta, joint_likelihood, adduct1, adduct2 in adduct_pairs_with_likelihood:
@@ -3411,11 +3423,11 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
        if key not in mass_shift_map:
            mass_shift_map[key] = []
        mass_shift_map[key].append((joint_likelihood, adduct1, adduct2))

    # Sort each mass shift group by likelihood (highest first)
    for key in mass_shift_map:
        mass_shift_map[key].sort(key=lambda x: x[0], reverse=True)

    # Step 3: Pre-compute feature properties and sort by RT for spatial filtering
    feature_props = []
    for i, feature in enumerate(consensus_data):
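The hash map built just above replaces a linear scan over all adduct deltas with a rounded-key lookup. A sketch, with a hypothetical mz_tol and two made-up pairs, of how an observed m/z difference is quantised to a key and resolved to its best-scoring adduct pair:

# --- illustrative sketch, not part of masster ---
mz_tol = 0.005
# (mass_delta, joint_likelihood, from_adduct, to_adduct), as produced by the likelihood step
adduct_pairs = [(21.9819, 1.6, "[M+H]+", "[M+Na]+"), (17.0265, 1.5, "[M+H]+", "[M+NH4]+")]

mass_shift_map = {}
for delta, likelihood, a1, a2 in adduct_pairs:
    key = round(delta / mz_tol) * mz_tol  # quantise the delta to the m/z tolerance grid
    mass_shift_map.setdefault(key, []).append((likelihood, a1, a2))
for key in mass_shift_map:
    mass_shift_map[key].sort(reverse=True)  # best likelihood first

observed = abs(323.1229 - 301.1410)            # ~21.9819 Da between two coeluting features
key = round(observed / mz_tol) * mz_tol
print(mass_shift_map.get(key, [("no match",)])[0])  # (1.6, '[M+H]+', '[M+Na]+')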
@@ -3423,222 +3435,224 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
         rt = feature["rt"]
         mz = feature["mz"]
         intensity = feature.get("inty_mean", 0)
-
+
         # Get matrix vector once
         matrix_vector = intensity_matrix_pd.loc[uid].values if uid in intensity_matrix_pd.index else None
-
+
         feature_props.append({
-
-
-
-
-
-
-
+            "index": i,
+            "uid": uid,
+            "rt": rt,
+            "mz": mz,
+            "intensity": intensity,
+            "vector": matrix_vector,
+            "feature": feature,
         })
-
+
     # Sort by RT for efficient spatial filtering
-    feature_props.sort(key=lambda x: x[
-
+    feature_props.sort(key=lambda x: x["rt"])
+
     # Initialize grouping structures
     uid_to_boss = {}  # Hierarchical structure: uid -> boss_uid
     boss_to_members = {}  # boss_uid -> [member_uids]
     processed_uids = set()
-
+
     # Step 4: Process features with optimized RT filtering
     for i, boss_prop in enumerate(feature_props):
-        boss_uid = boss_prop[
-
+        boss_uid = boss_prop["uid"]
+
         if boss_uid in processed_uids:
             continue
-
-        if boss_prop[
+
+        if boss_prop["vector"] is None:
             processed_uids.add(boss_uid)
             continue
-
+
         # Initialize as boss
         if boss_uid not in uid_to_boss:
             uid_to_boss[boss_uid] = boss_uid
             boss_to_members[boss_uid] = []
-
-        boss_rt = boss_prop[
-        boss_mz = boss_prop[
-        boss_vector = boss_prop[
-
+
+        boss_rt = boss_prop["rt"]
+        boss_mz = boss_prop["mz"]
+        boss_vector = boss_prop["vector"]
+
         # Step 5: Efficient RT coelution filtering using sorted array
         candidate_pairs = []
-
+
         # Search backwards from current position
         j = i - 1
-        while j >= 0 and (boss_rt - feature_props[j][
+        while j >= 0 and (boss_rt - feature_props[j]["rt"]) <= rt_tol:
             candidate = feature_props[j]
-            if candidate[
-                if candidate[
+            if candidate["uid"] not in processed_uids and candidate["vector"] is not None:
+                if candidate["uid"] not in uid_to_boss or uid_to_boss[candidate["uid"]] == candidate["uid"]:
                     # Calculate mz difference and check mass shift
-                    mz_diff = abs(boss_mz - candidate[
+                    mz_diff = abs(boss_mz - candidate["mz"])
                     mass_shift_key = round(mz_diff / mz_tol) * mz_tol
-
+
                     if mass_shift_key in mass_shift_map:
                         likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0]  # Best likelihood
                         candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
             j -= 1
-
+
         # Search forwards from current position
         j = i + 1
-        while j < len(feature_props) and (feature_props[j][
+        while j < len(feature_props) and (feature_props[j]["rt"] - boss_rt) <= rt_tol:
             candidate = feature_props[j]
-            if candidate[
-                if candidate[
+            if candidate["uid"] not in processed_uids and candidate["vector"] is not None:
+                if candidate["uid"] not in uid_to_boss or uid_to_boss[candidate["uid"]] == candidate["uid"]:
                     # Calculate mz difference and check mass shift
-                    mz_diff = abs(boss_mz - candidate[
+                    mz_diff = abs(boss_mz - candidate["mz"])
                     mass_shift_key = round(mz_diff / mz_tol) * mz_tol
-
+
                     if mass_shift_key in mass_shift_map:
                         likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0]  # Best likelihood
                         candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
             j += 1
-
+
         # Sort candidates by likelihood (descending) to prioritize chemically meaningful pairs
         candidate_pairs.sort(key=lambda x: x[1], reverse=True)
-
+
         # Step 6: Process candidates in likelihood priority order
         for candidate_prop, likelihood, adduct_info in candidate_pairs:
-            candidate_uid = candidate_prop[
-            candidate_vector = candidate_prop[
-
+            candidate_uid = candidate_prop["uid"]
+            candidate_vector = candidate_prop["vector"]
+
            # Correlation confirmation with optimized threshold
             try:
                 correlation = _fast_correlation(boss_vector, candidate_vector)
-
+
                 if correlation < 0.5:  # More permissive for legitimate adduct relationships
                     continue
-
+
             except Exception:
                 continue
-
+
             # Step 7: Hierarchical assignment (merge groups if needed)
             if candidate_uid in boss_to_members:
                 old_members = boss_to_members[candidate_uid].copy()
                 del boss_to_members[candidate_uid]
-
+
                 # Reassign old members to new boss
                 for member in old_members:
                     uid_to_boss[member] = boss_uid
                     boss_to_members[boss_uid].append(member)
-
+
             # Assign candidate to current boss
             uid_to_boss[candidate_uid] = boss_uid
             boss_to_members[boss_uid].append(candidate_uid)
             processed_uids.add(candidate_uid)
-
+
         processed_uids.add(boss_uid)
-
+
     # Step 8: Intensity-based ranking within groups (optimized)
     for boss_uid in list(boss_to_members.keys()):
         members = boss_to_members[boss_uid]
         if len(members) == 0:
             continue
-
+
         all_group_members = [boss_uid] + members
-
+
         # Find member with highest intensity efficiently
         max_intensity = -1
         new_boss = boss_uid
-
+
         for member_uid in all_group_members:
             # Find member_uid in feature_props
-            member_intensity = next((fp[
+            member_intensity = next((fp["intensity"] for fp in feature_props if fp["uid"] == member_uid), 0)
             if member_intensity > max_intensity:
                 max_intensity = member_intensity
                 new_boss = member_uid
-
+
         # Update boss if needed
         if new_boss != boss_uid:
             boss_to_members[new_boss] = [m for m in all_group_members if m != new_boss]
             del boss_to_members[boss_uid]
-
+
             # Update all member references
             for member in all_group_members:
                 uid_to_boss[member] = new_boss
-
+
     # Count and log results
     total_groups = len(boss_to_members)
     multi_member_groups = sum(1 for members in boss_to_members.values() if len(members) > 0)
     total_grouped_features = sum(len(members) + 1 for members in boss_to_members.values())
-
-    study.logger.info(
-
+
+    study.logger.info(
+        f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)"
+    )
+
     # Step 9: Convert to return format (optimized)
-    uid_to_index = {fp[
+    uid_to_index = {fp["uid"]: fp["index"] for fp in feature_props}
     adduct_group_list = [0] * n_features
     adduct_of_list = [0] * n_features
-
+
     group_counter = 1
     for boss_uid, members in boss_to_members.items():
         # Assign boss
         boss_idx = uid_to_index[boss_uid]
         adduct_group_list[boss_idx] = group_counter
         adduct_of_list[boss_idx] = 0
-
+
         # Assign members
         for member_uid in members:
             member_idx = uid_to_index[member_uid]
             adduct_group_list[member_idx] = group_counter
             adduct_of_list[member_idx] = boss_uid
-
+
         group_counter += 1
-
+
     # Handle ungrouped features
     for i in range(n_features):
         if adduct_group_list[i] == 0:
             adduct_group_list[i] = group_counter
             adduct_of_list[i] = 0
             group_counter += 1
-
+
     return adduct_group_list, adduct_of_list
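The grouping loop above keeps a flat leader/member ("boss") structure rather than a full union-find: every feature uid points at its group leader, and when a candidate that already leads a group is absorbed, its members are simply re-pointed at the new leader. A minimal sketch of that bookkeeping follows; the feature uids F1–F4 and the helper name link are hypothetical and do not exist in masster.

# Illustrative sketch (not part of masster): flat "boss"/member bookkeeping.
uid_to_boss = {}      # uid -> leader uid
boss_to_members = {}  # leader uid -> list of member uids

def link(boss_uid, candidate_uid):
    # Hypothetical helper, not a masster function.
    uid_to_boss.setdefault(boss_uid, boss_uid)
    boss_to_members.setdefault(boss_uid, [])
    # Absorb any group the candidate currently leads.
    for member in boss_to_members.pop(candidate_uid, []):
        uid_to_boss[member] = boss_uid
        boss_to_members[boss_uid].append(member)
    uid_to_boss[candidate_uid] = boss_uid
    boss_to_members[boss_uid].append(candidate_uid)

link("F1", "F2")
link("F3", "F4")
link("F1", "F3")          # F3's member F4 is re-assigned to F1
print(boss_to_members)    # {'F1': ['F2', 'F4', 'F3']}
print(uid_to_boss)        # all four uids now map to 'F1'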
|
3601
3615
|
def _fast_correlation(x, y):
|
|
3602
3616
|
"""
|
|
3603
3617
|
Fast correlation coefficient calculation for consensus matrix data.
|
|
3604
|
-
|
|
3618
|
+
|
|
3605
3619
|
In the consensus matrix:
|
|
3606
|
-
- Negative values (typically -1.0) indicate missing features
|
|
3620
|
+
- Negative values (typically -1.0) indicate missing features
|
|
3607
3621
|
- Zero and positive values are actual intensities
|
|
3608
3622
|
- Only consider intensities >= 1000 for meaningful correlation
|
|
3609
|
-
|
|
3623
|
+
|
|
3610
3624
|
Args:
|
|
3611
3625
|
x, y: numpy arrays of the same length
|
|
3612
|
-
|
|
3626
|
+
|
|
3613
3627
|
Returns:
|
|
3614
3628
|
Correlation coefficient (float), 0 if cannot be calculated
|
|
3615
3629
|
"""
|
|
3616
3630
|
import numpy as np
|
|
3617
|
-
|
|
3631
|
+
|
|
3618
3632
|
# For consensus matrix: exclude negative values (missing features) and very low intensities
|
|
3619
|
-
# Use a very low threshold since processed matrix values are often scaled/normalized
|
|
3633
|
+
# Use a very low threshold since processed matrix values are often scaled/normalized
|
|
3620
3634
|
valid = ~(np.isnan(x) | np.isnan(y) | (x < 0) | (y < 0) | (x < 0.1) | (y < 0.1))
|
|
3621
|
-
|
|
3635
|
+
|
|
3622
3636
|
if np.sum(valid) < 3: # Need at least 3 valid pairs
|
|
3623
3637
|
return 0.0
|
|
3624
|
-
|
|
3638
|
+
|
|
3625
3639
|
x_valid = x[valid]
|
|
3626
3640
|
y_valid = y[valid]
|
|
3627
|
-
|
|
3641
|
+
|
|
3628
3642
|
# If all values are the same (e.g., all zeros), correlation is undefined
|
|
3629
3643
|
if np.var(x_valid) == 0 or np.var(y_valid) == 0:
|
|
3630
3644
|
return 0.0
|
|
3631
|
-
|
|
3645
|
+
|
|
3632
3646
|
# Fast correlation using numpy
|
|
3633
3647
|
try:
|
|
3634
3648
|
correlation_matrix = np.corrcoef(x_valid, y_valid)
|
|
3635
3649
|
correlation = correlation_matrix[0, 1]
|
|
3636
|
-
|
|
3650
|
+
|
|
3637
3651
|
# Handle NaN result
|
|
3638
3652
|
if np.isnan(correlation):
|
|
3639
3653
|
return 0.0
|
|
3640
|
-
|
|
3654
|
+
|
|
3641
3655
|
return correlation
|
|
3642
|
-
|
|
3656
|
+
|
|
3643
3657
|
except Exception:
|
|
3644
3658
|
return 0.0
|