masster 0.4.18__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of masster might be problematic.
- masster/__init__.py +0 -1
- masster/_version.py +1 -1
- masster/logger.py +42 -0
- masster/sample/load.py +6 -5
- masster/sample/sample.py +0 -9
- masster/study/defaults/merge_def.py +43 -2
- masster/study/helpers.py +52 -11
- masster/study/merge.py +1418 -105
- masster/study/plot.py +11 -5
- masster/study/study.py +18 -0
- masster/wizard/__init__.py +5 -2
- masster/wizard/wizard.py +1199 -27
- {masster-0.4.18.dist-info → masster-0.4.20.dist-info}/METADATA +1 -1
- {masster-0.4.18.dist-info → masster-0.4.20.dist-info}/RECORD +17 -18
- masster/wizard.py +0 -1175
- {masster-0.4.18.dist-info → masster-0.4.20.dist-info}/WHEEL +0 -0
- {masster-0.4.18.dist-info → masster-0.4.20.dist-info}/entry_points.txt +0 -0
- {masster-0.4.18.dist-info → masster-0.4.20.dist-info}/licenses/LICENSE +0 -0
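The bulk of this release is the parallel chunked merge added to masster/study/merge.py, diffed below. As a quick orientation, a minimal usage sketch based only on the docstring example and the threads handling visible in that diff; the construction of the study object itself is not part of this diff and is assumed:

    # 'study' is assumed to be an already loaded masster study with samples and features.
    study.merge(method='kd_chunked', threads=4, chunk_size=200)  # chunks processed in 4 worker processes
    study.merge(method='qt_chunked')                             # omitting threads keeps the original sequential path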
masster/study/merge.py
CHANGED
|
@@ -10,9 +10,270 @@ from datetime import datetime
|
|
|
10
10
|
from tqdm import tqdm
|
|
11
11
|
import pyopenms as oms
|
|
12
12
|
import polars as pl
|
|
13
|
+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
|
14
|
+
from concurrent.futures.process import BrokenProcessPool
|
|
13
15
|
from masster.study.defaults import merge_defaults
|
|
14
16
|
|
|
15
17
|
|
|
18
|
+
def _process_kd_chunk_parallel(chunk_data):
|
|
19
|
+
"""
|
|
20
|
+
Process a single KD chunk in parallel by reconstructing FeatureMaps from features_df slice.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
chunk_data: Dictionary containing chunk processing parameters
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
Tuple of (chunk_start_idx, serialized_consensus_features)
|
|
27
|
+
"""
|
|
28
|
+
import pyopenms as oms
|
|
29
|
+
|
|
30
|
+
chunk_start_idx = chunk_data['chunk_start_idx']
|
|
31
|
+
chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
|
|
32
|
+
chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
|
|
33
|
+
params_dict = chunk_data['params']
|
|
34
|
+
|
|
35
|
+
# Reconstruct FeatureMaps from features data for each sample in the chunk
|
|
36
|
+
chunk_maps = []
|
|
37
|
+
|
|
38
|
+
for sample_data in chunk_samples_data:
|
|
39
|
+
sample_uid = sample_data['sample_uid']
|
|
40
|
+
|
|
41
|
+
# Filter features for this specific sample
|
|
42
|
+
sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
|
|
43
|
+
|
|
44
|
+
# Create FeatureMap for this sample
|
|
45
|
+
feature_map = oms.FeatureMap()
|
|
46
|
+
|
|
47
|
+
# Add each feature to the map
|
|
48
|
+
for feature_dict in sample_features:
|
|
49
|
+
feature = oms.Feature()
|
|
50
|
+
feature.setRT(float(feature_dict['rt']))
|
|
51
|
+
feature.setMZ(float(feature_dict['mz']))
|
|
52
|
+
feature.setIntensity(float(feature_dict['inty']))
|
|
53
|
+
feature.setCharge(int(feature_dict.get('charge', 0)))
|
|
54
|
+
|
|
55
|
+
# Set unique ID using feature_id for mapping back
|
|
56
|
+
feature.setUniqueId(int(feature_dict['feature_id']))
|
|
57
|
+
|
|
58
|
+
feature_map.push_back(feature)
|
|
59
|
+
|
|
60
|
+
chunk_maps.append(feature_map)
|
|
61
|
+
|
|
62
|
+
# Create the chunk consensus map
|
|
63
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
64
|
+
|
|
65
|
+
# Set up file descriptions for chunk
|
|
66
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
67
|
+
for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
|
|
68
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
69
|
+
file_description.filename = sample_data['sample_name']
|
|
70
|
+
file_description.size = feature_map.size()
|
|
71
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
72
|
+
file_descriptions[j] = file_description
|
|
73
|
+
|
|
74
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
75
|
+
|
|
76
|
+
# Use KD algorithm for chunk
|
|
77
|
+
grouper = oms.FeatureGroupingAlgorithmKD()
|
|
78
|
+
chunk_params = grouper.getParameters()
|
|
79
|
+
chunk_params.setValue("mz_unit", "Da")
|
|
80
|
+
chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
|
|
81
|
+
chunk_params.setValue("warp:enabled", "true")
|
|
82
|
+
chunk_params.setValue("warp:rt_tol", params_dict['rt_tol'])
|
|
83
|
+
chunk_params.setValue("warp:mz_tol", params_dict['mz_tol'])
|
|
84
|
+
chunk_params.setValue("link:rt_tol", params_dict['rt_tol'])
|
|
85
|
+
chunk_params.setValue("link:mz_tol", params_dict['mz_tol'])
|
|
86
|
+
chunk_params.setValue("link:min_rel_cc_size", params_dict['min_rel_cc_size'])
|
|
87
|
+
chunk_params.setValue("link:max_pairwise_log_fc", params_dict['max_pairwise_log_fc'])
|
|
88
|
+
chunk_params.setValue("link:max_nr_conflicts", params_dict['max_nr_conflicts'])
|
|
89
|
+
|
|
90
|
+
grouper.setParameters(chunk_params)
|
|
91
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
92
|
+
|
|
93
|
+
# Serialize the consensus map result for cross-process communication
|
|
94
|
+
consensus_features = []
|
|
95
|
+
for consensus_feature in chunk_consensus_map:
|
|
96
|
+
feature_data = {
|
|
97
|
+
'rt': consensus_feature.getRT(),
|
|
98
|
+
'mz': consensus_feature.getMZ(),
|
|
99
|
+
'intensity': consensus_feature.getIntensity(),
|
|
100
|
+
'quality': consensus_feature.getQuality(),
|
|
101
|
+
'unique_id': str(consensus_feature.getUniqueId()),
|
|
102
|
+
'features': []
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
# Get constituent features
|
|
106
|
+
for feature_handle in consensus_feature.getFeatureList():
|
|
107
|
+
feature_handle_data = {
|
|
108
|
+
'unique_id': str(feature_handle.getUniqueId()),
|
|
109
|
+
'map_index': feature_handle.getMapIndex()
|
|
110
|
+
}
|
|
111
|
+
feature_data['features'].append(feature_handle_data)
|
|
112
|
+
|
|
113
|
+
consensus_features.append(feature_data)
|
|
114
|
+
|
|
115
|
+
return chunk_start_idx, consensus_features
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _deserialize_consensus_features(consensus_features):
|
|
119
|
+
"""
|
|
120
|
+
Deserialize consensus features back into an OpenMS ConsensusMap.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
consensus_features: List of serialized consensus feature dictionaries
|
|
124
|
+
|
|
125
|
+
Returns:
|
|
126
|
+
OpenMS ConsensusMap object
|
|
127
|
+
"""
|
|
128
|
+
import pyopenms as oms
|
|
129
|
+
|
|
130
|
+
consensus_map = oms.ConsensusMap()
|
|
131
|
+
|
|
132
|
+
for feature_data in consensus_features:
|
|
133
|
+
consensus_feature = oms.ConsensusFeature()
|
|
134
|
+
consensus_feature.setRT(float(feature_data['rt']))
|
|
135
|
+
consensus_feature.setMZ(float(feature_data['mz']))
|
|
136
|
+
consensus_feature.setIntensity(float(feature_data['intensity']))
|
|
137
|
+
consensus_feature.setQuality(float(feature_data['quality']))
|
|
138
|
+
consensus_feature.setUniqueId(int(feature_data['unique_id']))
|
|
139
|
+
|
|
140
|
+
# Reconstruct feature handles (simplified approach)
|
|
141
|
+
feature_handles = []
|
|
142
|
+
for handle_data in feature_data['features']:
|
|
143
|
+
feature_handle = oms.FeatureHandle()
|
|
144
|
+
feature_handle.setUniqueId(int(handle_data['unique_id']))
|
|
145
|
+
feature_handle.setMapIndex(int(handle_data['map_index']))
|
|
146
|
+
feature_handles.append(feature_handle)
|
|
147
|
+
|
|
148
|
+
# Set the feature list - properly add feature handles back to consensus feature
|
|
149
|
+
if feature_handles:
|
|
150
|
+
# Add each feature handle to the consensus feature using the correct OpenMS API
|
|
151
|
+
for feature_handle in feature_handles:
|
|
152
|
+
consensus_feature.getFeatureList().append(feature_handle)
|
|
153
|
+
|
|
154
|
+
consensus_map.push_back(consensus_feature)
|
|
155
|
+
|
|
156
|
+
return consensus_map
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _process_qt_chunk_parallel(chunk_data):
|
|
160
|
+
"""
|
|
161
|
+
Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
|
|
162
|
+
|
|
163
|
+
Args:
|
|
164
|
+
chunk_data: Dictionary containing chunk processing parameters
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
Tuple of (chunk_start_idx, serialized_consensus_features)
|
|
168
|
+
"""
|
|
169
|
+
import pyopenms as oms
|
|
170
|
+
|
|
171
|
+
chunk_start_idx = chunk_data['chunk_start_idx']
|
|
172
|
+
chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
|
|
173
|
+
chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
|
|
174
|
+
params_dict = chunk_data['params']
|
|
175
|
+
|
|
176
|
+
# Reconstruct FeatureMaps from features data for each sample in the chunk
|
|
177
|
+
chunk_maps = []
|
|
178
|
+
|
|
179
|
+
for sample_data in chunk_samples_data:
|
|
180
|
+
sample_uid = sample_data['sample_uid']
|
|
181
|
+
|
|
182
|
+
# Filter features for this specific sample
|
|
183
|
+
sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
|
|
184
|
+
|
|
185
|
+
# Create FeatureMap for this sample
|
|
186
|
+
feature_map = oms.FeatureMap()
|
|
187
|
+
|
|
188
|
+
# Add each feature to the map
|
|
189
|
+
for feature_dict in sample_features:
|
|
190
|
+
feature = oms.Feature()
|
|
191
|
+
feature.setRT(float(feature_dict['rt']))
|
|
192
|
+
feature.setMZ(float(feature_dict['mz']))
|
|
193
|
+
feature.setIntensity(float(feature_dict['inty']))
|
|
194
|
+
feature.setCharge(int(feature_dict.get('charge', 0)))
|
|
195
|
+
|
|
196
|
+
# Set unique ID using feature_id for mapping back
|
|
197
|
+
feature.setUniqueId(int(feature_dict['feature_id']))
|
|
198
|
+
|
|
199
|
+
feature_map.push_back(feature)
|
|
200
|
+
|
|
201
|
+
chunk_maps.append(feature_map)
|
|
202
|
+
|
|
203
|
+
# Create the chunk consensus map
|
|
204
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
205
|
+
|
|
206
|
+
# Set up file descriptions for chunk
|
|
207
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
208
|
+
for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
|
|
209
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
210
|
+
file_description.filename = sample_data['sample_name']
|
|
211
|
+
file_description.size = feature_map.size()
|
|
212
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
213
|
+
file_descriptions[j] = file_description
|
|
214
|
+
|
|
215
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
216
|
+
|
|
217
|
+
# Use QT algorithm for chunk
|
|
218
|
+
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
219
|
+
chunk_params = grouper.getParameters()
|
|
220
|
+
chunk_params.setValue("distance_RT:max_difference", params_dict['rt_tol'])
|
|
221
|
+
chunk_params.setValue("distance_MZ:max_difference", params_dict['mz_tol'])
|
|
222
|
+
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
223
|
+
chunk_params.setValue("ignore_charge", "true")
|
|
224
|
+
chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
|
|
225
|
+
|
|
226
|
+
grouper.setParameters(chunk_params)
|
|
227
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
228
|
+
|
|
229
|
+
# Serialize the consensus map result for cross-process communication
|
|
230
|
+
consensus_features = []
|
|
231
|
+
for consensus_feature in chunk_consensus_map:
|
|
232
|
+
feature_data = {
|
|
233
|
+
'rt': consensus_feature.getRT(),
|
|
234
|
+
'mz': consensus_feature.getMZ(),
|
|
235
|
+
'intensity': consensus_feature.getIntensity(),
|
|
236
|
+
'quality': consensus_feature.getQuality(),
|
|
237
|
+
'unique_id': str(consensus_feature.getUniqueId()),
|
|
238
|
+
'features': []
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
# Get constituent features
|
|
242
|
+
for feature_handle in consensus_feature.getFeatureList():
|
|
243
|
+
feature_handle_data = {
|
|
244
|
+
'unique_id': str(feature_handle.getUniqueId()),
|
|
245
|
+
'map_index': feature_handle.getMapIndex()
|
|
246
|
+
}
|
|
247
|
+
feature_data['features'].append(feature_handle_data)
|
|
248
|
+
|
|
249
|
+
consensus_features.append(feature_data)
|
|
250
|
+
|
|
251
|
+
return chunk_start_idx, consensus_features
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _serialize_feature_map(feature_map):
|
|
255
|
+
"""
|
|
256
|
+
Serialize a FeatureMap to a list of dictionaries for multiprocessing.
|
|
257
|
+
|
|
258
|
+
Args:
|
|
259
|
+
feature_map: OpenMS FeatureMap object
|
|
260
|
+
|
|
261
|
+
Returns:
|
|
262
|
+
List of feature dictionaries
|
|
263
|
+
"""
|
|
264
|
+
features_data = []
|
|
265
|
+
for feature in feature_map:
|
|
266
|
+
feature_data = {
|
|
267
|
+
'rt': feature.getRT(),
|
|
268
|
+
'mz': feature.getMZ(),
|
|
269
|
+
'intensity': feature.getIntensity(),
|
|
270
|
+
'charge': feature.getCharge(),
|
|
271
|
+
'unique_id': feature.getUniqueId()
|
|
272
|
+
}
|
|
273
|
+
features_data.append(feature_data)
|
|
274
|
+
return features_data
|
|
275
|
+
|
|
276
|
+
|
|
16
277
|
def merge(self, **kwargs) -> None:
|
|
17
278
|
"""
|
|
18
279
|
Group features across samples into consensus features using various algorithms.
|
|
@@ -34,6 +295,8 @@ def merge(self, **kwargs) -> None:
|
|
|
34
295
|
m/z tolerance in Da (Daltons) for all methods
|
|
35
296
|
- chunk_size : int, default 500
|
|
36
297
|
Chunk size for 'chunked' method
|
|
298
|
+
- threads : int, default 1
|
|
299
|
+
Number of parallel processes for chunked methods (kd_chunked, qt_chunked)
|
|
37
300
|
- nr_partitions : int, default 500
|
|
38
301
|
Number of partitions in m/z dimension for KD algorithms
|
|
39
302
|
- min_rel_cc_size : float, default 0.3
|
|
@@ -54,9 +317,19 @@ def merge(self, **kwargs) -> None:
|
|
|
54
317
|
- NoWarp: Memory efficient KD without RT warping for large datasets
|
|
55
318
|
- KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
|
|
56
319
|
Uses optimized partitioning for better memory management while maintaining
|
|
57
|
-
full cross-sample consensus feature detection.
|
|
320
|
+
full cross-sample consensus feature detection. Supports parallel processing.
|
|
58
321
|
- QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
|
|
59
322
|
Uses QT clustering in first stage with optimized cross-chunk consensus building.
|
|
323
|
+
Supports parallel processing.
|
|
324
|
+
|
|
325
|
+
Parallel Processing
|
|
326
|
+
------------------
|
|
327
|
+
For kd_chunked and qt_chunked methods, use threads > 1 to enable parallel processing
|
|
328
|
+
of chunk alignments. This can significantly reduce processing time for large datasets
|
|
329
|
+
by processing multiple chunks simultaneously in separate processes.
|
|
330
|
+
|
|
331
|
+
Example:
|
|
332
|
+
study.merge(method='kd_chunked', threads=4, chunk_size=200)
|
|
60
333
|
"""
|
|
61
334
|
start_time = time.time()
|
|
62
335
|
|
|
@@ -95,6 +368,17 @@ def merge(self, **kwargs) -> None:
|
|
|
95
368
|
if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
|
|
96
369
|
raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
|
|
97
370
|
|
|
371
|
+
# Check if chunked method is advisable for large datasets
|
|
372
|
+
num_samples = len(self.samples_df) if hasattr(self, 'samples_df') and self.samples_df is not None else 0
|
|
373
|
+
if num_samples > 500:
|
|
374
|
+
chunked_methods = {'kd_chunked', 'qt_chunked'}
|
|
375
|
+
if params.method not in chunked_methods:
|
|
376
|
+
self.logger.warning(
|
|
377
|
+
f"Large dataset detected ({num_samples} samples > 500). "
|
|
378
|
+
f"For better performance and memory efficiency, consider using a chunked method: "
|
|
379
|
+
f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
|
|
380
|
+
)
|
|
381
|
+
|
|
98
382
|
# Persist last used params for diagnostics
|
|
99
383
|
try:
|
|
100
384
|
self._merge_params_last = params.to_dict()
|
|
@@ -113,7 +397,7 @@ def merge(self, **kwargs) -> None:
|
|
|
113
397
|
# Ensure feature maps are available for merging (regenerate if needed)
|
|
114
398
|
if len(self.features_maps) < len(self.samples_df):
|
|
115
399
|
self.features_maps = []
|
|
116
|
-
|
|
400
|
+
# Feature maps will be generated on-demand within each merge method
|
|
117
401
|
|
|
118
402
|
self.logger.info(
|
|
119
403
|
f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da, min_rel_cc_size={params.min_rel_cc_size}, max_pairwise_log_fc={params.max_pairwise_log_fc}, max_nr_conflicts={params.max_nr_conflicts}"
|
|
@@ -161,9 +445,16 @@ def merge(self, **kwargs) -> None:
|
|
|
161
445
|
consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
|
|
162
446
|
# Note: _merge_qt_chunked populates consensus_df directly, no need to extract
|
|
163
447
|
|
|
448
|
+
# Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
|
|
449
|
+
if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked']:
|
|
450
|
+
self._consensus_cleanup(params.rt_tol, params.mz_tol)
|
|
451
|
+
|
|
164
452
|
# Perform adduct grouping
|
|
165
453
|
self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
|
|
166
454
|
|
|
455
|
+
# Identify coeluting consensus features by mass shifts and update adduct information
|
|
456
|
+
self._identify_adduct_by_mass_shift(params.rt_tol, cached_adducts_df)
|
|
457
|
+
|
|
167
458
|
# Link MS2 if requested
|
|
168
459
|
if params.link_ms2:
|
|
169
460
|
self._finalize_merge(params.link_ms2, params.min_samples)
|
|
@@ -176,10 +467,13 @@ def merge(self, **kwargs) -> None:
|
|
|
176
467
|
def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
177
468
|
"""KD-tree based merge (fast, recommended)"""
|
|
178
469
|
|
|
470
|
+
# Generate temporary feature maps on-demand from features_df
|
|
471
|
+
temp_feature_maps = _generate_feature_maps_on_demand(self)
|
|
472
|
+
|
|
179
473
|
consensus_map = oms.ConsensusMap()
|
|
180
474
|
file_descriptions = consensus_map.getColumnHeaders()
|
|
181
475
|
|
|
182
|
-
for i, feature_map in enumerate(
|
|
476
|
+
for i, feature_map in enumerate(temp_feature_maps):
|
|
183
477
|
file_description = file_descriptions.get(i, oms.ColumnHeader())
|
|
184
478
|
file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
|
|
185
479
|
file_description.size = feature_map.size()
|
|
@@ -205,22 +499,145 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
205
499
|
#params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
|
|
206
500
|
|
|
207
501
|
grouper.setParameters(params_oms)
|
|
208
|
-
grouper.group(
|
|
502
|
+
grouper.group(temp_feature_maps, consensus_map)
|
|
209
503
|
|
|
210
504
|
return consensus_map
|
|
211
505
|
|
|
212
506
|
|
|
507
|
+
def _generate_feature_maps_on_demand(study):
|
|
508
|
+
"""
|
|
509
|
+
Generate feature maps on-demand from study.features_df for merge operations.
|
|
510
|
+
Returns temporary feature maps that are not cached in the study.
|
|
511
|
+
|
|
512
|
+
Args:
|
|
513
|
+
study: Study object containing features_df and samples_df
|
|
514
|
+
|
|
515
|
+
Returns:
|
|
516
|
+
list: List of temporary FeatureMap objects
|
|
517
|
+
"""
|
|
518
|
+
import polars as pl
|
|
519
|
+
import pyopenms as oms
|
|
520
|
+
import numpy as np
|
|
521
|
+
|
|
522
|
+
if study.features_df is None or len(study.features_df) == 0:
|
|
523
|
+
study.logger.error("No features_df available for generating feature maps")
|
|
524
|
+
return []
|
|
525
|
+
|
|
526
|
+
temp_feature_maps = []
|
|
527
|
+
n_samples = len(study.samples_df)
|
|
528
|
+
n_features = len(study.features_df)
|
|
529
|
+
|
|
530
|
+
# Performance optimization: use efficient polars groupby for large datasets
|
|
531
|
+
use_groupby_optimization = n_features > 5000
|
|
532
|
+
if use_groupby_optimization:
|
|
533
|
+
study.logger.debug(f"Using polars groupby optimization for {n_features} features across {n_samples} samples")
|
|
534
|
+
|
|
535
|
+
# Pre-group features by sample_uid - this is much more efficient than repeated filtering
|
|
536
|
+
features_by_sample = study.features_df.group_by("sample_uid").agg([
|
|
537
|
+
pl.col("feature_id"),
|
|
538
|
+
pl.col("mz"),
|
|
539
|
+
pl.col("rt"),
|
|
540
|
+
pl.col("inty"),
|
|
541
|
+
pl.col("quality").fill_null(1.0),
|
|
542
|
+
pl.col("charge").fill_null(0)
|
|
543
|
+
])
|
|
544
|
+
|
|
545
|
+
# Convert to dictionary for fast lookups
|
|
546
|
+
sample_feature_dict = {}
|
|
547
|
+
for row in features_by_sample.iter_rows(named=True):
|
|
548
|
+
sample_uid = row["sample_uid"]
|
|
549
|
+
# Convert lists to numpy arrays for vectorized operations
|
|
550
|
+
sample_feature_dict[sample_uid] = {
|
|
551
|
+
"feature_id": np.array(row["feature_id"]),
|
|
552
|
+
"mz": np.array(row["mz"]),
|
|
553
|
+
"rt": np.array(row["rt"]),
|
|
554
|
+
"inty": np.array(row["inty"]),
|
|
555
|
+
"quality": np.array(row["quality"]),
|
|
556
|
+
"charge": np.array(row["charge"])
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
# Process each sample in order
|
|
560
|
+
for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
|
|
561
|
+
sample_uid = row_dict["sample_uid"]
|
|
562
|
+
|
|
563
|
+
if use_groupby_optimization:
|
|
564
|
+
# Use pre-grouped data with vectorized operations
|
|
565
|
+
if sample_uid not in sample_feature_dict:
|
|
566
|
+
feature_map = oms.FeatureMap()
|
|
567
|
+
temp_feature_maps.append(feature_map)
|
|
568
|
+
continue
|
|
569
|
+
|
|
570
|
+
sample_data = sample_feature_dict[sample_uid]
|
|
571
|
+
n_sample_features = len(sample_data["feature_id"])
|
|
572
|
+
|
|
573
|
+
if n_sample_features == 0:
|
|
574
|
+
feature_map = oms.FeatureMap()
|
|
575
|
+
temp_feature_maps.append(feature_map)
|
|
576
|
+
continue
|
|
577
|
+
|
|
578
|
+
# Create new FeatureMap
|
|
579
|
+
feature_map = oms.FeatureMap()
|
|
580
|
+
|
|
581
|
+
# Use vectorized data directly (no conversion needed)
|
|
582
|
+
for i in range(n_sample_features):
|
|
583
|
+
try:
|
|
584
|
+
feature = oms.Feature()
|
|
585
|
+
feature.setUniqueId(int(sample_data["feature_id"][i]))
|
|
586
|
+
feature.setMZ(float(sample_data["mz"][i]))
|
|
587
|
+
feature.setRT(float(sample_data["rt"][i]))
|
|
588
|
+
feature.setIntensity(float(sample_data["inty"][i]))
|
|
589
|
+
feature.setOverallQuality(float(sample_data["quality"][i]))
|
|
590
|
+
feature.setCharge(int(sample_data["charge"][i]))
|
|
591
|
+
feature_map.push_back(feature)
|
|
592
|
+
except (ValueError, TypeError) as e:
|
|
593
|
+
study.logger.warning(f"Skipping feature due to conversion error: {e}")
|
|
594
|
+
continue
|
|
595
|
+
else:
|
|
596
|
+
# Use original polars-based approach for smaller datasets
|
|
597
|
+
sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
|
|
598
|
+
|
|
599
|
+
# Create new FeatureMap
|
|
600
|
+
feature_map = oms.FeatureMap()
|
|
601
|
+
|
|
602
|
+
# Convert DataFrame features to OpenMS Features
|
|
603
|
+
for feature_row in sample_features.iter_rows(named=True):
|
|
604
|
+
feature = oms.Feature()
|
|
605
|
+
|
|
606
|
+
# Set properties from DataFrame (handle missing values gracefully)
|
|
607
|
+
try:
|
|
608
|
+
feature.setUniqueId(int(feature_row["feature_id"]))
|
|
609
|
+
feature.setMZ(float(feature_row["mz"]))
|
|
610
|
+
feature.setRT(float(feature_row["rt"]))
|
|
611
|
+
feature.setIntensity(float(feature_row["inty"]))
|
|
612
|
+
feature.setOverallQuality(float(feature_row["quality"]))
|
|
613
|
+
feature.setCharge(int(feature_row["charge"]))
|
|
614
|
+
|
|
615
|
+
# Add to feature map
|
|
616
|
+
feature_map.push_back(feature)
|
|
617
|
+
except (ValueError, TypeError) as e:
|
|
618
|
+
study.logger.warning(f"Skipping feature due to conversion error: {e}")
|
|
619
|
+
continue
|
|
620
|
+
|
|
621
|
+
temp_feature_maps.append(feature_map)
|
|
622
|
+
|
|
623
|
+
study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df")
|
|
624
|
+
return temp_feature_maps
|
|
625
|
+
|
|
626
|
+
|
|
213
627
|
def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
214
628
|
"""QT (Quality Threshold) based merge"""
|
|
215
629
|
|
|
216
|
-
|
|
630
|
+
# Generate temporary feature maps on-demand from features_df
|
|
631
|
+
temp_feature_maps = _generate_feature_maps_on_demand(self)
|
|
632
|
+
|
|
633
|
+
n_samples = len(temp_feature_maps)
|
|
217
634
|
if n_samples > 1000:
|
|
218
635
|
self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
|
|
219
636
|
|
|
220
637
|
consensus_map = oms.ConsensusMap()
|
|
221
638
|
file_descriptions = consensus_map.getColumnHeaders()
|
|
222
639
|
|
|
223
|
-
for i, feature_map in enumerate(
|
|
640
|
+
for i, feature_map in enumerate(temp_feature_maps):
|
|
224
641
|
file_description = file_descriptions.get(i, oms.ColumnHeader())
|
|
225
642
|
file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
|
|
226
643
|
file_description.size = feature_map.size()
|
|
@@ -243,7 +660,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
243
660
|
params_oms.setValue("nr_partitions", params.nr_partitions)
|
|
244
661
|
|
|
245
662
|
grouper.setParameters(params_oms)
|
|
246
|
-
grouper.group(
|
|
663
|
+
grouper.group(temp_feature_maps, consensus_map)
|
|
247
664
|
|
|
248
665
|
return consensus_map
|
|
249
666
|
|
|
@@ -741,10 +1158,13 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
|
|
|
741
1158
|
def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
742
1159
|
"""KD-tree based merge without RT warping"""
|
|
743
1160
|
|
|
1161
|
+
# Generate temporary feature maps on-demand from features_df
|
|
1162
|
+
temp_feature_maps = _generate_feature_maps_on_demand(self)
|
|
1163
|
+
|
|
744
1164
|
consensus_map = oms.ConsensusMap()
|
|
745
1165
|
file_descriptions = consensus_map.getColumnHeaders()
|
|
746
1166
|
|
|
747
|
-
for i, feature_map in enumerate(
|
|
1167
|
+
for i, feature_map in enumerate(temp_feature_maps):
|
|
748
1168
|
file_description = file_descriptions.get(i, oms.ColumnHeader())
|
|
749
1169
|
file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
|
|
750
1170
|
file_description.size = feature_map.size()
|
|
@@ -768,15 +1188,18 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
|
|
|
768
1188
|
#params_oms.setValue("link:charge_merging", "Any")
|
|
769
1189
|
|
|
770
1190
|
grouper.setParameters(params_oms)
|
|
771
|
-
grouper.group(
|
|
1191
|
+
grouper.group(temp_feature_maps, consensus_map)
|
|
772
1192
|
|
|
773
1193
|
return consensus_map
|
|
774
1194
|
|
|
775
1195
|
|
|
776
1196
|
def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
777
|
-
"""KD-based chunked merge with proper cross-chunk consensus building"""
|
|
1197
|
+
"""KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
778
1198
|
|
|
779
|
-
|
|
1199
|
+
# Generate temporary feature maps on-demand from features_df
|
|
1200
|
+
temp_feature_maps = _generate_feature_maps_on_demand(self)
|
|
1201
|
+
|
|
1202
|
+
n_samples = len(temp_feature_maps)
|
|
780
1203
|
if n_samples <= params.chunk_size:
|
|
781
1204
|
self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
|
|
782
1205
|
consensus_map = _merge_kd(self, params)
|
|
@@ -788,58 +1211,175 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
788
1211
|
chunks = []
|
|
789
1212
|
for i in range(0, n_samples, params.chunk_size):
|
|
790
1213
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
791
|
-
chunks.append((i,
|
|
1214
|
+
chunks.append((i, temp_feature_maps[i:chunk_end]))
|
|
792
1215
|
|
|
793
|
-
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
|
|
1216
|
+
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
|
|
794
1217
|
|
|
795
1218
|
# Process each chunk to create chunk consensus maps
|
|
796
1219
|
chunk_consensus_maps = []
|
|
797
1220
|
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
1221
|
+
if params.threads is None:
|
|
1222
|
+
# Sequential processing (original behavior)
|
|
1223
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
|
|
1224
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
1225
|
+
|
|
1226
|
+
# Set up file descriptions for chunk
|
|
1227
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
1228
|
+
for j, feature_map in enumerate(chunk_maps):
|
|
1229
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
1230
|
+
file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
|
|
1231
|
+
file_description.size = feature_map.size()
|
|
1232
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
1233
|
+
file_descriptions[j] = file_description
|
|
1234
|
+
|
|
1235
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
1236
|
+
|
|
1237
|
+
# Use KD algorithm for chunk
|
|
1238
|
+
grouper = oms.FeatureGroupingAlgorithmKD()
|
|
1239
|
+
chunk_params = grouper.getParameters()
|
|
1240
|
+
chunk_params.setValue("mz_unit", "Da")
|
|
1241
|
+
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
1242
|
+
chunk_params.setValue("warp:enabled", "true")
|
|
1243
|
+
chunk_params.setValue("warp:rt_tol", params.rt_tol)
|
|
1244
|
+
chunk_params.setValue("warp:mz_tol", params.mz_tol)
|
|
1245
|
+
chunk_params.setValue("link:rt_tol", params.rt_tol)
|
|
1246
|
+
chunk_params.setValue("link:mz_tol", params.mz_tol)
|
|
1247
|
+
chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
|
|
1248
|
+
chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
|
|
1249
|
+
chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
|
|
1250
|
+
|
|
1251
|
+
grouper.setParameters(chunk_params)
|
|
1252
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
1253
|
+
|
|
1254
|
+
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
1255
|
+
|
|
1256
|
+
else:
|
|
1257
|
+
# Parallel processing
|
|
1258
|
+
self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
1259
|
+
|
|
1260
|
+
# Prepare chunk data for parallel processing using features_df slices
|
|
1261
|
+
chunk_data_list = []
|
|
1262
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
|
|
1263
|
+
# Get the sample UIDs for this chunk
|
|
1264
|
+
chunk_sample_uids = []
|
|
1265
|
+
chunk_samples_df_rows = []
|
|
1266
|
+
for j in range(len(chunk_maps)):
|
|
1267
|
+
sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
|
|
1268
|
+
chunk_sample_uids.append(sample_row['sample_uid'])
|
|
1269
|
+
chunk_samples_df_rows.append(sample_row)
|
|
1270
|
+
|
|
1271
|
+
# Create a DataFrame for this chunk's samples
|
|
1272
|
+
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
1273
|
+
|
|
1274
|
+
# Filter features_df for this chunk's samples and select only necessary columns
|
|
1275
|
+
chunk_features_df = self.features_df.filter(
|
|
1276
|
+
pl.col('sample_uid').is_in(chunk_sample_uids)
|
|
1277
|
+
).select([
|
|
1278
|
+
'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
|
|
1279
|
+
])
|
|
1280
|
+
|
|
1281
|
+
# Convert DataFrames to serializable format (lists of dicts)
|
|
1282
|
+
chunk_features_data = chunk_features_df.to_dicts()
|
|
1283
|
+
chunk_samples_data = chunk_samples_df.to_dicts()
|
|
1284
|
+
|
|
1285
|
+
chunk_data = {
|
|
1286
|
+
'chunk_start_idx': chunk_start_idx,
|
|
1287
|
+
'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
|
|
1288
|
+
'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
|
|
1289
|
+
'params': {
|
|
1290
|
+
'nr_partitions': params.nr_partitions,
|
|
1291
|
+
'rt_tol': params.rt_tol,
|
|
1292
|
+
'mz_tol': params.mz_tol,
|
|
1293
|
+
'min_rel_cc_size': params.min_rel_cc_size,
|
|
1294
|
+
'max_pairwise_log_fc': params.max_pairwise_log_fc,
|
|
1295
|
+
'max_nr_conflicts': params.max_nr_conflicts
|
|
1296
|
+
}
|
|
1297
|
+
}
|
|
1298
|
+
chunk_data_list.append(chunk_data)
|
|
1299
|
+
|
|
1300
|
+
# Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
|
|
1301
|
+
try:
|
|
1302
|
+
with ProcessPoolExecutor(max_workers=params.threads) as executor:
|
|
1303
|
+
# Submit all chunk processing tasks
|
|
1304
|
+
future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
|
|
1305
|
+
for i, chunk_data in enumerate(chunk_data_list)}
|
|
1306
|
+
|
|
1307
|
+
# Collect results with progress tracking
|
|
1308
|
+
completed_chunks = 0
|
|
1309
|
+
total_chunks = len(chunk_data_list)
|
|
1310
|
+
serialized_chunk_results = []
|
|
1311
|
+
|
|
1312
|
+
for future in as_completed(future_to_chunk):
|
|
1313
|
+
chunk_idx = future_to_chunk[future]
|
|
1314
|
+
try:
|
|
1315
|
+
chunk_start_idx, consensus_features = future.result()
|
|
1316
|
+
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1317
|
+
completed_chunks += 1
|
|
1318
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1319
|
+
self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1320
|
+
except Exception as exc:
|
|
1321
|
+
# Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
|
|
1322
|
+
if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
|
|
1323
|
+
# Convert to RuntimeError so outer except block can catch it for fallback
|
|
1324
|
+
raise RuntimeError(f"Windows multiprocessing failure: {exc}")
|
|
1325
|
+
else:
|
|
1326
|
+
self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1327
|
+
raise exc
|
|
1328
|
+
|
|
1329
|
+
except (RuntimeError, OSError, BrokenProcessPool) as e:
|
|
1330
|
+
# Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
|
|
1331
|
+
if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
|
|
1332
|
+
"process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
|
|
1333
|
+
self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
|
|
1334
|
+
self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
|
|
1335
|
+
|
|
1336
|
+
with ThreadPoolExecutor(max_workers=params.threads) as executor:
|
|
1337
|
+
# Submit all chunk processing tasks
|
|
1338
|
+
future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
|
|
1339
|
+
for i, chunk_data in enumerate(chunk_data_list)}
|
|
1340
|
+
|
|
1341
|
+
# Collect results with progress tracking
|
|
1342
|
+
completed_chunks = 0
|
|
1343
|
+
total_chunks = len(chunk_data_list)
|
|
1344
|
+
serialized_chunk_results = []
|
|
1345
|
+
|
|
1346
|
+
for future in as_completed(future_to_chunk):
|
|
1347
|
+
chunk_idx = future_to_chunk[future]
|
|
1348
|
+
try:
|
|
1349
|
+
chunk_start_idx, consensus_features = future.result()
|
|
1350
|
+
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1351
|
+
completed_chunks += 1
|
|
1352
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1353
|
+
self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1354
|
+
except Exception as exc:
|
|
1355
|
+
self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1356
|
+
raise exc
|
|
1357
|
+
else:
|
|
1358
|
+
# Re-raise other exceptions
|
|
1359
|
+
raise
|
|
1360
|
+
|
|
1361
|
+
# Store serialized results for _merge_chunk_results to handle directly
|
|
1362
|
+
chunk_consensus_maps = []
|
|
1363
|
+
for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
|
|
1364
|
+
# Store serialized data directly for _merge_chunk_results to handle
|
|
1365
|
+
chunk_consensus_maps.append((chunk_start_idx, consensus_features))
|
|
1366
|
+
|
|
1367
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
1368
|
+
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
832
1369
|
_merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
833
1370
|
|
|
834
|
-
#
|
|
1371
|
+
# Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
|
|
835
1372
|
consensus_map = oms.ConsensusMap()
|
|
836
1373
|
return consensus_map
|
|
837
1374
|
|
|
838
1375
|
|
|
839
1376
|
def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
|
|
840
|
-
"""QT-based chunked merge with proper cross-chunk consensus building"""
|
|
1377
|
+
"""QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
|
|
841
1378
|
|
|
842
|
-
|
|
1379
|
+
# Generate temporary feature maps on-demand from features_df
|
|
1380
|
+
temp_feature_maps = _generate_feature_maps_on_demand(self)
|
|
1381
|
+
|
|
1382
|
+
n_samples = len(temp_feature_maps)
|
|
843
1383
|
if n_samples <= params.chunk_size:
|
|
844
1384
|
self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
|
|
845
1385
|
consensus_map = _merge_qt(self, params)
|
|
@@ -851,45 +1391,159 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
|
|
|
851
1391
|
chunks = []
|
|
852
1392
|
for i in range(0, n_samples, params.chunk_size):
|
|
853
1393
|
chunk_end = min(i + params.chunk_size, n_samples)
|
|
854
|
-
chunks.append((i,
|
|
1394
|
+
chunks.append((i, temp_feature_maps[i:chunk_end]))
|
|
855
1395
|
|
|
856
|
-
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
|
|
1396
|
+
self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
|
|
857
1397
|
|
|
858
1398
|
# Process each chunk to create chunk consensus maps
|
|
859
1399
|
chunk_consensus_maps = []
|
|
860
1400
|
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
1401
|
+
if params.threads is None:
|
|
1402
|
+
# Sequential processing (original behavior)
|
|
1403
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
|
|
1404
|
+
chunk_consensus_map = oms.ConsensusMap()
|
|
1405
|
+
|
|
1406
|
+
# Set up file descriptions for chunk
|
|
1407
|
+
file_descriptions = chunk_consensus_map.getColumnHeaders()
|
|
1408
|
+
for j, feature_map in enumerate(chunk_maps):
|
|
1409
|
+
file_description = file_descriptions.get(j, oms.ColumnHeader())
|
|
1410
|
+
file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
|
|
1411
|
+
file_description.size = feature_map.size()
|
|
1412
|
+
file_description.unique_id = feature_map.getUniqueId()
|
|
1413
|
+
file_descriptions[j] = file_description
|
|
1414
|
+
|
|
1415
|
+
chunk_consensus_map.setColumnHeaders(file_descriptions)
|
|
1416
|
+
|
|
1417
|
+
# Use QT algorithm for chunk (main difference from KD chunked)
|
|
1418
|
+
grouper = oms.FeatureGroupingAlgorithmQT()
|
|
1419
|
+
chunk_params = grouper.getParameters()
|
|
1420
|
+
chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
|
|
1421
|
+
chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
|
|
1422
|
+
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
1423
|
+
chunk_params.setValue("ignore_charge", "true")
|
|
1424
|
+
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
1425
|
+
|
|
1426
|
+
grouper.setParameters(chunk_params)
|
|
1427
|
+
grouper.group(chunk_maps, chunk_consensus_map)
|
|
1428
|
+
|
|
1429
|
+
chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
|
|
1430
|
+
|
|
1431
|
+
else:
|
|
1432
|
+
# Parallel processing
|
|
1433
|
+
self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
|
|
872
1434
|
|
|
873
|
-
|
|
1435
|
+
# Prepare chunk data for parallel processing using features_df slices
|
|
1436
|
+
chunk_data_list = []
|
|
1437
|
+
for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
|
|
1438
|
+
# Get the sample UIDs for this chunk
|
|
1439
|
+
chunk_sample_uids = []
|
|
1440
|
+
chunk_samples_df_rows = []
|
|
1441
|
+
for j in range(len(chunk_maps)):
|
|
1442
|
+
sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
|
|
1443
|
+
chunk_sample_uids.append(sample_row['sample_uid'])
|
|
1444
|
+
chunk_samples_df_rows.append(sample_row)
|
|
1445
|
+
|
|
1446
|
+
# Create a DataFrame for this chunk's samples
|
|
1447
|
+
chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
|
|
1448
|
+
|
|
1449
|
+
# Filter features_df for this chunk's samples and select only necessary columns
|
|
1450
|
+
chunk_features_df = self.features_df.filter(
|
|
1451
|
+
pl.col('sample_uid').is_in(chunk_sample_uids)
|
|
1452
|
+
).select([
|
|
1453
|
+
'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
|
|
1454
|
+
])
|
|
1455
|
+
|
|
1456
|
+
# Convert DataFrames to serializable format (lists of dicts)
|
|
1457
|
+
chunk_features_data = chunk_features_df.to_dicts()
|
|
1458
|
+
chunk_samples_data = chunk_samples_df.to_dicts()
|
|
1459
|
+
|
|
1460
|
+
chunk_data = {
|
|
1461
|
+
'chunk_start_idx': chunk_start_idx,
|
|
1462
|
+
'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
|
|
1463
|
+
'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
|
|
1464
|
+
'params': {
|
|
1465
|
+
'nr_partitions': params.nr_partitions,
|
|
1466
|
+
'rt_tol': params.rt_tol,
|
|
1467
|
+
'mz_tol': params.mz_tol,
|
|
1468
|
+
}
|
|
1469
|
+
}
|
|
1470
|
+
chunk_data_list.append(chunk_data)
|
|
874
1471
|
|
|
875
|
-
#
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
|
|
879
|
-
chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
|
|
880
|
-
chunk_params.setValue("distance_MZ:unit", "Da")
|
|
881
|
-
chunk_params.setValue("ignore_charge", "true")
|
|
882
|
-
chunk_params.setValue("nr_partitions", params.nr_partitions)
|
|
1472
|
+
# Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
|
|
1473
|
+
executor_class = ProcessPoolExecutor
|
|
1474
|
+
executor_name = "processes"
|
|
883
1475
|
|
|
884
|
-
|
|
885
|
-
|
|
1476
|
+
try:
|
|
1477
|
+
with ProcessPoolExecutor(max_workers=params.threads) as executor:
|
|
1478
|
+
# Submit all chunk processing tasks
|
|
1479
|
+
future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
|
|
1480
|
+
for i, chunk_data in enumerate(chunk_data_list)}
|
|
1481
|
+
|
|
1482
|
+
# Collect results with progress tracking
|
|
1483
|
+
completed_chunks = 0
|
|
1484
|
+
total_chunks = len(chunk_data_list)
|
|
1485
|
+
serialized_chunk_results = []
|
|
1486
|
+
|
|
1487
|
+
for future in as_completed(future_to_chunk):
|
|
1488
|
+
chunk_idx = future_to_chunk[future]
|
|
1489
|
+
try:
|
|
1490
|
+
chunk_start_idx, consensus_features = future.result()
|
|
1491
|
+
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1492
|
+
completed_chunks += 1
|
|
1493
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1494
|
+
self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1495
|
+
except Exception as exc:
|
|
1496
|
+
# Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
|
|
1497
|
+
if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
|
|
1498
|
+
# Convert to RuntimeError so outer except block can catch it for fallback
|
|
1499
|
+
raise RuntimeError(f"Windows multiprocessing failure: {exc}")
|
|
1500
|
+
else:
|
|
1501
|
+
self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1502
|
+
raise exc
|
|
1503
|
+
|
|
1504
|
+
except (RuntimeError, OSError, BrokenProcessPool) as e:
|
|
1505
|
+
# Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
|
|
1506
|
+
if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
|
|
1507
|
+
"process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
|
|
1508
|
+
self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
|
|
1509
|
+
self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
|
|
1510
|
+
|
|
1511
|
+
with ThreadPoolExecutor(max_workers=params.threads) as executor:
|
|
1512
|
+
# Submit all chunk processing tasks
|
|
1513
|
+
future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
|
|
1514
|
+
for i, chunk_data in enumerate(chunk_data_list)}
|
|
1515
|
+
|
|
1516
|
+
# Collect results with progress tracking
|
|
1517
|
+
completed_chunks = 0
|
|
1518
|
+
total_chunks = len(chunk_data_list)
|
|
1519
|
+
serialized_chunk_results = []
|
|
1520
|
+
|
|
1521
|
+
for future in as_completed(future_to_chunk):
|
|
1522
|
+
chunk_idx = future_to_chunk[future]
|
|
1523
|
+
try:
|
|
1524
|
+
chunk_start_idx, consensus_features = future.result()
|
|
1525
|
+
serialized_chunk_results.append((chunk_start_idx, consensus_features))
|
|
1526
|
+
completed_chunks += 1
|
|
1527
|
+
n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
|
|
1528
|
+
self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
|
|
1529
|
+
except Exception as exc:
|
|
1530
|
+
self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
|
|
1531
|
+
raise exc
|
|
1532
|
+
else:
|
|
1533
|
+
# Re-raise other exceptions
|
|
1534
|
+
raise
|
|
886
1535
|
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
1536
|
+
# Store serialized results for _merge_chunk_results to handle directly
|
|
1537
|
+
chunk_consensus_maps = []
|
|
1538
|
+
for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
|
|
1539
|
+
# Store serialized data directly for _merge_chunk_results to handle
|
|
1540
|
+
chunk_consensus_maps.append((chunk_start_idx, consensus_features))
|
|
1541
|
+
|
|
1542
|
+
# Merge chunk results with proper cross-chunk consensus building
|
|
1543
|
+
# _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
890
1544
|
_merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
|
|
891
1545
|
|
|
892
|
-
#
|
|
1546
|
+
# Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
|
|
893
1547
|
consensus_map = oms.ConsensusMap()
|
|
894
1548
|
return consensus_map
|
|
895
1549
|
|
|
@@ -927,61 +1581,128 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
|
|
|
927
1581
|
all_chunk_consensus = []
|
|
928
1582
|
consensus_id_counter = 0
|
|
929
1583
|
|
|
930
|
-
for chunk_idx, (chunk_start_idx,
|
|
931
|
-
|
|
1584
|
+
for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
|
|
1585
|
+
# Handle both ConsensusMap objects (sequential) and serialized data (parallel)
|
|
1586
|
+
if isinstance(chunk_data, list):
|
|
1587
|
+
# Parallel processing: chunk_data is a list of serialized consensus feature dictionaries
|
|
1588
|
+
consensus_features_data = chunk_data
|
|
1589
|
+
else:
|
|
1590
|
+
# Sequential processing: chunk_data is a ConsensusMap object
|
|
1591
|
+
chunk_consensus_map = chunk_data
|
|
1592
|
+
consensus_features_data = []
|
|
1593
|
+
|
|
1594
|
+
# Extract data from ConsensusMap and convert to serialized format
|
|
1595
|
+
for consensus_feature in chunk_consensus_map:
|
|
1596
|
+
# Extract feature_uids from this consensus feature
|
|
1597
|
+
feature_uids = []
|
|
1598
|
+
feature_data_list = []
|
|
1599
|
+
sample_uids = []
|
|
1600
|
+
|
|
1601
|
+
for feature_handle in consensus_feature.getFeatureList():
|
|
1602
|
+
fuid = str(feature_handle.getUniqueId())
|
|
1603
|
+
if fuid not in feature_uid_map:
|
|
1604
|
+
continue
|
|
1605
|
+
|
|
1606
|
+
feature_uid = feature_uid_map[fuid]
|
|
1607
|
+
feature_data = features_lookup.get(feature_uid)
|
|
1608
|
+
if feature_data:
|
|
1609
|
+
feature_uids.append(feature_uid)
|
|
1610
|
+
feature_data_list.append(feature_data)
|
|
1611
|
+
sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
|
|
1612
|
+
|
|
1613
|
+
if not feature_data_list:
|
|
1614
|
+
# No retrievable feature metadata (possible stale map reference) -> skip
|
|
1615
|
+
continue
|
|
1616
|
+
|
|
1617
|
+
# Convert ConsensusFeature to serialized format
|
|
1618
|
+
consensus_feature_data = {
|
|
1619
|
+
'rt': consensus_feature.getRT(),
|
|
1620
|
+
'mz': consensus_feature.getMZ(),
|
|
1621
|
+
'intensity': consensus_feature.getIntensity(),
|
|
1622
|
+
'quality': consensus_feature.getQuality(),
|
|
1623
|
+
'feature_uids': feature_uids,
|
|
1624
|
+
'feature_data_list': feature_data_list,
|
|
1625
|
+
'sample_uids': sample_uids
|
|
1626
|
+
}
|
|
1627
|
+
consensus_features_data.append(consensus_feature_data)
|
|
1628
|
+
|
|
1629
|
+
# Process the consensus features (now all in serialized format)
|
|
1630
|
+
for consensus_feature_data in consensus_features_data:
|
|
932
1631
|
# ACCEPT ALL consensus features (size >=1) here.
|
|
933
1632
|
# Reason: A feature that is globally present in many samples can still
|
|
934
1633
|
# appear only once inside a given sample chunk. Early filtering at
|
|
935
1634
|
# size>=2 causes irreversible loss and underestimates the final
|
|
936
1635
|
# consensus count (observed ~296 vs 950 for KD). We defer filtering
|
|
937
1636
|
# strictly to the final global min_samples.
|
|
938
|
-
|
|
939
|
-
# Extract feature_uids from this consensus feature
|
|
940
|
-
feature_uids = []
|
|
941
|
-
feature_data_list = []
|
|
942
|
-
sample_uids = []
|
|
943
1637
|
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
1638
|
+
# For parallel processing, feature data is already extracted
|
|
1639
|
+
if isinstance(chunk_data, list):
|
|
1640
|
+
# Extract feature_uids and data from serialized format for parallel processing
|
|
1641
|
+
feature_uids = []
|
|
1642
|
+
feature_data_list = []
|
|
1643
|
+
sample_uids = []
|
|
1644
|
+
|
|
1645
|
+
for handle_data in consensus_feature_data['features']:
|
|
1646
|
+
fuid = str(handle_data['unique_id'])
|
|
1647
|
+
if fuid not in feature_uid_map:
|
|
1648
|
+
continue
|
|
1649
|
+
|
|
1650
|
+
feature_uid = feature_uid_map[fuid]
|
|
1651
|
+
feature_data = features_lookup.get(feature_uid)
|
|
1652
|
+
if feature_data:
|
|
1653
|
+
feature_uids.append(feature_uid)
|
|
1654
|
+
feature_data_list.append(feature_data)
|
|
1655
|
+
sample_uids.append(chunk_start_idx + handle_data['map_index'] + 1)
|
|
1656
|
+
|
|
1657
|
+
if not feature_data_list:
|
|
947
1658
|
continue
|
|
948
1659
|
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
1660
|
+
# Get RT/MZ from consensus feature data
|
|
1661
|
+
consensus_rt = consensus_feature_data['rt']
|
|
1662
|
+
consensus_mz = consensus_feature_data['mz']
|
|
1663
|
+
consensus_intensity = consensus_feature_data['intensity']
|
|
1664
|
+
consensus_quality = consensus_feature_data['quality']
|
|
1665
|
+
else:
|
|
1666
|
+
# Sequential processing: data is already extracted above
|
|
1667
|
+
feature_uids = consensus_feature_data['feature_uids']
|
|
1668
|
+
feature_data_list = consensus_feature_data['feature_data_list']
|
|
1669
|
+
sample_uids = consensus_feature_data['sample_uids']
|
|
1670
|
+
consensus_rt = consensus_feature_data['rt']
|
|
1671
|
+
consensus_mz = consensus_feature_data['mz']
|
|
1672
|
+
consensus_intensity = consensus_feature_data['intensity']
|
|
1673
|
+
consensus_quality = consensus_feature_data['quality']
|
|
955
1674
|
|
|
956
1675
|
if not feature_data_list:
|
|
957
1676
|
# No retrievable feature metadata (possible stale map reference) -> skip
|
|
958
|
-
continue
|
|
1677
|
+
continue
|
|
1678
|
+
|
|
1679
|
+
# Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
|
|
959
1680
|
rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
|
|
960
1681
|
mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
|
|
961
1682
|
if rt_vals_local:
|
|
962
1683
|
rt_min_local = min(rt_vals_local)
|
|
963
1684
|
rt_max_local = max(rt_vals_local)
|
|
964
1685
|
else:
|
|
965
|
-
rt_min_local = rt_max_local =
|
|
1686
|
+
rt_min_local = rt_max_local = consensus_rt
|
|
966
1687
|
if mz_vals_local:
|
|
967
1688
|
mz_min_local = min(mz_vals_local)
|
|
968
1689
|
mz_max_local = max(mz_vals_local)
|
|
969
1690
|
else:
|
|
970
|
-
mz_min_local = mz_max_local =
|
|
1691
|
+
mz_min_local = mz_max_local = consensus_mz
|
|
971
1692
|
|
|
972
1693
|
# Store chunk consensus with feature tracking
|
|
973
1694
|
chunk_consensus_data = {
|
|
974
1695
|
'consensus_id': consensus_id_counter,
|
|
975
1696
|
'chunk_idx': chunk_idx,
|
|
976
1697
|
'chunk_start_idx': chunk_start_idx,
|
|
977
|
-
'mz':
|
|
978
|
-
'rt':
|
|
1698
|
+
'mz': consensus_mz,
|
|
1699
|
+
'rt': consensus_rt,
|
|
979
1700
|
'mz_min': mz_min_local,
|
|
980
1701
|
'mz_max': mz_max_local,
|
|
981
1702
|
'rt_min': rt_min_local,
|
|
982
1703
|
'rt_max': rt_max_local,
|
|
983
|
-
'intensity':
|
|
984
|
-
'quality':
|
|
1704
|
+
'intensity': consensus_intensity,
|
|
1705
|
+
'quality': consensus_quality,
|
|
985
1706
|
'feature_uids': feature_uids,
|
|
986
1707
|
'feature_data_list': feature_data_list,
|
|
987
1708
|
'sample_uids': sample_uids,
|
|
@@ -1479,9 +2200,6 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
|
|
|
1479
2200
|
return list(groups_by_root.values())
|
|
1480
2201
|
|
|
1481
2202
|
|
|
1482
|
-
# Note: Restored proper chunked implementation with cross-chunk consensus clustering
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
2203
|
def _reset_consensus_data(self):
|
|
1486
2204
|
"""Reset consensus-related DataFrames at the start of merge."""
|
|
1487
2205
|
self.consensus_df = pl.DataFrame()
|
|
@@ -1960,6 +2678,595 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
     )


+def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
+    """
+    Count consensus features grouped in tight clusters.
+
+    Args:
+        mz_tol: m/z tolerance in Daltons for cluster detection
+        rt_tol: RT tolerance in seconds for cluster detection
+
+    Returns:
+        Number of tight clusters found
+    """
+    if len(self.consensus_df) < 2:
+        return 0
+
+    # Extract consensus feature data
+    consensus_data = []
+    for row in self.consensus_df.iter_rows(named=True):
+        consensus_data.append({
+            'consensus_uid': row['consensus_uid'],
+            'mz': row['mz'],
+            'rt': row['rt']
+        })
+
+    # Build spatial index using bins
+    rt_bin_size = rt_tol / 2
+    mz_bin_size = mz_tol / 2
+
+    bins = defaultdict(list)
+    for feature in consensus_data:
+        rt_bin = int(feature['rt'] / rt_bin_size)
+        mz_bin = int(feature['mz'] / mz_bin_size)
+        bins[(rt_bin, mz_bin)].append(feature)
+
+    processed_features = set()
+    tight_clusters_count = 0
+
+    for bin_key, bin_features in bins.items():
+        if len(bin_features) < 2:
+            continue
+
+        # Check neighboring bins for additional features
+        rt_bin, mz_bin = bin_key
+        all_nearby_features = list(bin_features)
+
+        # Check 8 neighboring bins
+        for drt in [-1, 0, 1]:
+            for dmz in [-1, 0, 1]:
+                if drt == 0 and dmz == 0:
+                    continue
+                neighbor_key = (rt_bin + drt, mz_bin + dmz)
+                if neighbor_key in bins:
+                    all_nearby_features.extend(bins[neighbor_key])
+
+        # Filter to features within actual tolerances and not yet processed
+        valid_cluster_features = []
+        for feature in all_nearby_features:
+            if feature['consensus_uid'] in processed_features:
+                continue
+
+            # Check if this feature is within tolerances of any bin feature
+            for bin_feature in bin_features:
+                rt_diff = abs(feature['rt'] - bin_feature['rt'])
+                mz_diff = abs(feature['mz'] - bin_feature['mz'])
+
+                if rt_diff <= rt_tol and mz_diff <= mz_tol:
+                    valid_cluster_features.append(feature)
+                    break
+
+        # Count as tight cluster if we have multiple features
+        if len(valid_cluster_features) >= 2:
+            tight_clusters_count += 1
+            for feature in valid_cluster_features:
+                processed_features.add(feature['consensus_uid'])
+
+    return tight_clusters_count
+
+
+def _consensus_cleanup(self, rt_tol, mz_tol):
+    """
+    Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
+
+    This function:
+    1. Identifies and merges consensus features that are likely over-segmented
+       (too many features in very tight m/z and RT windows)
+    2. Performs deisotoping to remove +1 and +2 isotopic features
+    """
+    if len(self.consensus_df) == 0:
+        return
+
+    initial_count = len(self.consensus_df)
+
+    # Only perform enhanced post-clustering if there are many features
+    if initial_count < 50:
+        return
+
+    self.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
+
+    # Find tight clusters using spatial binning
+    consensus_data = []
+    for row in self.consensus_df.iter_rows(named=True):
+        consensus_data.append({
+            'consensus_uid': row['consensus_uid'],
+            'mz': row['mz'],
+            'rt': row['rt'],
+            'inty_mean': row.get('inty_mean', 0),
+            'number_samples': row.get('number_samples', 0)
+        })
+
+    # Parameters for tight clustering detection - more lenient for effective merging
+    tight_rt_tol = min(0.5, rt_tol * 0.5) # More lenient RT tolerance (max 0.5s)
+    tight_mz_tol = min(0.05, max(0.03, mz_tol * 2.0)) # More lenient m/z tolerance (min 30 mDa, max 50 mDa)
+
+    # Build spatial index using smaller RT and m/z bins for better coverage
+    rt_bin_size = tight_rt_tol / 4 # Smaller bins to ensure nearby features are captured
+    mz_bin_size = tight_mz_tol / 4 # Smaller bins to ensure nearby features are captured
+
+    bins = defaultdict(list)
+    for feature in consensus_data:
+        rt_bin = int(feature['rt'] / rt_bin_size)
+        mz_bin = int(feature['mz'] / mz_bin_size)
+        bins[(rt_bin, mz_bin)].append(feature)
+
+    # Find clusters that need merging
+    merge_groups = []
+    processed_uids = set()
+
+    for bin_key, bin_features in bins.items():
+        # Check current bin and extended neighboring bins for complete cluster
+        rt_bin, mz_bin = bin_key
+        cluster_features = list(bin_features)
+
+        # Check a larger neighborhood (±2 bins) to ensure we capture all nearby features
+        for dr in [-2, -1, 0, 1, 2]:
+            for dm in [-2, -1, 0, 1, 2]:
+                if dr == 0 and dm == 0:
+                    continue
+                neighbor_key = (rt_bin + dr, mz_bin + dm)
+                if neighbor_key in bins:
+                    cluster_features.extend(bins[neighbor_key])
+
+        # Remove duplicates
+        seen_uids = set()
+        unique_features = []
+        for f in cluster_features:
+            if f['consensus_uid'] not in seen_uids:
+                unique_features.append(f)
+                seen_uids.add(f['consensus_uid'])
+
+        # Only proceed if we have at least 2 features after including neighbors
+        if len(unique_features) < 2:
+            continue
+
+        # Calculate cluster bounds
+        mzs = [f['mz'] for f in unique_features]
+        rts = [f['rt'] for f in unique_features]
+
+        mz_spread = max(mzs) - min(mzs)
+        rt_spread = max(rts) - min(rts)
+
+        # Only merge if features are tightly clustered
+        if mz_spread <= tight_mz_tol and rt_spread <= tight_rt_tol:
+            # Filter out features that were already processed
+            uids_in_cluster = {f['consensus_uid'] for f in unique_features}
+            unprocessed_features = [f for f in unique_features if f['consensus_uid'] not in processed_uids]
+
+            # Only proceed if we have at least 2 unprocessed features that still form a tight cluster
+            if len(unprocessed_features) >= 2:
+                # Recalculate bounds for unprocessed features only
+                unprocessed_mzs = [f['mz'] for f in unprocessed_features]
+                unprocessed_rts = [f['rt'] for f in unprocessed_features]
+
+                unprocessed_mz_spread = max(unprocessed_mzs) - min(unprocessed_mzs)
+                unprocessed_rt_spread = max(unprocessed_rts) - min(unprocessed_rts)
+
+                # Check if unprocessed features still meet tight clustering criteria
+                if unprocessed_mz_spread <= tight_mz_tol and unprocessed_rt_spread <= tight_rt_tol:
+                    merge_groups.append(unprocessed_features)
+                    processed_uids.update({f['consensus_uid'] for f in unprocessed_features})
+
+    if not merge_groups:
+        return
+
+    self.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
+
+    # Merge clusters by keeping the most representative feature
+    uids_to_remove = set()
+
+    for group in merge_groups:
+        if len(group) < 2:
+            continue
+
+        # Find the most representative feature (highest intensity and sample count)
+        best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
+
+        # Mark other features for removal
+        for f in group:
+            if f['consensus_uid'] != best_feature['consensus_uid']:
+                uids_to_remove.add(f['consensus_uid'])
+
+    if uids_to_remove:
+        # Remove merged features from consensus_df
+        self.consensus_df = self.consensus_df.filter(
+            ~pl.col('consensus_uid').is_in(list(uids_to_remove))
+        )
+
+        # Also update consensus_mapping_df if it exists
+        if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
+            self.consensus_mapping_df = self.consensus_mapping_df.filter(
+                ~pl.col('consensus_uid').is_in(list(uids_to_remove))
+            )
+
+    final_count = len(self.consensus_df)
+    reduction = initial_count - final_count
+    reduction_pct = (reduction / initial_count) * 100
+
+    if reduction > 0:
+        self.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
+
+    # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
+    pre_deisotoping_count = len(self.consensus_df)
+    isotope_uids_to_remove = set()
+
+    # Use strict tolerances for deisotoping (same as declustering)
+    deisotope_rt_tol = min(0.3, rt_tol * 0.3) # Strict RT tolerance for isotope detection
+    deisotope_mz_tol = min(0.01, mz_tol * 0.5) # Strict m/z tolerance for isotope detection
+
+    # Get current consensus data for isotope detection
+    current_consensus_data = []
+    for row in self.consensus_df.iter_rows(named=True):
+        current_consensus_data.append({
+            'consensus_uid': row['consensus_uid'],
+            'mz': row['mz'],
+            'rt': row['rt'],
+            'number_samples': row.get('number_samples', 0)
+        })
+
+    # Sort by m/z for efficient searching
+    current_consensus_data.sort(key=lambda x: x['mz'])
+    n_current = len(current_consensus_data)
+
+    for i in range(n_current):
+        feature_i = current_consensus_data[i]
+
+        # Skip if already marked for removal
+        if feature_i['consensus_uid'] in isotope_uids_to_remove:
+            continue
+
+        # Look for potential +1 and +2 isotopes (higher m/z)
+        for j in range(i + 1, n_current):
+            feature_j = current_consensus_data[j]
+
+            # Skip if already marked for removal
+            if feature_j['consensus_uid'] in isotope_uids_to_remove:
+                continue
+
+            mz_diff = feature_j['mz'] - feature_i['mz']
+
+            # Break if m/z difference is too large (features are sorted by m/z)
+            if mz_diff > 2.1: # Beyond +2 isotope range
+                break
+
+            rt_diff = abs(feature_j['rt'] - feature_i['rt'])
+
+            # Check for +1 isotope (C13 mass difference ≈ 1.003354 Da)
+            if (0.995 <= mz_diff <= 1.011) and rt_diff <= deisotope_rt_tol:
+                # Potential +1 isotope - should have fewer samples than main feature
+                if feature_j['number_samples'] < feature_i['number_samples']:
+                    isotope_uids_to_remove.add(feature_j['consensus_uid'])
+                    continue
+
+            # Check for +2 isotope (2 * C13 mass difference ≈ 2.006708 Da)
+            if (1.995 <= mz_diff <= 2.018) and rt_diff <= deisotope_rt_tol:
+                # Potential +2 isotope - should have fewer samples than main feature
+                if feature_j['number_samples'] < feature_i['number_samples']:
+                    isotope_uids_to_remove.add(feature_j['consensus_uid'])
+                    continue
+
+    # Remove isotopic features
+    if isotope_uids_to_remove:
+        self.consensus_df = self.consensus_df.filter(
+            ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
+        )
+
+        # Also update consensus_mapping_df if it exists
+        if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
+            self.consensus_mapping_df = self.consensus_mapping_df.filter(
+                ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
+            )
+
+    post_deisotoping_count = len(self.consensus_df)
+    isotope_reduction = pre_deisotoping_count - post_deisotoping_count
+
+    if isotope_reduction > 0:
+        self.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
+
+    # Final summary
+    final_count = len(self.consensus_df)
+    total_reduction = initial_count - final_count
+    if total_reduction > 0:
+        total_reduction_pct = (total_reduction / initial_count) * 100
+        self.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
+
+
+def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
+    """
+    Identify coeluting consensus features by characteristic mass shifts between adducts
+    and update their adduct information accordingly.
+
+    This function:
+    1. Generates a catalogue of mass shifts between adducts using _get_adducts()
+    2. Searches for pairs of consensus features with same RT (within strict RT tolerance)
+       and matching m/z shifts (±0.005 Da)
+    3. Updates adduct_* columns based on identified relationships
+
+    Args:
+        rt_tol: RT tolerance in seconds (strict tolerance for coelution detection)
+        cached_adducts_df: Pre-computed adducts DataFrame for performance
+    """
+    import polars as pl
+    import numpy as np
+    from collections import defaultdict
+
+    # Check if consensus_df exists and has features
+    if len(self.consensus_df) == 0:
+        self.logger.debug("No consensus features for adduct identification by mass shift")
+        return
+
+    self.logger.info(f"Identifying coeluting adducts by mass shifts in {len(self.consensus_df)} consensus features...")
+
+    # Get adducts DataFrame if not provided
+    if cached_adducts_df is None or cached_adducts_df.is_empty():
+        try:
+            # Use lower min_probability for better adduct coverage in mass shift identification
+            cached_adducts_df = self._get_adducts(min_probability=0.01)
+        except Exception as e:
+            self.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
+            return
+
+    if cached_adducts_df.is_empty():
+        self.logger.debug("No adducts available for mass shift identification")
+        return
+
+    # Build catalogue of mass shifts between adducts
+    mass_shift_catalog = {}
+    adduct_info = {}
+
+    # Extract adduct information
+    adducts_data = cached_adducts_df.select(["name", "charge", "mass_shift"]).to_dicts()
+
+    for adduct in adducts_data:
+        name = adduct["name"]
+        charge = adduct["charge"]
+        mass_shift = adduct["mass_shift"]
+
+        adduct_info[name] = {
+            "charge": charge,
+            "mass_shift": mass_shift
+        }
+
+    # Generate pairwise mass differences for catalog
+    for adduct1 in adducts_data:
+        for adduct2 in adducts_data:
+            if adduct1["name"] == adduct2["name"]:
+                continue
+
+            name1, charge1, ms1 = adduct1["name"], adduct1["charge"], adduct1["mass_shift"]
+            name2, charge2, ms2 = adduct2["name"], adduct2["charge"], adduct2["mass_shift"]
+
+            # Only consider shifts between adducts that have the same charge (same ionization state)
+            if charge1 != charge2:
+                continue
+
+            # Calculate expected m/z difference
+            if charge1 != 0 and charge2 != 0:
+                mz_diff = (ms1 - ms2) / abs(charge1)
+            else:
+                continue  # Skip neutral adducts for this analysis
+
+            # Store the mass shift relationship
+            shift_key = round(mz_diff, 4)  # Round to 4 decimal places for matching
+            if shift_key not in mass_shift_catalog:
+                mass_shift_catalog[shift_key] = []
+            mass_shift_catalog[shift_key].append({
+                "from_adduct": name1,
+                "to_adduct": name2,
+                "mz_shift": mz_diff,
+                "from_charge": charge1,
+                "to_charge": charge2
+            })
+
+    self.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
+
+    # Get consensus features data
+    consensus_data = []
+    for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
+        consensus_data.append({
+            "index": i,
+            "consensus_uid": row["consensus_uid"],
+            "rt": row["rt"],
+            "mz": row["mz"],
+            "adduct_top": row.get("adduct_top", "[M+?]1+"),
+            "adduct_charge_top": row.get("adduct_charge_top", 1),
+            "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
+            "adduct_mass_shift_top": row.get("adduct_mass_shift_top"),
+            "inty_mean": row.get("inty_mean", 0)
+        })
+
+    # Sort by RT for efficient searching
+    consensus_data.sort(key=lambda x: x["rt"])
+    n_features = len(consensus_data)
+
+    # Track updates to make
+    adduct_updates = {}  # consensus_uid -> new_adduct_info
+
+    # Strict RT tolerance for coelution (convert to minutes)
+    rt_tol_strict = rt_tol * 0.5  # Use half the merge tolerance for strict coelution
+    mz_tol_shift = 0.005  # ±5 mDa tolerance for mass shift matching
+
+    # Search for coeluting pairs with characteristic mass shifts
+    updated_count = 0
+
+    for i in range(n_features):
+        feature1 = consensus_data[i]
+        rt1 = feature1["rt"]
+        mz1 = feature1["mz"]
+        adduct1 = feature1["adduct_top"]
+
+        # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
+        # if adduct1 and "?" not in adduct1:
+        #     continue
+
+        # Search for coeluting features within strict RT tolerance
+        for j in range(i + 1, n_features):
+            feature2 = consensus_data[j]
+            rt2 = feature2["rt"]
+
+            # Break if RT difference exceeds tolerance (sorted by RT)
+            if abs(rt2 - rt1) > rt_tol_strict:
+                break
+
+            mz2 = feature2["mz"]
+            adduct2 = feature2["adduct_top"]
+
+            # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
+            # if adduct2 and "?" not in adduct2:
+            #     continue
+
+            # Calculate observed m/z difference
+            mz_diff = mz2 - mz1
+            shift_key = round(mz_diff, 4)
+
+            # Check if this mass shift matches any known adduct relationships
+            for catalog_shift, relationships in mass_shift_catalog.items():
+                if abs(shift_key - catalog_shift) <= mz_tol_shift:
+                    # Found a matching mass shift!
+
+                    # Choose the best relationship based on common adducts
+                    best_rel = None
+                    best_score = 0
+
+                    for rel in relationships:
+                        # Prioritize common adducts ([M+H]+, [M+Na]+, [M+NH4]+)
+                        score = 0
+                        if "H]" in rel["from_adduct"]: score += 3
+                        if "Na]" in rel["from_adduct"]: score += 2
+                        if "NH4]" in rel["from_adduct"]: score += 2
+                        if "H]" in rel["to_adduct"]: score += 3
+                        if "Na]" in rel["to_adduct"]: score += 2
+                        if "NH4]" in rel["to_adduct"]: score += 2
+
+                        if score > best_score:
+                            best_score = score
+                            best_rel = rel
+
+                    if best_rel:
+                        # Determine which feature gets which adduct based on intensity
+                        inty1 = feature1["inty_mean"]
+                        inty2 = feature2["inty_mean"]
+
+                        # Assign higher intensity to [M+H]+ if possible
+                        if "H]" in best_rel["from_adduct"] and inty1 >= inty2:
+                            # Feature 1 = from_adduct, Feature 2 = to_adduct
+                            from_feature = feature1
+                            to_feature = feature2
+                            from_adduct_name = best_rel["from_adduct"]
+                            to_adduct_name = best_rel["to_adduct"]
+                        elif "H]" in best_rel["to_adduct"] and inty2 >= inty1:
+                            # Feature 2 = to_adduct (reverse), Feature 1 = from_adduct
+                            from_feature = feature2
+                            to_feature = feature1
+                            from_adduct_name = best_rel["to_adduct"]
+                            to_adduct_name = best_rel["from_adduct"]
+                        else:
+                            # Assignment based on mass shift direction
+                            # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
+                            # If catalog_shift > 0: from_adduct has higher m/z than to_adduct
+                            # If catalog_shift < 0: from_adduct has lower m/z than to_adduct
+                            # observed mz_diff = mz2 - mz1
+                            # If mz_diff matches catalog_shift: feature2 should get to_adduct, feature1 should get from_adduct
+                            # If mz_diff matches -catalog_shift: assignments are swapped
+
+                            if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
+                                # mz_diff matches catalog_shift direction
+                                from_feature = feature1
+                                to_feature = feature2
+                                from_adduct_name = best_rel["from_adduct"]
+                                to_adduct_name = best_rel["to_adduct"]
+                            else:
+                                # mz_diff matches reverse direction of catalog_shift
+                                from_feature = feature2
+                                to_feature = feature1
+                                from_adduct_name = best_rel["to_adduct"]
+                                to_adduct_name = best_rel["from_adduct"]
+
+                        # Get adduct details from catalog
+                        from_adduct_info = adduct_info.get(from_adduct_name, {})
+                        to_adduct_info = adduct_info.get(to_adduct_name, {})
+
+                        # Calculate neutral masses
+                        from_charge = from_adduct_info.get("charge", 1)
+                        to_charge = to_adduct_info.get("charge", 1)
+                        from_mass_shift = from_adduct_info.get("mass_shift", 1.007825)
+                        to_mass_shift = to_adduct_info.get("mass_shift", 1.007825)
+
+                        from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
+                        to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
+
+                        # Store updates
+                        adduct_updates[from_feature["consensus_uid"]] = {
+                            "adduct_top": from_adduct_name,
+                            "adduct_charge_top": from_charge,
+                            "adduct_mass_neutral_top": from_neutral_mass,
+                            "adduct_mass_shift_top": from_mass_shift
+                        }
+
+                        adduct_updates[to_feature["consensus_uid"]] = {
+                            "adduct_top": to_adduct_name,
+                            "adduct_charge_top": to_charge,
+                            "adduct_mass_neutral_top": to_neutral_mass,
+                            "adduct_mass_shift_top": to_mass_shift
+                        }
+
+                        updated_count += 2
+                        self.logger.debug(
+                            f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
+                            f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
+                            f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
+                        )
+                    break  # Found match, no need to check other relationships
+
+    # Apply updates to consensus_df
+    if adduct_updates:
+        # Prepare update data
+        consensus_uids = self.consensus_df["consensus_uid"].to_list()
+
+        new_adduct_top = []
+        new_adduct_charge_top = []
+        new_adduct_mass_neutral_top = []
+        new_adduct_mass_shift_top = []
+
+        for uid in consensus_uids:
+            if uid in adduct_updates:
+                update = adduct_updates[uid]
+                new_adduct_top.append(update["adduct_top"])
+                new_adduct_charge_top.append(update["adduct_charge_top"])
+                new_adduct_mass_neutral_top.append(update["adduct_mass_neutral_top"])
+                new_adduct_mass_shift_top.append(update["adduct_mass_shift_top"])
+            else:
+                # Keep existing values
+                row_idx = consensus_uids.index(uid)
+                row = self.consensus_df.row(row_idx, named=True)
+                new_adduct_top.append(row.get("adduct_top"))
+                new_adduct_charge_top.append(row.get("adduct_charge_top"))
+                new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
+                new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
+
+        # Update the DataFrame
+        self.consensus_df = self.consensus_df.with_columns([
+            pl.Series("adduct_top", new_adduct_top),
+            pl.Series("adduct_charge_top", new_adduct_charge_top),
+            pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
+            pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
+        ])
+
+        self.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
+    else:
+        self.logger.debug("No consensus features updated based on mass shift analysis")
+
+
 def _finalize_merge(self, link_ms2, min_samples):
     """Complete the merge process with final calculations and cleanup."""
     import polars as pl
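Both the deisotoping step in _consensus_cleanup and _identify_adduct_by_mass_shift above come down to classifying the m/z gap between two coeluting features against a table of expected spacings: roughly 1.0034 Da per 13C for singly charged isotopes, and adduct-to-adduct differences such as [M+Na]+ vs [M+H]+ at about 21.9819 Da (Na minus H). A minimal, illustrative classifier of that idea — the spacing values are standard monoisotopic differences, but the function, its names, and the tolerance default are not part of masster:

from typing import Optional

# Expected m/z gaps (Da) between singly charged, coeluting features.
EXPECTED_GAPS = {
    "13C isotope (+1)": 1.003355,      # 13C - 12C
    "13C isotope (+2)": 2.006710,
    "[M+Na]+ vs [M+H]+": 21.981945,    # Na - H
    "[M+NH4]+ vs [M+H]+": 17.026549,   # NH4 - H
}

def classify_gap(mz_low: float, mz_high: float, tol: float = 0.005) -> Optional[str]:
    """Return the label whose expected spacing matches mz_high - mz_low within tol, else None."""
    gap = mz_high - mz_low
    label, expected = min(EXPECTED_GAPS.items(), key=lambda kv: abs(gap - kv[1]))
    return label if abs(gap - expected) <= tol else None

# Example: features at m/z 301.1410 and 323.1229 differ by 21.9819 Da -> Na/H exchange.
print(classify_gap(301.1410, 323.1229))  # "[M+Na]+ vs [M+H]+"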
@@ -1998,14 +3305,20 @@ def _finalize_merge(self, link_ms2, min_samples):
         )

     # Calculate the completeness of the consensus map
+    # Log completion with tight cluster metrics
     if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
         c = (
             len(self.consensus_mapping_df)
             / len(self.consensus_df)
             / len(self.samples_df)
         )
+
+        # Count tight clusters with specified thresholds
+        tight_clusters = _count_tight_clusters(self,mz_tol=0.04, rt_tol=0.3)
+
         self.logger.info(
-            f"Merging completed. Consensus features: {len(self.consensus_df)}.
+            f"Merging completed. Consensus features: {len(self.consensus_df)}. "
+            f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
         )
     else:
         self.logger.warning(