masster 0.4.14__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/__init__.py +2 -0
- masster/_version.py +1 -1
- masster/study/__init__.py +1 -0
- masster/study/defaults/find_consensus_def.py +1 -1
- masster/study/defaults/merge_def.py +129 -22
- masster/study/h5.py +65 -106
- masster/study/id.py +1 -1
- masster/study/load.py +11 -6
- masster/study/merge.py +2145 -0
- masster/study/plot.py +15 -1
- masster/study/processing.py +0 -902
- masster/study/save.py +1 -1
- masster/study/study.py +28 -31
- masster/wizard/README.md +373 -0
- masster/wizard/__init__.py +11 -0
- masster/wizard/example.py +223 -0
- masster/wizard/test_structure.py +49 -0
- masster/wizard/test_wizard.py +285 -0
- masster/wizard/wizard.py +1175 -0
- masster/wizard.py +1175 -0
- {masster-0.4.14.dist-info → masster-0.4.17.dist-info}/METADATA +3 -2
- {masster-0.4.14.dist-info → masster-0.4.17.dist-info}/RECORD +25 -17
- {masster-0.4.14.dist-info → masster-0.4.17.dist-info}/WHEEL +0 -0
- {masster-0.4.14.dist-info → masster-0.4.17.dist-info}/entry_points.txt +0 -0
- {masster-0.4.14.dist-info → masster-0.4.17.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
ADDED
@@ -0,0 +1,2145 @@
"""
Unified merge module for the Study class.
Supports multiple merge methods: 'sensitivity', 'qt', 'nowarp', 'chunked', 'quality'
(legacy names 'kd', 'kd-nowarp', 'kd-strict' are mapped automatically).
"""

import time
import numpy as np
from collections import defaultdict
from datetime import datetime
from tqdm import tqdm
import pyopenms as oms
import polars as pl
from masster.study.defaults import merge_defaults


def merge(self, **kwargs) -> None:
    """
    Group features across samples into consensus features using various algorithms.

    This function provides a unified interface to multiple feature grouping algorithms,
    each optimized for different dataset sizes and analysis requirements.

    Parameters
    ----------
    **kwargs : dict
        Parameters from the merge_defaults class:
        - method : str, default 'quality'
            Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'chunked', 'quality'
        - min_samples : int, default 10
            Minimum number of samples for a consensus feature
        - rt_tol : float, default 2.0
            RT tolerance in seconds
        - mz_tol : float, default 0.01
            m/z tolerance in Da (Daltons) for all methods
        - chunk_size : int, default 500
            Chunk size for the 'chunked' method
        - nr_partitions : int, default 500
            Number of partitions in the m/z dimension for KD algorithms
        - min_rel_cc_size : float, default 0.3
            Minimum relative connected component size for conflict resolution
        - max_pairwise_log_fc : float, default 0.5
            Maximum pairwise log fold change for conflict resolution
        - max_nr_conflicts : int, default 0
            Maximum number of conflicts allowed in a consensus feature
        - link_ms2 : bool, default True
            Whether to link MS2 spectra to consensus features

    Algorithm Guidelines
    --------------------
    - Quality: KD with post-processing quality control to reduce oversegmentation (RECOMMENDED DEFAULT).
      Includes RT tolerance optimization, secondary clustering, and quality filtering.
    - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection.
    - QT: Thorough but slow O(n²), good for <1000 samples.
    - NoWarp: Memory-efficient KD without RT warping for large datasets.
    - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples).
      Uses optimized partitioning for better memory management while maintaining
      full cross-sample consensus feature detection.
    """
    start_time = time.time()

    # Initialize with defaults and override with kwargs
    params = merge_defaults()

    # Filter and apply only valid parameters
    valid_params = set(params.list_parameters())
    for key, value in kwargs.items():
        if key in valid_params:
            setattr(params, key, value)
        else:
            self.logger.warning(f"Unknown parameter '{key}' ignored")

    # Backward compatibility: map old method names to new names
    method_mapping = {
        'kd': 'sensitivity',
        'kd-nowarp': 'nowarp',
        'kd_nowarp': 'nowarp',
        'kd-strict': 'quality',
        'kd_strict': 'quality',
        'kdstrict': 'quality',
    }

    if params.method in method_mapping:
        old_method = params.method
        params.method = method_mapping[old_method]
        self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")

    # Validate method
    if params.method not in ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']:
        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']")

    # Persist last used params for diagnostics
    try:
        self._merge_params_last = params.to_dict()
    except Exception:
        self._merge_params_last = {}

    # Store merge parameters in history
    try:
        if hasattr(self, 'store_history'):
            self.store_history(['merge'], params.to_dict())
        else:
            self.logger.warning("History storage not available - parameters not saved to history")
    except Exception as e:
        self.logger.warning(f"Failed to store merge parameters in history: {e}")

    # Ensure feature maps are available for merging (regenerate if needed)
    if len(self.features_maps) < len(self.samples_df):
        self.features_maps = []
        self.load_features()

    self.logger.info(
        f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da, min_rel_cc_size={params.min_rel_cc_size}, max_pairwise_log_fc={params.max_pairwise_log_fc}, max_nr_conflicts={params.max_nr_conflicts}"
    )

    # Initialize
    self._reset_consensus_data()

    # Cache adducts for performance (avoid repeated _get_adducts() calls)
    cached_adducts_df = None
    cached_valid_adducts = None
    try:
        cached_adducts_df = self._get_adducts()
        if not cached_adducts_df.is_empty():
            cached_valid_adducts = set(cached_adducts_df["name"].to_list())
        else:
            cached_valid_adducts = set()
    except Exception as e:
        self.logger.warning(f"Could not retrieve study adducts: {e}")
        cached_valid_adducts = set()

    # Always allow '?' adducts
    cached_valid_adducts.add("?")

    # Route to algorithm implementation
    if params.method == 'sensitivity':
        consensus_map = _merge_kd(self, params)
        # Extract consensus features
        self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
    elif params.method == 'qt':
        consensus_map = _merge_qt(self, params)
        # Extract consensus features
        self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
    elif params.method == 'nowarp':
        consensus_map = _merge_kd_nowarp(self, params)
        # Extract consensus features
        self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
    elif params.method == 'quality':
        consensus_map = _merge_kd_strict(self, params)
        # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
    elif params.method == 'chunked':
        consensus_map = _merge_chunked(self, params, cached_adducts_df, cached_valid_adducts)
        # Note: _merge_chunked populates consensus_df directly, no need to extract

    # Perform adduct grouping
    self._perform_adduct_grouping(params.rt_tol, params.mz_tol)

    # Link MS2 if requested
    if params.link_ms2:
        self._finalize_merge(params.link_ms2, params.min_samples)

    # Log completion without the misleading feature count
    elapsed = time.time() - start_time
    self.logger.debug(f"Merge process completed in {elapsed:.1f}s")

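# Editor's note - illustrative usage sketch, not part of the released file. It assumes a
# Study instance named `study` with samples and features already loaded, and that merge()
# is exposed on the Study class as documented above:
#
#   study.merge(method="quality", min_samples=10, rt_tol=2.0, mz_tol=0.01)
#   print(study.consensus_df.shape)   # consensus features built by the merge
#
# Legacy method names such as 'kd' or 'kd-strict' are remapped as shown above.
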
def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
    """KD-tree based merge (fast, recommended)"""

    consensus_map = oms.ConsensusMap()
    file_descriptions = consensus_map.getColumnHeaders()

    for i, feature_map in enumerate(self.features_maps):
        file_description = file_descriptions.get(i, oms.ColumnHeader())
        file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
        file_description.size = feature_map.size()
        file_description.unique_id = feature_map.getUniqueId()
        file_descriptions[i] = file_description

    consensus_map.setColumnHeaders(file_descriptions)

    # Configure KD algorithm
    grouper = oms.FeatureGroupingAlgorithmKD()
    params_oms = grouper.getParameters()

    params_oms.setValue("mz_unit", "Da")
    params_oms.setValue("nr_partitions", params.nr_partitions)
    params_oms.setValue("warp:enabled", "true")
    params_oms.setValue("warp:rt_tol", params.rt_tol)
    params_oms.setValue("warp:mz_tol", params.mz_tol)
    params_oms.setValue("link:rt_tol", params.rt_tol)
    params_oms.setValue("link:mz_tol", params.mz_tol)
    params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
    params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
    params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
    # params_oms.setValue("link:charge_merging", "With_charge_zero")  # THIS LEADS TO A CRASH

    grouper.setParameters(params_oms)
    grouper.group(self.features_maps, consensus_map)

    return consensus_map


def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
    """QT (Quality Threshold) based merge"""

    n_samples = len(self.features_maps)
    if n_samples > 1000:
        self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")

    consensus_map = oms.ConsensusMap()
    file_descriptions = consensus_map.getColumnHeaders()

    for i, feature_map in enumerate(self.features_maps):
        file_description = file_descriptions.get(i, oms.ColumnHeader())
        file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
        file_description.size = feature_map.size()
        file_description.unique_id = feature_map.getUniqueId()
        file_descriptions[i] = file_description

    consensus_map.setColumnHeaders(file_descriptions)

    # Configure QT algorithm
    grouper = oms.FeatureGroupingAlgorithmQT()
    params_oms = grouper.getParameters()

    params_oms.setValue("distance_RT:max_difference", params.rt_tol)
    params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
    params_oms.setValue("distance_MZ:unit", "Da")  # QT now uses Da like all other methods
    params_oms.setValue("ignore_charge", "true")
    params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
    params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
    params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
    params_oms.setValue("nr_partitions", params.nr_partitions)

    grouper.setParameters(params_oms)
    grouper.group(self.features_maps, consensus_map)

    return consensus_map

def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
    """
    Quality merge: standard KD algorithm with post-processing quality control.

    This method combines the sensitivity of KD clustering with post-processing steps
    to reduce oversegmentation while maintaining high-quality consensus features.
    This is the recommended default method.

    Post-processing features:
    1. RT tolerance optimization (optional)
    2. Secondary clustering for close features
    3. Sample overlap validation
    4. RT spread quality filtering
    5. Chromatographic coherence validation

    Additional parameters supported in params:
    - optimize_rt_tol: bool - Enable RT tolerance optimization
    - rt_tol_range: tuple - RT tolerance range for optimization (min, max)
    - secondary_merge_rt_tol: float - Secondary merge RT tolerance (default: 0.5 s)
    - secondary_merge_mz_tol: float - Secondary merge m/z tolerance (default: 0.005)
    - min_sample_overlap: float - Minimum sample overlap for merging (0.0-1.0, default: 0.8)
    - max_rt_spread: float - Maximum RT spread allowed (default: 2x rt_tol)
    - min_coherence: float - Minimum chromatographic coherence (default: 0.0, disabled)
    """

    # Check for RT tolerance optimization
    optimize_rt_tol = getattr(params, 'optimize_rt_tol', False)

    if optimize_rt_tol:
        # Optimize RT tolerance first
        optimal_rt_tol = _optimize_rt_tolerance(self, params)
        self.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
        # Create modified params with optimal RT tolerance
        import copy
        optimized_params = copy.deepcopy(params)
        optimized_params.rt_tol = optimal_rt_tol
    else:
        optimized_params = params

    # Phase 1: Standard KD clustering
    self.logger.info("Initial KD clustering")
    consensus_map = _merge_kd(self, optimized_params)

    # Phase 2: Post-processing quality control
    self.logger.info("Post-processing quality control")
    consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)

    return consensus_map

def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
    """
    Optimize RT tolerance by testing different values and measuring oversegmentation.

    Args:
        self: Study object
        params: Merge parameters

    Returns:
        Optimal RT tolerance value
    """
    rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
    rt_tol_steps = getattr(params, 'rt_tol_steps', 5)

    self.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")

    # Generate test values
    test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
                    for i in range(rt_tol_steps)]

    best_rt_tol = params.rt_tol
    best_score = float('inf')

    # Store original features for restoration
    original_consensus_df = getattr(self, 'consensus_df', pl.DataFrame())
    original_consensus_mapping_df = getattr(self, 'consensus_mapping_df', pl.DataFrame())

    for test_rt_tol in test_rt_tols:
        try:
            # Create test parameters
            import copy
            test_params = copy.deepcopy(params)
            test_params.rt_tol = test_rt_tol

            # Run KD merge with test parameters
            test_consensus_map = _merge_kd(self, test_params)

            # Extract consensus features temporarily for analysis
            self._extract_consensus_features(test_consensus_map, test_params.min_samples)

            if len(self.consensus_df) == 0:
                continue

            # Calculate oversegmentation metrics
            oversegmentation_score = _calculate_oversegmentation_score(self, test_rt_tol)

            self.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(self.consensus_df)} features, score: {oversegmentation_score:.3f}")

            # Lower score is better (less oversegmentation)
            if oversegmentation_score < best_score:
                best_score = oversegmentation_score
                best_rt_tol = test_rt_tol

        except Exception as e:
            self.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
            continue

    # Restore original consensus data
    self.consensus_df = original_consensus_df
    self.consensus_mapping_df = original_consensus_mapping_df

    self.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
    return best_rt_tol


def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
    """
    Calculate an oversegmentation score based on feature density and RT spread metrics.
    Lower scores indicate less oversegmentation.

    Args:
        self: Study object
        rt_tol: RT tolerance used

    Returns:
        Oversegmentation score (lower = better)
    """
    if len(self.consensus_df) == 0:
        return float('inf')

    # Metric 1: Feature density (features per RT second)
    rt_range = self.consensus_df['rt'].max() - self.consensus_df['rt'].min()
    if rt_range <= 0:
        return float('inf')

    feature_density = len(self.consensus_df) / rt_range

    # Metric 2: Average RT spread relative to tolerance
    rt_spreads = (self.consensus_df['rt_max'] - self.consensus_df['rt_min'])
    avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')

    # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
    low_sample_features = len(self.consensus_df.filter(pl.col('number_samples') <= 5))
    low_sample_ratio = low_sample_features / len(self.consensus_df)

    # Metric 4: Number of features with excessive RT spread
    excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
    excessive_spread_ratio = excessive_spread_features / len(self.consensus_df)

    # Combined score (weighted combination)
    oversegmentation_score = (
        0.4 * (feature_density / 10.0) +  # Normalize to reasonable scale
        0.3 * avg_rt_spread_ratio +
        0.2 * low_sample_ratio +
        0.1 * excessive_spread_ratio
    )

    return oversegmentation_score

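# Editor's note - a quick worked example of the score above (illustrative, not part of the
# released file). Assuming 1200 consensus features over a 600 s RT range, a mean RT spread of
# 1.5 s at rt_tol = 2.0 s, 30% low-sample features and 5% with excessive spread:
#   feature_density     = 1200 / 600 = 2.0 features/s
#   avg_rt_spread_ratio = 1.5 / 2.0  = 0.75
#   score = 0.4 * (2.0 / 10.0) + 0.3 * 0.75 + 0.2 * 0.30 + 0.1 * 0.05
#         = 0.08 + 0.225 + 0.06 + 0.005 = 0.37
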
def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
    """
    Apply post-processing quality control to a KD consensus map.

    Args:
        consensus_map: Initial consensus map from KD
        params: Merge parameters with kd-strict options

    Returns:
        Processed consensus map with reduced oversegmentation
    """
    if consensus_map.size() == 0:
        self.logger.warning("Empty consensus map provided to post-processing")
        return consensus_map

    self.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")

    # Step 1: Extract initial consensus features
    original_min_samples = params.min_samples
    params.min_samples = 1  # Extract all features initially

    self._extract_consensus_features(consensus_map, params.min_samples)
    initial_feature_count = len(self.consensus_df)

    if initial_feature_count == 0:
        self.logger.warning("No consensus features extracted for post-processing")
        params.min_samples = original_min_samples
        return consensus_map

    # Step 2: Secondary clustering for close features
    secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
    secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)

    self.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
    merged_features = _perform_secondary_clustering(self, secondary_merge_rt_tol, secondary_merge_mz_tol)

    # Step 3: Sample overlap validation
    min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
    if min_sample_overlap > 0:
        self.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
        merged_features = _validate_sample_overlap(self, merged_features, min_sample_overlap)

    # Step 4: RT spread quality filtering
    if params.rt_tol is not None:
        max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
        if max_rt_spread is not None:
            self.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
            merged_features = _filter_rt_spread(self, merged_features, max_rt_spread)
        else:
            self.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
    else:
        self.logger.debug("Skipping RT spread filtering - rt_tol is None")

    # Step 5: Chromatographic coherence filtering (optional)
    min_coherence = getattr(params, 'min_coherence', 0.0)
    if min_coherence > 0:
        self.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
        merged_features = _filter_coherence(self, merged_features, min_coherence)

    # Step 6: Rebuild consensus_df with filtered features and preserve mapping
    original_mapping_df = self.consensus_mapping_df.clone()  # Save original mapping
    self.consensus_df = pl.DataFrame(merged_features, strict=False)

    # Step 7: Apply original min_samples filter
    params.min_samples = original_min_samples
    if params.min_samples > 1:
        l1 = len(self.consensus_df)
        self.consensus_df = self.consensus_df.filter(
            pl.col("number_samples") >= params.min_samples
        )
        filtered_count = l1 - len(self.consensus_df)
        if filtered_count > 0:
            self.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")

    # Step 8: Update consensus_mapping_df to match final consensus_df
    if len(self.consensus_df) > 0 and len(original_mapping_df) > 0:
        valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
        self.consensus_mapping_df = original_mapping_df.filter(
            pl.col('consensus_uid').is_in(list(valid_consensus_ids))
        )
    else:
        self.consensus_mapping_df = pl.DataFrame()

    final_feature_count = len(self.consensus_df)
    reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0

    self.logger.info(f"Post-processing complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")

    # Create a new consensus map for compatibility (the processed data is in consensus_df)
    processed_consensus_map = oms.ConsensusMap()
    return processed_consensus_map

def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
    """
    Perform secondary clustering to merge very close features.

    Args:
        rt_tol: RT tolerance for secondary clustering
        mz_tol: m/z tolerance for secondary clustering

    Returns:
        List of merged consensus feature dictionaries
    """
    if len(self.consensus_df) == 0:
        return []

    # Convert consensus_df to a list of dictionaries for clustering
    consensus_features = []
    for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
        consensus_features.append(dict(row))

    # Use Union-Find for efficient clustering
    class UnionFind:
        def __init__(self, n):
            self.parent = list(range(n))
            self.rank = [0] * n

        def find(self, x):
            if self.parent[x] != x:
                self.parent[x] = self.find(self.parent[x])
            return self.parent[x]

        def union(self, x, y):
            px, py = self.find(x), self.find(y)
            if px == py:
                return
            if self.rank[px] < self.rank[py]:
                px, py = py, px
            self.parent[py] = px
            if self.rank[px] == self.rank[py]:
                self.rank[px] += 1

    n_features = len(consensus_features)
    uf = UnionFind(n_features)

    # Find features to merge based on proximity
    merge_count = 0
    for i in range(n_features):
        for j in range(i + 1, n_features):
            feat_i = consensus_features[i]
            feat_j = consensus_features[j]

            rt_diff = abs(feat_i['rt'] - feat_j['rt'])
            mz_diff = abs(feat_i['mz'] - feat_j['mz'])

            if rt_diff <= rt_tol and mz_diff <= mz_tol:
                uf.union(i, j)
                merge_count += 1

    # Group features by their root
    groups_by_root = defaultdict(list)
    for i in range(n_features):
        root = uf.find(i)
        groups_by_root[root].append(consensus_features[i])

    # Merge features within each group
    merged_features = []
    for group in groups_by_root.values():
        if len(group) == 1:
            # Single feature - keep as is
            merged_features.append(group[0])
        else:
            # Multiple features - merge them
            merged_feature = _merge_feature_group(group)
            merged_features.append(merged_feature)

    self.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
    return merged_features

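# Editor's note (illustrative, not part of the released file): the Union-Find pass above is
# single-link clustering, so chains of pairwise-close features collapse into one group. For
# example, with rt_tol = 0.5 and mz_tol = 0.005 and three features at
#   A(rt=100.0, mz=200.000), B(rt=100.4, mz=200.004), C(rt=100.8, mz=200.008),
# the pairs A-B and B-C are each within tolerance, so union(A, B) and union(B, C) place all
# three in one group even though A and C differ by 0.8 s, i.e. more than rt_tol.
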
def _merge_feature_group(feature_group: list) -> dict:
    """
    Merge a group of similar consensus features into one.

    Args:
        feature_group: List of consensus feature dictionaries to merge

    Returns:
        Merged consensus feature dictionary
    """
    if not feature_group:
        return {}

    if len(feature_group) == 1:
        return feature_group[0]

    # Use the feature with highest sample count as base
    base_feature = max(feature_group, key=lambda f: f.get('number_samples', 0))
    merged = base_feature.copy()

    # Aggregate numeric statistics
    rt_values = [f['rt'] for f in feature_group if f.get('rt') is not None]
    mz_values = [f['mz'] for f in feature_group if f.get('mz') is not None]
    sample_counts = [f.get('number_samples', 0) for f in feature_group]
    intensities = [f.get('inty_mean', 0) for f in feature_group if f.get('inty_mean') is not None]

    # Update merged feature statistics
    if rt_values:
        merged['rt'] = float(np.mean(rt_values))
        merged['rt_min'] = min([f.get('rt_min', f['rt']) for f in feature_group])
        merged['rt_max'] = max([f.get('rt_max', f['rt']) for f in feature_group])
        merged['rt_mean'] = float(np.mean(rt_values))

    if mz_values:
        merged['mz'] = float(np.mean(mz_values))
        merged['mz_min'] = min([f.get('mz_min', f['mz']) for f in feature_group])
        merged['mz_max'] = max([f.get('mz_max', f['mz']) for f in feature_group])
        merged['mz_mean'] = float(np.mean(mz_values))

    # Use maximum sample count (features might be detected in overlapping but different samples)
    merged['number_samples'] = max(sample_counts)

    # Use weighted average intensity (by sample count)
    if intensities and sample_counts:
        total_weight = sum(sample_counts)
        if total_weight > 0:
            weighted_intensity = sum(inty * count for inty, count in zip(intensities, sample_counts)) / total_weight
            merged['inty_mean'] = float(weighted_intensity)

    # Aggregate chromatographic quality metrics if available
    coherence_values = [f.get('chrom_coherence_mean', 0) for f in feature_group if f.get('chrom_coherence_mean') is not None]
    prominence_values = [f.get('chrom_prominence_mean', 0) for f in feature_group if f.get('chrom_prominence_mean') is not None]

    if coherence_values:
        merged['chrom_coherence_mean'] = float(np.mean(coherence_values))
    if prominence_values:
        merged['chrom_prominence_mean'] = float(np.mean(prominence_values))

    # Merge MS2 counts
    ms2_counts = [f.get('number_ms2', 0) for f in feature_group]
    merged['number_ms2'] = sum(ms2_counts)

    # Keep the best quality score
    quality_scores = [f.get('quality', 1.0) for f in feature_group if f.get('quality') is not None]
    if quality_scores:
        merged['quality'] = max(quality_scores)

    return merged

def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
    """
    Validate that merged features have sufficient sample overlap.

    Args:
        features: List of consensus feature dictionaries
        min_overlap: Minimum sample overlap ratio (0.0-1.0)

    Returns:
        List of validated features
    """
    # This is a placeholder for sample overlap validation.
    # Implementation would require access to which samples each feature appears in.
    # For now, we'll use a simple heuristic based on feature statistics.

    validated_features = []
    for feature in features:
        # Simple validation based on RT spread and sample count ratio
        rt_spread = feature.get('rt_max', feature['rt']) - feature.get('rt_min', feature['rt'])
        sample_count = feature.get('number_samples', 1)

        # Features with very tight RT spread and high sample counts are more reliable
        if rt_spread <= 2.0 or sample_count >= 10:  # More permissive validation
            validated_features.append(feature)
        else:
            # Could implement more sophisticated sample overlap checking here
            validated_features.append(feature)  # Keep for now

    return validated_features


def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
    """
    Filter out features with excessive RT spread.

    Args:
        features: List of consensus feature dictionaries
        max_rt_spread: Maximum allowed RT spread in seconds

    Returns:
        List of filtered features
    """
    filtered_features = []
    filtered_count = 0

    for feature in features:
        rt_min = feature.get('rt_min', feature['rt'])
        rt_max = feature.get('rt_max', feature['rt'])
        rt_spread = rt_max - rt_min

        if rt_spread <= max_rt_spread:
            filtered_features.append(feature)
        else:
            filtered_count += 1

    if filtered_count > 0:
        self.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")

    return filtered_features


def _filter_coherence(self, features: list, min_coherence: float) -> list:
    """
    Filter out features with low chromatographic coherence.

    Args:
        features: List of consensus feature dictionaries
        min_coherence: Minimum chromatographic coherence score

    Returns:
        List of filtered features
    """
    filtered_features = []
    filtered_count = 0

    for feature in features:
        coherence = feature.get('chrom_coherence_mean', 1.0)  # Default to high coherence if missing

        if coherence >= min_coherence:
            filtered_features.append(feature)
        else:
            filtered_count += 1

    if filtered_count > 0:
        self.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")

    return filtered_features

def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
    """KD-tree based merge without RT warping"""

    consensus_map = oms.ConsensusMap()
    file_descriptions = consensus_map.getColumnHeaders()

    for i, feature_map in enumerate(self.features_maps):
        file_description = file_descriptions.get(i, oms.ColumnHeader())
        file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
        file_description.size = feature_map.size()
        file_description.unique_id = feature_map.getUniqueId()
        file_descriptions[i] = file_description

    consensus_map.setColumnHeaders(file_descriptions)

    # Configure KD algorithm with warping disabled for memory efficiency
    grouper = oms.FeatureGroupingAlgorithmKD()
    params_oms = grouper.getParameters()

    params_oms.setValue("mz_unit", "Da")
    params_oms.setValue("nr_partitions", params.nr_partitions)
    params_oms.setValue("warp:enabled", "false")  # Disabled for memory efficiency
    params_oms.setValue("link:rt_tol", params.rt_tol)
    params_oms.setValue("link:mz_tol", params.mz_tol)
    params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
    params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
    params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
    # params_oms.setValue("link:charge_merging", "Any")

    grouper.setParameters(params_oms)
    grouper.group(self.features_maps, consensus_map)

    return consensus_map

def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
    """Chunked merge with proper cross-chunk consensus building"""

    n_samples = len(self.features_maps)
    if n_samples <= params.chunk_size:
        self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
        consensus_map = _merge_kd(self, params)
        # Extract consensus features to populate consensus_df for chunked method consistency
        self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
        return consensus_map

    # Process in chunks
    chunks = []
    for i in range(0, n_samples, params.chunk_size):
        chunk_end = min(i + params.chunk_size, n_samples)
        chunks.append((i, self.features_maps[i:chunk_end]))

    self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")

    # Process each chunk to create chunk consensus maps
    chunk_consensus_maps = []

    for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
        chunk_consensus_map = oms.ConsensusMap()

        # Set up file descriptions for chunk
        file_descriptions = chunk_consensus_map.getColumnHeaders()
        for j, feature_map in enumerate(chunk_maps):
            file_description = file_descriptions.get(j, oms.ColumnHeader())
            file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
            file_description.size = feature_map.size()
            file_description.unique_id = feature_map.getUniqueId()
            file_descriptions[j] = file_description

        chunk_consensus_map.setColumnHeaders(file_descriptions)

        # Use KD algorithm for chunk
        grouper = oms.FeatureGroupingAlgorithmKD()
        chunk_params = grouper.getParameters()
        chunk_params.setValue("mz_unit", "Da")
        chunk_params.setValue("nr_partitions", params.nr_partitions)
        chunk_params.setValue("warp:enabled", "true")
        chunk_params.setValue("warp:rt_tol", params.rt_tol)
        chunk_params.setValue("warp:mz_tol", params.mz_tol)
        chunk_params.setValue("link:rt_tol", params.rt_tol)
        chunk_params.setValue("link:mz_tol", params.mz_tol)
        chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
        chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
        chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)

        grouper.setParameters(chunk_params)
        grouper.group(chunk_maps, chunk_consensus_map)

        chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))

    # Merge chunk results with proper cross-chunk consensus building
    _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)

    # Create a dummy consensus map for compatibility (since other functions expect it)
    consensus_map = oms.ConsensusMap()
    return consensus_map

def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
    """
    Scalable aggregation of chunk consensus maps into the final consensus_df.

    This function implements cross-chunk consensus building by:
    1. Extracting feature_uids from each chunk consensus map
    2. Aggregating features close in RT/m/z across chunks
    3. Building consensus_df and consensus_mapping_df directly
    """

    if len(chunk_consensus_maps) == 1:
        # Single chunk case - just extract using the true global min_samples.
        # No need for a permissive threshold because we are not discarding singletons pre-aggregation.
        self._extract_consensus_features(
            chunk_consensus_maps[0][1],
            params.min_samples,
            cached_adducts_df,
            cached_valid_adducts,
        )
        return

    # Build feature_uid to feature_data lookup for fast access
    feature_uid_map = {
        row["feature_id"]: row["feature_uid"]
        for row in self.features_df.iter_rows(named=True)
    }

    features_lookup = _optimized_feature_lookup(self, self.features_df)

    # Extract all consensus features from chunks with their feature_uids
    all_chunk_consensus = []
    consensus_id_counter = 0

    for chunk_idx, (chunk_start_idx, chunk_consensus_map) in enumerate(chunk_consensus_maps):
        for consensus_feature in chunk_consensus_map:
            # ACCEPT ALL consensus features (size >= 1) here.
            # Reason: a feature that is globally present in many samples can still
            # appear only once inside a given sample chunk. Early filtering at
            # size >= 2 causes irreversible loss and underestimates the final
            # consensus count (observed ~296 vs 950 for KD). We defer filtering
            # strictly to the final global min_samples.

            # Extract feature_uids from this consensus feature
            feature_uids = []
            feature_data_list = []
            sample_uids = []

            for feature_handle in consensus_feature.getFeatureList():
                fuid = str(feature_handle.getUniqueId())
                if fuid not in feature_uid_map:
                    continue

                feature_uid = feature_uid_map[fuid]
                feature_data = features_lookup.get(feature_uid)
                if feature_data:
                    feature_uids.append(feature_uid)
                    feature_data_list.append(feature_data)
                    sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)

            if not feature_data_list:
                # No retrievable feature metadata (possible stale map reference) -> skip
                continue

            # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
            rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
            mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
            if rt_vals_local:
                rt_min_local = min(rt_vals_local)
                rt_max_local = max(rt_vals_local)
            else:
                rt_min_local = rt_max_local = consensus_feature.getRT()
            if mz_vals_local:
                mz_min_local = min(mz_vals_local)
                mz_max_local = max(mz_vals_local)
            else:
                mz_min_local = mz_max_local = consensus_feature.getMZ()

            # Store chunk consensus with feature tracking
            chunk_consensus_data = {
                'consensus_id': consensus_id_counter,
                'chunk_idx': chunk_idx,
                'chunk_start_idx': chunk_start_idx,
                'mz': consensus_feature.getMZ(),
                'rt': consensus_feature.getRT(),
                'mz_min': mz_min_local,
                'mz_max': mz_max_local,
                'rt_min': rt_min_local,
                'rt_max': rt_max_local,
                'intensity': consensus_feature.getIntensity(),
                'quality': consensus_feature.getQuality(),
                'feature_uids': feature_uids,
                'feature_data_list': feature_data_list,
                'sample_uids': sample_uids,
                'sample_count': len(feature_data_list),
            }

            all_chunk_consensus.append(chunk_consensus_data)
            consensus_id_counter += 1

    if not all_chunk_consensus:
        # No valid consensus features found
        self.consensus_df = pl.DataFrame()
        self.consensus_mapping_df = pl.DataFrame()
        return

    # Perform cross-chunk clustering using optimized spatial indexing
    def _cluster_chunk_consensus(chunk_consensus_list: list, rt_tol: float, mz_tol: float) -> list:
        """Cluster chunk consensus features using interval overlap (no over-relaxation).

        A union is formed if either centroids are within tolerance OR their RT / m/z
        intervals (expanded by tolerance) overlap, and they originate from different chunks.
        """
        if not chunk_consensus_list:
            return []

        n_features = len(chunk_consensus_list)

        # Spatial bins using strict tolerances (improves candidate reduction without recall loss)
        rt_bin_size = rt_tol if rt_tol > 0 else 1.0
        mz_bin_size = mz_tol if mz_tol > 0 else 0.01
        features_by_bin = defaultdict(list)

        for i, cf in enumerate(chunk_consensus_list):
            rt_bin = int(cf['rt'] / rt_bin_size)
            mz_bin = int(cf['mz'] / mz_bin_size)
            features_by_bin[(rt_bin, mz_bin)].append(i)

        class UF:
            def __init__(self, n):
                self.p = list(range(n))
                self.r = [0] * n

            def find(self, x):
                if self.p[x] != x:
                    self.p[x] = self.find(self.p[x])
                return self.p[x]

            def union(self, a, b):
                pa, pb = self.find(a), self.find(b)
                if pa == pb:
                    return
                if self.r[pa] < self.r[pb]:
                    pa, pb = pb, pa
                self.p[pb] = pa
                if self.r[pa] == self.r[pb]:
                    self.r[pa] += 1

        uf = UF(n_features)
        checked = set()
        for (rtb, mzb), idxs in features_by_bin.items():
            for dr in (-1, 0, 1):
                for dm in (-1, 0, 1):
                    neigh = (rtb + dr, mzb + dm)
                    if neigh not in features_by_bin:
                        continue
                    for i in idxs:
                        for j in features_by_bin[neigh]:
                            if i >= j:
                                continue
                            pair = (i, j)
                            if pair in checked:
                                continue
                            checked.add(pair)
                            a = chunk_consensus_list[i]
                            b = chunk_consensus_list[j]
                            if a['chunk_idx'] == b['chunk_idx']:
                                continue

                            # Primary check: centroid distance (strict)
                            centroid_close = (abs(a['rt'] - b['rt']) <= rt_tol and abs(a['mz'] - b['mz']) <= mz_tol)

                            # Secondary check: interval overlap (more conservative).
                            # Only allow interval overlap if centroids are reasonably close (within 2x tolerance).
                            centroids_reasonable = (abs(a['rt'] - b['rt']) <= 2 * rt_tol and abs(a['mz'] - b['mz']) <= 2 * mz_tol)
                            if centroids_reasonable:
                                rt_overlap = (a['rt_min'] - rt_tol / 2) <= (b['rt_max'] + rt_tol / 2) and (b['rt_min'] - rt_tol / 2) <= (a['rt_max'] + rt_tol / 2)
                                mz_overlap = (a['mz_min'] - mz_tol / 2) <= (b['mz_max'] + mz_tol / 2) and (b['mz_min'] - mz_tol / 2) <= (a['mz_max'] + mz_tol / 2)
                            else:
                                rt_overlap = mz_overlap = False

                            if centroid_close or (rt_overlap and mz_overlap):
                                uf.union(i, j)

        groups_by_root = defaultdict(list)
        for i in range(n_features):
            groups_by_root[uf.find(i)].append(chunk_consensus_list[i])
        return list(groups_by_root.values())

    # (Obsolete relaxed + centroid stitching code removed.)

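    # Editor's note (illustrative, not part of the released file): the binning in
    # _cluster_chunk_consensus guarantees that any two centroids within rt_tol and mz_tol fall
    # either in the same (rt, mz) bin or in one of its 8 neighbours. For example, with
    # rt_tol = 2.0 s the centroids rt = 3.9 s and rt = 4.1 s land in bins 1 and 2 respectively,
    # which is why the (-1, 0, 1) bin offsets are scanned before the strict distance check.
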
    # --- Stage 1: initial cross-chunk clustering of chunk consensus features ---
    initial_groups = _cluster_chunk_consensus(all_chunk_consensus, params.rt_tol, params.mz_tol)

    # --- Stage 2: centroid refinement (lightweight second pass) ---
    def _refine_groups(groups: list, rt_tol: float, mz_tol: float) -> list:
        """Refine groups by clustering group centroids (single-link) under the same tolerances.

        This reconciles borderline splits left after interval-overlap clustering without
        re-introducing broad over-merging. Works on group centroids only (low cost).
        """
        if len(groups) <= 1:
            return groups
        # Build centroid list
        centroids = []  # (idx, rt, mz)
        for gi, g in enumerate(groups):
            if not g:
                continue
            rt_vals = [cf['rt'] for cf in g]
            mz_vals = [cf['mz'] for cf in g]
            if not rt_vals or not mz_vals:
                continue
            centroids.append((gi, float(np.mean(rt_vals)), float(np.mean(mz_vals))))
        if len(centroids) <= 1:
            return groups

        # Spatial binning for centroid clustering
        rt_bin = rt_tol if rt_tol > 0 else 1.0
        mz_bin = mz_tol if mz_tol > 0 else 0.01
        bins = defaultdict(list)
        for idx, rt_c, mz_c in centroids:
            bins[(int(rt_c / rt_bin), int(mz_c / mz_bin))].append((idx, rt_c, mz_c))

        # Union-Find over group indices
        parent = list(range(len(groups)))
        rank = [0] * len(groups)

        def find(x):
            if parent[x] != x:
                parent[x] = find(parent[x])
            return parent[x]

        def union(a, b):
            pa, pb = find(a), find(b)
            if pa == pb:
                return
            if rank[pa] < rank[pb]:
                pa, pb = pb, pa
            parent[pb] = pa
            if rank[pa] == rank[pb]:
                rank[pa] += 1

        checked = set()
        for (rb, mb), items in bins.items():
            for dr in (-1, 0, 1):
                for dm in (-1, 0, 1):
                    neigh_key = (rb + dr, mb + dm)
                    if neigh_key not in bins:
                        continue
                    for (gi, rt_i, mz_i) in items:
                        for (gj, rt_j, mz_j) in bins[neigh_key]:
                            if gi >= gj:
                                continue
                            pair = (gi, gj)
                            if pair in checked:
                                continue
                            checked.add(pair)
                            if abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol:
                                union(gi, gj)

        merged = defaultdict(list)
        for gi, g in enumerate(groups):
            merged[find(gi)].extend(g)
        return list(merged.values())

    refined_groups = _refine_groups(initial_groups, params.rt_tol, params.mz_tol)

    # --- Stage 3: build final consensus feature metadata and mapping ---
    consensus_metadata = []
    consensus_mapping_list = []
    consensus_uid_counter = 0

    for group in refined_groups:
        if not group:
            continue

        # Aggregate underlying feature data (deduplicated by feature_uid)
        feature_data_acc = {}
        sample_uids_acc = set()
        rt_values_chunk = []  # use chunk-level centroids for statistic helper
        mz_values_chunk = []
        intensity_values_chunk = []
        quality_values_chunk = []

        for cf in group:
            rt_values_chunk.append(cf['rt'])
            mz_values_chunk.append(cf['mz'])
            intensity_values_chunk.append(cf.get('intensity', 0.0) or 0.0)
            quality_values_chunk.append(cf.get('quality', 1.0) or 1.0)

            for fd, samp_uid in zip(cf['feature_data_list'], cf['sample_uids']):
                fid = fd.get('feature_uid') or fd.get('uid') or fd.get('feature_id')
                # feature_uid expected in fd under 'feature_uid'; fallback attempts just in case
                if fid is None:
                    continue
                if fid not in feature_data_acc:
                    feature_data_acc[fid] = fd
                sample_uids_acc.add(samp_uid)

        if not feature_data_acc:
            continue

        number_samples = len(sample_uids_acc)

        # NOTE: Don't filter by min_samples here - let _finalize_merge handle it.
        # This allows proper cross-chunk consensus building before final filtering.

        metadata = _calculate_consensus_statistics(
            self,
            consensus_uid_counter,
            list(feature_data_acc.values()),
            rt_values_chunk,
            mz_values_chunk,
            intensity_values_chunk,
            quality_values_chunk,
            number_features=len(feature_data_acc),
            number_samples=number_samples,
            cached_adducts_df=cached_adducts_df,
            cached_valid_adducts=cached_valid_adducts,
        )

        # Validate RT spread doesn't exceed tolerance (with some flexibility for chunked merge)
        rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
        max_allowed_spread = params.rt_tol * 2  # Allow 2x tolerance for chunked method

        if rt_spread > max_allowed_spread:
            # Skip consensus features with excessive RT spread
            self.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
            consensus_uid_counter += 1
            continue

        consensus_metadata.append(metadata)

        # Build mapping rows (deduplicated)
        for fid, fd in feature_data_acc.items():
            samp_uid = fd.get('sample_uid') or fd.get('sample_id') or fd.get('sample')
            # If absent we attempt to derive it from the original group's sample_uids pairing,
            # but most feature_data rows should include sample_uid already.
            if samp_uid is None:
                # Fallback: search for the cf containing this fid
                for cf in group:
                    for fd2, samp2 in zip(cf['feature_data_list'], cf['sample_uids']):
                        f2id = fd2.get('feature_uid') or fd2.get('uid') or fd2.get('feature_id')
                        if f2id == fid:
                            samp_uid = samp2
                            break
                    if samp_uid is not None:
                        break
            if samp_uid is None:
                continue
            consensus_mapping_list.append({
                'consensus_uid': consensus_uid_counter,
                'sample_uid': samp_uid,
                'feature_uid': fid,
            })

        consensus_uid_counter += 1

    # Assign DataFrames
    self.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
    self.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)

    # Ensure mapping only contains features from retained consensus_df
    if len(self.consensus_df) > 0:
        valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
        self.consensus_mapping_df = self.consensus_mapping_df.filter(
            pl.col('consensus_uid').is_in(list(valid_consensus_ids))
        )
    else:
        self.consensus_mapping_df = pl.DataFrame()

    # Attach empty consensus_map placeholder for downstream compatibility
    self.consensus_map = oms.ConsensusMap()
    return

1197
|
+
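The mapping table built above ties each consensus feature (`consensus_uid`) back to the per-sample features (`sample_uid`, `feature_uid`) it was assembled from. A minimal, self-contained sketch of how such a mapping can be joined to a feature table with Polars; the values are toy data and only the column names come from the code above:

import polars as pl

consensus_mapping_df = pl.DataFrame({
    "consensus_uid": [0, 0, 1],
    "sample_uid": [1, 2, 1],
    "feature_uid": [10, 11, 12],
})
features_df = pl.DataFrame({
    "feature_uid": [10, 11, 12],
    "mz": [301.1410, 301.1408, 455.2903],
    "inty": [1.2e6, 9.8e5, 3.4e5],
})

# Per-sample m/z and intensity for each consensus feature
per_sample = consensus_mapping_df.join(features_df, on="feature_uid", how="inner")
print(per_sample)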
def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
                                    rt_values: list, mz_values: list,
                                    intensity_values: list, quality_values: list,
                                    number_features: int = None, number_samples: int = None,
                                    cached_adducts_df=None, cached_valid_adducts=None) -> dict:
    """
    Calculate comprehensive statistics for a consensus feature from aggregated feature data.

    Args:
        consensus_uid: Unique ID for this consensus feature
        feature_data_list: List of individual feature dictionaries
        rt_values: RT values from chunk consensus features
        mz_values: m/z values from chunk consensus features
        intensity_values: Intensity values from chunk consensus features
        quality_values: Quality values from chunk consensus features

    Returns:
        Dictionary with consensus feature metadata
    """
    if not feature_data_list:
        return {}

    # Convert feature data to numpy arrays for vectorized computation
    rt_feat_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
    mz_feat_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
    rt_start_values = np.array([fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None])
    rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
    rt_delta_values = np.array([fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None])
    mz_start_values = np.array([fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None])
    mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
    inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
    coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
    prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
    prominence_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
    height_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
    iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
    charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])

    # Process adducts with cached validation
    all_adducts = []
    valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
    valid_adducts.add("?")  # Always allow '?' adducts

    for fd in feature_data_list:
        adduct = fd.get("adduct")
        if adduct is not None:
            # Only include adducts that are valid (from cached study adducts or contain '?')
            if adduct in valid_adducts or "?" in adduct:
                all_adducts.append(adduct)

    # Calculate adduct consensus
    adduct_values = []
    adduct_top = None
    adduct_charge_top = None
    adduct_mass_neutral_top = None
    adduct_mass_shift_top = None

    if all_adducts:
        adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
        total_count = sum(adduct_counts.values())
        for adduct, count in adduct_counts.items():
            percentage = (count / total_count) * 100 if total_count > 0 else 0
            adduct_values.append([str(adduct), int(count), float(round(percentage, 2))])

        adduct_values.sort(key=lambda x: x[1], reverse=True)

    if adduct_values:
        adduct_top = adduct_values[0][0]
        # Try to get charge and mass shift from cached study adducts
        adduct_found = False
        if cached_adducts_df is not None and not cached_adducts_df.is_empty():
            matching_adduct = cached_adducts_df.filter(
                pl.col("name") == adduct_top,
            )
            if not matching_adduct.is_empty():
                adduct_row = matching_adduct.row(0, named=True)
                adduct_charge_top = adduct_row["charge"]
                adduct_mass_shift_top = adduct_row["mass_shift"]
                adduct_found = True

        if not adduct_found:
            # Set default charge and mass shift for top adduct
            adduct_charge_top = 1
            adduct_mass_shift_top = 1.007825
    else:
        # Default adduct based on study polarity
        study_polarity = getattr(study_obj, "polarity", "positive")
        if study_polarity in ["negative", "neg"]:
            adduct_top = "[M-?]1-"
            adduct_charge_top = -1
            adduct_mass_shift_top = -1.007825
        else:
            adduct_top = "[M+?]1+"
            adduct_charge_top = 1
            adduct_mass_shift_top = 1.007825

        adduct_values = [[adduct_top, 1, 100.0]]

    # Calculate neutral mass
    consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
    if adduct_charge_top and adduct_mass_shift_top is not None:
        adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top

    # Calculate MS2 count
    ms2_count = 0
    for fd in feature_data_list:
        ms2_scans = fd.get("ms2_scans")
        if ms2_scans is not None:
            ms2_count += len(ms2_scans)

    # Build consensus metadata
    return {
        "consensus_uid": int(consensus_uid),
        "consensus_id": str(consensus_uid),  # Use simple string ID
        "quality": round(float(np.mean(quality_values)), 3) if len(quality_values) > 0 else 1.0,
        "number_samples": number_samples if number_samples is not None else len(feature_data_list),
        "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
        "mz": consensus_mz,
        "rt_min": round(float(np.min(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
        "rt_max": round(float(np.max(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
        "rt_mean": round(float(np.mean(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
        "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
        "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
        "rt_delta_mean": round(float(np.mean(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
        "mz_min": round(float(np.min(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
        "mz_max": round(float(np.max(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
        "mz_mean": round(float(np.mean(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
        "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
        "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
        "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
        "bl": -1.0,
        "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
        "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
        "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
        "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
        "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
        "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
        "number_ms2": int(ms2_count),
        "adducts": adduct_values,
        "adduct_top": adduct_top,
        "adduct_charge_top": adduct_charge_top,
        "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
        "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
        "id_top_name": None,
        "id_top_class": None,
        "id_top_adduct": None,
        "id_top_score": None,
    }

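The neutral-mass calculation above reduces to mz * |charge| - mass_shift. A small worked example, assuming a singly charged protonated species with the same 1.007825 Da proton mass shift the function uses as its default:

consensus_mz = 301.1410           # mean m/z of the consensus feature (illustrative value)
adduct_charge_top = 1             # e.g. a [M+H]1+ adduct
adduct_mass_shift_top = 1.007825  # H mass, as used for the default adduct above

adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
print(round(adduct_mass_neutral_top, 6))  # 300.133175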
def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) -> list:
    """
    Cluster consensus features from different chunks based on RT and m/z similarity.

    Args:
        features: List of feature dictionaries with 'mz', 'rt', 'id' keys
        rt_tol: RT tolerance in seconds
        mz_tol: m/z tolerance in Da

    Returns:
        List of groups, where each group is a list of feature dictionaries
    """
    if not features:
        return []

    # Use Union-Find for efficient clustering
    class UnionFind:
        def __init__(self, n):
            self.parent = list(range(n))
            self.rank = [0] * n

        def find(self, x):
            if self.parent[x] != x:
                self.parent[x] = self.find(self.parent[x])
            return self.parent[x]

        def union(self, x, y):
            px, py = self.find(x), self.find(y)
            if px == py:
                return
            if self.rank[px] < self.rank[py]:
                px, py = py, px
            self.parent[py] = px
            if self.rank[px] == self.rank[py]:
                self.rank[px] += 1

    n_features = len(features)
    uf = UnionFind(n_features)

    # Compare all cross-chunk pairs and cluster features within tolerance
    for i in range(n_features):
        for j in range(i + 1, n_features):
            feat_i = features[i]
            feat_j = features[j]

            # Skip if features are from the same chunk (they're already processed)
            if feat_i['chunk_idx'] == feat_j['chunk_idx']:
                continue

            mz_diff = abs(feat_i['mz'] - feat_j['mz'])
            rt_diff = abs(feat_i['rt'] - feat_j['rt'])

            # Cluster if within tolerance
            if mz_diff <= mz_tol and rt_diff <= rt_tol:
                uf.union(i, j)

    # Extract groups
    groups_by_root = {}
    for i in range(n_features):
        root = uf.find(i)
        if root not in groups_by_root:
            groups_by_root[root] = []
        groups_by_root[root].append(features[i])

    return list(groups_by_root.values())


# Note: Restored proper chunked implementation with cross-chunk consensus clustering

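A minimal usage sketch for the cross-chunk clustering helper. The import path comes from the file added in this diff; `_cluster_consensus_features` is a private helper, so this is illustration only, with toy feature dictionaries and the default tolerances (rt_tol=2.0 s, mz_tol=0.01 Da):

from masster.study.merge import _cluster_consensus_features  # private helper, illustration only

features = [
    {"id": "a", "chunk_idx": 0, "mz": 301.1410, "rt": 120.0},
    {"id": "b", "chunk_idx": 1, "mz": 301.1415, "rt": 120.8},  # matches "a" across chunks
    {"id": "c", "chunk_idx": 1, "mz": 455.2903, "rt": 300.0},  # no partner within tolerance
]

groups = _cluster_consensus_features(features, rt_tol=2.0, mz_tol=0.01)
print([[f["id"] for f in g] for g in groups])  # e.g. [['a', 'b'], ['c']]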
def _reset_consensus_data(self):
    """Reset consensus-related DataFrames at the start of merge."""
    self.consensus_df = pl.DataFrame()
    self.consensus_ms2 = pl.DataFrame()
    self.consensus_mapping_df = pl.DataFrame()

def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
    """Extract consensus features and build metadata."""
    # create a dict to map uid to feature_uid using self.features_df
    feature_uid_map = {
        row["feature_id"]: row["feature_uid"]
        for row in self.features_df.iter_rows(named=True)
    }
    imax = consensus_map.size()

    self.logger.debug(f"Found {imax} feature groups by clustering.")

    # Pre-build fast lookup tables for features_df data using optimized approach
    features_lookup = _optimized_feature_lookup(self, self.features_df)

    # create a list to store the consensus mapping
    consensus_mapping = []
    metadata_list = []

    tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]

    for i, feature in enumerate(
        tqdm(
            consensus_map,
            total=imax,
            disable=tqdm_disable,
            desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
        ),
    ):
        # get all features in the feature map with the same unique id as the consensus feature
        features_list = feature.getFeatureList()
        uids = []
        feature_data_list = []

        for _j, f in enumerate(features_list):
            fuid = str(f.getUniqueId())
            if fuid not in feature_uid_map:
                # this is a feature that was removed but is still in the feature maps
                continue
            fuid = feature_uid_map[fuid]
            consensus_mapping.append(
                {
                    "consensus_uid": i,
                    "sample_uid": f.getMapIndex() + 1,
                    "feature_uid": fuid,
                },
            )
            uids.append(fuid)

            # Get feature data from lookup instead of DataFrame filtering
            feature_data = features_lookup.get(fuid)
            if feature_data:
                feature_data_list.append(feature_data)

        if not feature_data_list:
            # Skip this consensus feature if no valid features found
            continue

        # Compute statistics using vectorized operations on collected data
        # Convert to numpy arrays for faster computation
        rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
        mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
        rt_start_values = np.array([fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None])
        rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
        rt_delta_values = np.array([fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None])
        mz_start_values = np.array([fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None])
        mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
        inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
        coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
        prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
        prominence_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
        height_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
        iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
        charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])

        # adduct_values
        # Collect all adducts from feature_data_list to create consensus adduct information
        # Only consider adducts that are in study._get_adducts() plus items with '?'
        all_adducts = []
        adduct_masses = {}

        # Get valid adducts from cached result (avoid repeated _get_adducts() calls)
        valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
        valid_adducts.add("?")  # Always allow '?' adducts

        for fd in feature_data_list:
            # Get individual adduct and mass from each feature data (fd)
            adduct = fd.get("adduct")
            adduct_mass = fd.get("adduct_mass")

            if adduct is not None:
                # Only include adducts that are valid (from study._get_adducts() or contain '?')
                if adduct in valid_adducts or "?" in adduct:
                    all_adducts.append(adduct)
                    if adduct_mass is not None:
                        adduct_masses[adduct] = adduct_mass

        # Calculate adduct_values for the consensus feature
        adduct_values = []
        if all_adducts:
            adduct_counts = {
                adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
            }
            total_count = sum(adduct_counts.values())
            for adduct, count in adduct_counts.items():
                percentage = (count / total_count) * 100 if total_count > 0 else 0
                # Store as list with [name, num, %] format for the adducts column
                adduct_values.append(
                    [str(adduct), int(count), float(round(percentage, 2))],
                )

        # Sort adduct_values by count in descending order
        adduct_values.sort(key=lambda x: x[1], reverse=True)  # Sort by count (index 1)
        # Store adduct_values for use in metadata
        consensus_adduct_values = adduct_values

        # Extract top adduct information for new columns
        adduct_top = None
        adduct_charge_top = None
        adduct_mass_neutral_top = None
        adduct_mass_shift_top = None

        if consensus_adduct_values:
            top_adduct_name = consensus_adduct_values[0][0]  # Get top adduct name
            adduct_top = top_adduct_name

            # Parse adduct information to extract charge and mass shift
            # Handle "?" as "H" and parse common adduct formats
            if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
                adduct_charge_top = 1
                adduct_mass_shift_top = 1.007825  # H mass
            elif top_adduct_name == "[M+?]-":
                adduct_charge_top = -1
                adduct_mass_shift_top = -1.007825  # -H mass
            else:
                # Try to get charge and mass shift from cached study adducts
                adduct_found = False
                if cached_adducts_df is not None and not cached_adducts_df.is_empty():
                    # Look for exact match in study adducts
                    matching_adduct = cached_adducts_df.filter(
                        pl.col("name") == top_adduct_name,
                    )
                    if not matching_adduct.is_empty():
                        adduct_row = matching_adduct.row(0, named=True)
                        adduct_charge_top = adduct_row["charge"]
                        adduct_mass_shift_top = adduct_row["mass_shift"]
                        adduct_found = True

                if not adduct_found:
                    # Fallback to regex parsing
                    import re

                    # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
                    pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
                    match = re.match(pattern, top_adduct_name)

                    if match:
                        sign = match.group(1)
                        element = match.group(2)
                        multiplier_str = match.group(3)
                        charge_sign = match.group(4)

                        multiplier = int(multiplier_str) if multiplier_str else 1
                        charge = multiplier if charge_sign == "+" else -multiplier
                        adduct_charge_top = charge

                        # Calculate mass shift based on element
                        element_masses = {
                            "H": 1.007825,
                            "Na": 22.989769,
                            "K": 38.963708,
                            "NH4": 18.033823,
                            "Li": 7.016930,
                            "Cl": 34.969401,
                            "Br": 78.918885,
                            "HCOO": 44.998201,
                            "CH3COO": 59.013851,
                            "H2O": 18.010565,
                        }

                        base_mass = element_masses.get(element, 1.007825)  # Default to H if unknown
                        mass_shift = (
                            base_mass * multiplier
                            if sign == "+"
                            else -base_mass * multiplier
                        )
                        adduct_mass_shift_top = mass_shift
                    else:
                        # Default fallback
                        adduct_charge_top = 1
                        adduct_mass_shift_top = 1.007825
        else:
            # No valid adducts found - assign default based on study polarity
            study_polarity = getattr(self, "polarity", "positive")
            if study_polarity in ["negative", "neg"]:
                # Negative mode default
                adduct_top = "[M-?]1-"
                adduct_charge_top = -1
                adduct_mass_shift_top = -1.007825  # -H mass (loss of proton)
            else:
                # Positive mode default (includes 'positive', 'pos', or any other value)
                adduct_top = "[M+?]1+"
                adduct_charge_top = 1
                adduct_mass_shift_top = 1.007825  # H mass (gain of proton)

            # Create a single default adduct entry in the adducts list for consistency
            consensus_adduct_values = [[adduct_top, 1, 100.0]]

        # Calculate neutral mass from consensus mz (for both cases)
        consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
        if adduct_charge_top and adduct_mass_shift_top is not None:
            adduct_mass_neutral_top = (
                consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
            )

        # Calculate number of MS2 spectra
        ms2_count = 0
        for fd in feature_data_list:
            ms2_scans = fd.get("ms2_scans")
            if ms2_scans is not None:
                ms2_count += len(ms2_scans)

        metadata_list.append(
            {
                "consensus_uid": int(i),
                "consensus_id": str(feature.getUniqueId()),
                "quality": round(float(feature.getQuality()), 3),
                "number_samples": len(feature_data_list),
                # "number_ext": int(len(features_list)),
                "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
                "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
                "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
                "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
                "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
                "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
                "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
                "rt_delta_mean": round(float(np.mean(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
                "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
                "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
                "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
                "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
                "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
                "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
                "bl": -1.0,
                "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
                "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
                "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
                "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
                "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
                "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
                "number_ms2": int(ms2_count),
                "adducts": consensus_adduct_values if consensus_adduct_values else [],  # Ensure it's always a list
                # New columns for top-ranked adduct information
                "adduct_top": adduct_top,
                "adduct_charge_top": adduct_charge_top,
                "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
                "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
                # New columns for top-scoring identification results
                "id_top_name": None,
                "id_top_class": None,
                "id_top_adduct": None,
                "id_top_score": None,
            },
        )

    consensus_mapping_df = pl.DataFrame(consensus_mapping)
    # remove all rows in consensus_mapping_df whose feature_uid is not in self.features_df['feature_uid']
    l1 = len(consensus_mapping_df)
    consensus_mapping_df = consensus_mapping_df.filter(
        pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
    )
    self.logger.debug(
        f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
    )
    self.consensus_mapping_df = consensus_mapping_df
    self.consensus_df = pl.DataFrame(metadata_list, strict=False)

    if min_samples is None:
        min_samples = 1
    if min_samples < 1:
        min_samples = int(min_samples * len(self.samples_df))

    # Validate that min_samples doesn't exceed the number of samples
    if min_samples > len(self.samples_df):
        self.logger.warning(
            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
            f"Setting min_samples to {len(self.samples_df)}.",
        )
        min_samples = len(self.samples_df)

    # filter out consensus features with less than min_samples features
    l1 = len(self.consensus_df)
    self.consensus_df = self.consensus_df.filter(
        pl.col("number_samples") >= min_samples,
    )
    self.logger.debug(
        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
    )
    # filter out consensus mapping rows belonging to removed consensus features
    self.consensus_mapping_df = self.consensus_mapping_df.filter(
        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
    )

    self.consensus_map = consensus_map

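The regex fallback in `_extract_consensus_features` turns a bracketed adduct name into a charge and a mass shift. A standalone sketch of that parsing step, using the same pattern and the Na mass from the function's element table; the example adduct string is chosen for illustration:

import re

pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
match = re.match(pattern, "[M+Na]+")

sign, element, multiplier_str, charge_sign = match.groups()
multiplier = int(multiplier_str) if multiplier_str else 1
charge = multiplier if charge_sign == "+" else -multiplier
mass_shift = 22.989769 * multiplier if sign == "+" else -22.989769 * multiplier  # Na mass from element_masses

print(charge, mass_shift)  # 1 22.989769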
def _perform_adduct_grouping(self, rt_tol, mz_tol):
    """Perform adduct grouping on consensus features."""
    import polars as pl

    # Add adduct grouping and adduct_of assignment
    if len(self.consensus_df) > 0:
        # Get relevant columns for grouping
        consensus_data = []
        for row in self.consensus_df.iter_rows(named=True):
            consensus_data.append(
                {
                    "consensus_uid": row["consensus_uid"],
                    "rt": row["rt"],
                    "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
                    "adduct_top": row.get("adduct_top"),
                    "inty_mean": row.get("inty_mean", 0),
                },
            )

        # Use optimized adduct grouping
        adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
            self, consensus_data, rt_tol, mz_tol
        )

        # Add the new columns to consensus_df
        self.consensus_df = self.consensus_df.with_columns(
            [
                pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
                pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
            ],
        )

def _finalize_merge(self, link_ms2, min_samples):
    """Complete the merge process with final calculations and cleanup."""
    import polars as pl

    # Check if consensus_df is empty or missing required columns
    if len(self.consensus_df) == 0 or "number_samples" not in self.consensus_df.columns:
        self.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
        return

    # Validate min_samples parameter
    if min_samples is None:
        min_samples = 1
    if min_samples < 1:
        min_samples = int(min_samples * len(self.samples_df))

    # Validate that min_samples doesn't exceed the number of samples
    if min_samples > len(self.samples_df):
        self.logger.warning(
            f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
            f"Setting min_samples to {len(self.samples_df)}.",
        )
        min_samples = len(self.samples_df)

    # Filter out consensus features with less than min_samples features
    l1 = len(self.consensus_df)
    self.consensus_df = self.consensus_df.filter(
        pl.col("number_samples") >= min_samples,
    )
    self.logger.debug(
        f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
    )

    # Filter out consensus mapping rows belonging to removed consensus features
    self.consensus_mapping_df = self.consensus_mapping_df.filter(
        pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
    )

    # Calculate the completeness of the consensus map
    if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
        c = (
            len(self.consensus_mapping_df)
            / len(self.consensus_df)
            / len(self.samples_df)
        )
        self.logger.info(
            f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
        )
    else:
        self.logger.warning(
            f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
            f"This may be due to min_samples ({min_samples}) being too high for the available data.",
        )

    if link_ms2:
        self.find_ms2()

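Two conventions in `_finalize_merge` are easy to miss: a `min_samples` value below 1 is interpreted as a fraction of the sample count, and completeness is the number of mapping rows divided by (consensus features x samples). A toy illustration of both, with hypothetical numbers:

n_samples = 20

min_samples = 0.5                 # fractional value
if min_samples < 1:
    min_samples = int(min_samples * n_samples)
print(min_samples)                # 10

n_consensus = 500
n_mapping_rows = 7500             # feature-to-consensus links actually found
completeness = n_mapping_rows / n_consensus / n_samples
print(f"{completeness:.2f}")      # 0.75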
def _optimized_feature_lookup(study_obj, features_df):
    """
    Optimized feature lookup creation using Polars operations.
    """
    study_obj.logger.debug("Creating optimized feature lookup...")
    start_time = time.time()

    # Use Polars select for faster conversion
    feature_columns = [
        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
        "ms2_scans", "adduct", "adduct_mass"
    ]

    # Filter to only existing columns
    existing_columns = [col for col in feature_columns if col in features_df.columns]

    # Convert to dictionary more efficiently
    selected_df = features_df.select(existing_columns)

    features_lookup = {}
    for row in selected_df.iter_rows(named=True):
        feature_uid = row["feature_uid"]
        # Keep feature_uid in the dictionary for chunked merge compatibility
        features_lookup[feature_uid] = {k: v for k, v in row.items()}

    lookup_time = time.time() - start_time
    if len(features_lookup) > 50000:
        study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
    return features_lookup

def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
    """
    Optimized O(n log n) adduct grouping using spatial indexing.

    Args:
        study_obj: Study object with logger
        consensus_data: List of consensus feature dictionaries
        rt_tol: RT tolerance in minutes
        mz_tol: m/z tolerance in Da

    Returns:
        Tuple of (adduct_group_list, adduct_of_list)
    """
    if not consensus_data:
        return [], []

    n_features = len(consensus_data)
    if n_features > 10000:
        study_obj.logger.info(f"Adduct grouping for {n_features} consensus features...")
    else:
        study_obj.logger.debug(f"Adduct grouping for {n_features} consensus features...")

    # Build spatial index using RT and neutral mass as coordinates
    features_by_mass = defaultdict(list)
    mass_bin_size = mz_tol * 2  # 2x tolerance for conservative binning

    valid_features = []
    for feature in consensus_data:
        consensus_uid = feature["consensus_uid"]
        rt = feature["rt"]
        neutral_mass = feature.get("adduct_mass_neutral_top")
        intensity = feature.get("inty_mean", 0)
        adduct = feature.get("adduct_top", "")

        if neutral_mass is not None:
            mass_bin = int(neutral_mass / mass_bin_size)
            features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
            valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))

    # Union-Find for efficient grouping
    class UnionFind:
        def __init__(self, n):
            self.parent = list(range(n))
            self.rank = [0] * n

        def find(self, x):
            if self.parent[x] != x:
                self.parent[x] = self.find(self.parent[x])
            return self.parent[x]

        def union(self, x, y):
            px, py = self.find(x), self.find(y)
            if px == py:
                return
            if self.rank[px] < self.rank[py]:
                px, py = py, px
            self.parent[py] = px
            if self.rank[px] == self.rank[py]:
                self.rank[px] += 1

    uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
    uf = UnionFind(len(valid_features))

    # Find groups using spatial index
    checked_pairs = set()
    for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
        for bin_offset in [-1, 0, 1]:
            check_bin = bin1 + bin_offset
            if check_bin not in features_by_mass:
                continue

            for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
                if uid1 >= uid2:
                    continue

                pair = (min(uid1, uid2), max(uid1, uid2))
                if pair in checked_pairs:
                    continue
                checked_pairs.add(pair)

                mass_diff = abs(mass1 - mass2)
                rt_diff = abs(rt1 - rt2) / 60.0  # Convert to minutes

                if mass_diff <= mz_tol and rt_diff <= rt_tol:
                    j = uid_to_idx[uid2]
                    uf.union(i, j)

    # Extract groups
    groups_by_root = defaultdict(list)
    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
        root = uf.find(i)
        groups_by_root[root].append((uid, rt, mass, inty, adduct))

    groups = {}
    group_id = 1
    assigned_groups = {}

    for group_members in groups_by_root.values():
        member_uids = [uid for uid, _, _, _, _ in group_members]

        for uid in member_uids:
            assigned_groups[uid] = group_id
        groups[group_id] = member_uids
        group_id += 1

    # Handle features without neutral mass
    for feature in consensus_data:
        uid = feature["consensus_uid"]
        if uid not in assigned_groups:
            assigned_groups[uid] = group_id
            groups[group_id] = [uid]
            group_id += 1

    # Determine adduct_of for each group
    group_adduct_of = {}
    for grp_id, member_uids in groups.items():
        best_uid = None
        best_priority = -1
        best_intensity = 0

        for uid in member_uids:
            feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
            if not feature_data:
                continue

            adduct = feature_data.get("adduct_top", "")
            intensity = feature_data.get("inty_mean", 0)

            priority = 0
            if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
                priority = 3
            elif adduct and "[M-H]" in adduct:
                priority = 2
            elif adduct and "M" in adduct:
                priority = 1

            if priority > best_priority or (priority == best_priority and intensity > best_intensity):
                best_uid = uid
                best_priority = priority
                best_intensity = intensity

        group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]

    # Build final lists in same order as consensus_data
    adduct_group_list = []
    adduct_of_list = []

    for feature in consensus_data:
        uid = feature["consensus_uid"]
        group = assigned_groups.get(uid, 0)
        adduct_of = group_adduct_of.get(group, uid)

        adduct_group_list.append(group)
        adduct_of_list.append(adduct_of)

    if n_features > 10000:
        study_obj.logger.info("Adduct grouping completed.")
    else:
        study_obj.logger.debug("Adduct grouping completed.")

    return adduct_group_list, adduct_of_list
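A minimal usage sketch for the grouping helper, with a stub standing in for the Study object (only a `.logger` attribute is needed by the function). The import path comes from the file added in this diff; `_optimized_adduct_grouping` is a private helper, and all values below are illustrative:

import logging
from types import SimpleNamespace
from masster.study.merge import _optimized_adduct_grouping  # private helper, illustration only

study_stub = SimpleNamespace(logger=logging.getLogger("merge-demo"))

consensus_data = [
    {"consensus_uid": 1, "rt": 120.0, "adduct_mass_neutral_top": 300.1332, "adduct_top": "[M+H]+", "inty_mean": 1.0e6},
    {"consensus_uid": 2, "rt": 121.0, "adduct_mass_neutral_top": 300.1335, "adduct_top": "[M+Na]+", "inty_mean": 2.0e5},
    {"consensus_uid": 3, "rt": 400.0, "adduct_mass_neutral_top": 512.2670, "adduct_top": "[M+H]+", "inty_mean": 5.0e5},
]

groups, adduct_of = _optimized_adduct_grouping(study_stub, consensus_data, rt_tol=2.0, mz_tol=0.01)
print(groups)     # e.g. [1, 1, 2] -> features 1 and 2 share a neutral mass and RT within tolerance
print(adduct_of)  # e.g. [1, 1, 3] -> the [M+H]+ member is chosen as each group's representative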