masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/h5.py +11 -11
- masster/sample/helpers.py +2 -2
- masster/sample/load.py +10 -8
- masster/sample/processing.py +1 -1
- masster/sample/sample.py +7 -3
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +230 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +95 -73
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/RECORD +24 -25
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.4.dist-info}/licenses/LICENSE +0 -0
masster/study/merge.py
CHANGED
@@ -1,6 +1,6 @@
 """
 Unified merge module for the Study class.
-Supports multiple merge methods: 'kd', 'qt', '
+Supports multiple merge methods: 'kd', 'qt', 'kd_chunked', 'qt_chunked'
 """

 import time
@@ -12,6 +12,7 @@ import pyopenms as oms
 import polars as pl
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from concurrent.futures.process import BrokenProcessPool
+from scipy.spatial import cKDTree
 from masster.study.defaults import merge_defaults


@@ -115,47 +116,6 @@ def _process_kd_chunk_parallel(chunk_data):
     return chunk_start_idx, consensus_features


-def _deserialize_consensus_features(consensus_features):
-    """
-    Deserialize consensus features back into an OpenMS ConsensusMap.
-
-    Args:
-        consensus_features: List of serialized consensus feature dictionaries
-
-    Returns:
-        OpenMS ConsensusMap object
-    """
-    import pyopenms as oms
-
-    consensus_map = oms.ConsensusMap()
-
-    for feature_data in consensus_features:
-        consensus_feature = oms.ConsensusFeature()
-        consensus_feature.setRT(float(feature_data['rt']))
-        consensus_feature.setMZ(float(feature_data['mz']))
-        consensus_feature.setIntensity(float(feature_data['intensity']))
-        consensus_feature.setQuality(float(feature_data['quality']))
-        consensus_feature.setUniqueId(int(feature_data['unique_id']))
-
-        # Reconstruct feature handles (simplified approach)
-        feature_handles = []
-        for handle_data in feature_data['features']:
-            feature_handle = oms.FeatureHandle()
-            feature_handle.setUniqueId(int(handle_data['unique_id']))
-            feature_handle.setMapIndex(int(handle_data['map_index']))
-            feature_handles.append(feature_handle)
-
-        # Set the feature list - properly add feature handles back to consensus feature
-        if feature_handles:
-            # Add each feature handle to the consensus feature using the correct OpenMS API
-            for feature_handle in feature_handles:
-                consensus_feature.getFeatureList().append(feature_handle)
-
-        consensus_map.push_back(consensus_feature)
-
-    return consensus_map
-
-
 def _process_qt_chunk_parallel(chunk_data):
     """
     Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
@@ -222,7 +182,8 @@ def _process_qt_chunk_parallel(chunk_data):
     chunk_params.setValue("distance_MZ:unit", "Da")
     chunk_params.setValue("ignore_charge", "true")
     chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
-
+
+
     grouper.setParameters(chunk_params)
     grouper.group(chunk_maps, chunk_consensus_map)

@@ -251,29 +212,6 @@ def _process_qt_chunk_parallel(chunk_data):
     return chunk_start_idx, consensus_features


-def _serialize_feature_map(feature_map):
-    """
-    Serialize a FeatureMap to a list of dictionaries for multiprocessing.
-
-    Args:
-        feature_map: OpenMS FeatureMap object
-
-    Returns:
-        List of feature dictionaries
-    """
-    features_data = []
-    for feature in feature_map:
-        feature_data = {
-            'rt': feature.getRT(),
-            'mz': feature.getMZ(),
-            'intensity': feature.getIntensity(),
-            'charge': feature.getCharge(),
-            'unique_id': feature.getUniqueId()
-        }
-        features_data.append(feature_data)
-    return features_data
-
-
 def merge(study, **kwargs) -> None:
     """
     Group features across samples into consensus features using various algorithms.
@@ -285,74 +223,155 @@ def merge(study, **kwargs) -> None:
     ----------
     **kwargs : dict
         Parameters from merge_defaults class:
-        - method : str, default '
-          Merge algorithm: '
-        - min_samples : int, default
+        - method : str, default 'kd'
+          Merge algorithm: 'kd', 'qt', 'kd_chunked', 'qt_chunked'
+        - min_samples : int, default 2
           Minimum number of samples for consensus feature
-        - rt_tol : float, default
+        - rt_tol : float, default 5.0
          RT tolerance in seconds
        - mz_tol : float, default 0.01
          m/z tolerance in Da (Daltons) for all methods
        - chunk_size : int, default 500
-          Chunk size for
-        -
-
-        -
+          Chunk size for chunked methods
+        - dechunking : str, default 'hierarchical'
+          Cross-chunk merging algorithm: 'hierarchical', 'kdtree', 'qt', 'none'
+        - threads : int, default None
+          Number of parallel processes for chunked methods (None=sequential)
+        - nr_partitions : int, default 1000
          Number of partitions in m/z dimension for KD algorithms
-        - min_rel_cc_size : float, default 0.
-          Minimum relative connected component size for conflict resolution
-        - max_pairwise_log_fc : float, default 0
-          Maximum pairwise log fold change for conflict resolution
+        - min_rel_cc_size : float, default 0.1
+          Minimum relative connected component size for conflict resolution (chunked only)
+        - max_pairwise_log_fc : float, default -1.0
+          Maximum pairwise log fold change for conflict resolution (chunked only)
        - max_nr_conflicts : int, default 0
-          Maximum number of conflicts allowed in consensus feature
+          Maximum number of conflicts allowed in consensus feature (chunked only)
        - link_ms2 : bool, default True
          Whether to link MS2 spectra to consensus features
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        - extract_ms1 : bool, default True
+          Whether to extract MS1 spectra for consensus features
+
+    Algorithm Selection Guide
+    ------------------------
+    Choose your merge method based on dataset size and performance requirements:
+
+    **KD (K-D Tree)** - *Recommended Default*
+    - Fast O(n log n) algorithm with RT warping
+    - Best balance of speed, accuracy, and memory usage
+    - Suitable for most dataset sizes (50 - 5,000 samples)
+    - Uses spatial partitioning for efficient feature matching
+
+    **QT (Quality Threshold)**
+    - Thorough O(n²) clustering algorithm
+    - Most accurate but slowest method
+    - Recommended for small datasets (<1,000 samples)
+    - Guarantees quality threshold constraints
+
+    **KD-Chunked** - *For Large Datasets*
+    - Memory-optimized KD algorithm for very large datasets (>5,000 samples)
+    - Processes data in chunks with cross-chunk consensus building
+    - Supports parallel processing with threads parameter
+    - Maintains high feature recovery through hierarchical dechunking
+
+    **QT-Chunked** - *For Large Datasets with Maximum Accuracy*
+    - Memory-optimized QT algorithm for very large datasets (>5,000 samples)
+    - Uses QT clustering within chunks, then cross-chunk consensus
+    - Slowest but most thorough for large datasets
+    - Best when accuracy is more important than speed
+
+    Cross-Chunk Merging (Dechunking) Methods
+    ----------------------------------------
+    For chunked methods, choose dechunking algorithm based on your priorities:
+
+    **Hierarchical** - *Recommended Default*
+    - Priority-based merging starting from high sample count features
+    - Achieves ~97% feature recovery vs original ~10% recovery
+    - Best overall balance of recovery and accuracy
+
+    **KDTree** - *High Sample Feature Preservation*
+    - Spatial indexing approach optimized for frequent features
+    - ~95% high sample count feature recovery
+    - Best for preserving features present in many samples
+
+    Performance Guidelines
+    ---------------------
+    - **Small datasets (≤1,000 samples)**: Use 'qt' for maximum accuracy
+    - **Medium datasets (1,000-5,000 samples)**: Use 'kd' (default)
+    - **Large datasets (>5,000 samples)**: Use 'kd_chunked' or 'qt_chunked'
+    - **Memory constrained**: Use chunked methods with smaller chunk_size
+    - **Time constrained**: Use 'kd' or 'kd_chunked' with hierarchical dechunking

     Parallel Processing
     ------------------
-
-
-
-
-
-
+    Chunked methods support parallel processing:
+    - Set threads=N (where N is number of CPU cores to use)
+    - Recommended: threads=4 to 8 for most systems
+    - Each chunk is processed independently in parallel
+    - Significantly reduces processing time for large datasets
+
+    Tolerance Settings
+    -----------------
+    - **rt_tol**: RT tolerance in seconds (typical range: 1-10s)
+      - Smaller values: more specific, may fragment features
+      - Larger values: more permissive, may merge distinct features
+    - **mz_tol**: m/z tolerance in Daltons (typical range: 0.005-0.05 Da)
+      - High-resolution MS: 0.005-0.01 Da
+      - Lower resolution MS: 0.01-0.05 Da
+
+    Examples
+    --------
+    Basic usage with default KD algorithm:
+        study.merge()
+
+    High-accuracy small dataset:
+        study.merge(method='qt', rt_tol=2.0, mz_tol=0.005, min_samples=5)
+
+    Large dataset with parallel processing:
+        study.merge(method='kd_chunked', threads=8, chunk_size=500,
+                    dechunking='hierarchical')
+
+    Custom tolerances for specific instrument:
+        study.merge(method='kd', rt_tol=1.5, mz_tol=0.01, min_samples=10)
+
+    Notes
+    -----
+    - Features must be loaded before merging (study.load_features())
+    - Results are stored in study.consensus_df and study.consensus_mapping_df
+    - Merge parameters are saved to study history for reproducibility
+    - MS2 spectra are automatically linked when link_ms2=True
+    - Adduct relationships are identified and stored after merging
     """
     start_time = time.time()

     # Initialize with defaults and override with kwargs
-    params = merge_defaults()
-
-    #
-
+    params = merge_defaults()
+
+    # Handle 'params' keyword argument specifically (like merge does)
+    if 'params' in kwargs:
+        provided_params = kwargs.pop('params')
+        if isinstance(provided_params, merge_defaults):
+            params = provided_params
+            study.logger.debug("Using provided merge_defaults parameters from 'params' argument")
+        else:
+            study.logger.warning("'params' argument is not an merge_defaults instance, ignoring")
+
+    # Process remaining kwargs
     for key, value in kwargs.items():
-        if
-
+        if isinstance(value, merge_defaults):
+            params = value
+            study.logger.debug("Using provided merge_defaults parameters")
         else:
-
-
+            if hasattr(params, key):
+                if params.set(key, value, validate=True):
+                    study.logger.debug(f"Updated parameter {key} = {value}")
+                else:
+                    study.logger.warning(
+                        f"Failed to set parameter {key} = {value} (validation failed)",
+                    )
+            else:
+                study.logger.warning(f"Unknown parameter '{key}' ignored")
+
     # Backward compatibility: Map old method names to new names
     method_mapping = {
-        'kd': 'sensitivity',
-        'kd-nowarp': 'nowarp',
-        'kd_nowarp': 'nowarp',
-        'kd-strict': 'quality',
-        'kd_strict': 'quality',
-        'kdstrict': 'quality',
-        'chunked': 'kd_chunked',  # Map old 'chunked' to 'kd_chunked'
         'qtchunked': 'qt_chunked',  # QT chunked variants
         'qt-chunked': 'qt_chunked',
         'kdchunked': 'kd_chunked',  # KD chunked variants
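Note on the reworked parameter handling in the hunk above: a pre-built merge_defaults object can now be passed via the 'params' keyword, and remaining keywords are validated through params.set(). A minimal usage sketch (assuming a Study instance with features already loaded; this example is illustrative and not taken verbatim from the package):

    from masster.study.defaults import merge_defaults

    defaults = merge_defaults()
    defaults.set("rt_tol", 3.0, validate=True)     # validated update, same call merge() uses internally
    defaults.set("min_samples", 3, validate=True)
    study.merge(params=defaults, link_ms2=True)    # 'params' is applied first, extra kwargs override it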
@@ -365,18 +384,28 @@ def merge(study, **kwargs) -> None:
         study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")

     # Validate method
-    if params.method not in ['
-        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['
-
+    if params.method not in ['kd', 'qt', 'kd_chunked', 'qt_chunked']:
+        raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['kd', 'qt', 'kd_chunked', 'qt_chunked']")
+
     # Check if chunked method is advisable for large datasets
     num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
+    if num_samples == 0:
+        raise ValueError("No samples loaded in study. Load features before merging.")
+    if params.method == 'kd' and num_samples > params.chunk_size:
+        params.method = 'kd_chunked'
+        study.logger.info(
+            f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
+        )
+    if params.method == 'qt' and num_samples > params.chunk_size:
+        params.method = 'qt_chunked'
+        study.logger.info(
+            f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
+        )
+
     if num_samples > 500:
-
-        if params.method not in chunked_methods:
+        if params.method not in {'kd_chunked', 'qt_chunked'}:
             study.logger.warning(
-                f"Large dataset detected ({num_samples} samples > 500). "
-                f"For better performance and memory efficiency, consider using a chunked method: "
-                f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
+                f"Large dataset detected ({num_samples} samples > 500). Consider dropping chunk_size to 500 to use chunked methods."
             )

     # Persist last used params for diagnostics
@@ -403,8 +432,10 @@ def merge(study, **kwargs) -> None:
         f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
     )

-    # Initialize
-
+    # Initialize
+    study.consensus_df = pl.DataFrame()
+    study.consensus_ms2 = pl.DataFrame()
+    study.consensus_mapping_df = pl.DataFrame()

     # Cache adducts for performance (avoid repeated _get_adducts() calls)
     cached_adducts_df = None
@@ -424,7 +455,7 @@ def merge(study, **kwargs) -> None:
         cached_valid_adducts.add("?")

     # Route to algorithm implementation
-    if params.method == '
+    if params.method == 'kd':
         consensus_map = _merge_kd(study, params)
         # Extract consensus features
         _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
@@ -432,13 +463,6 @@ def merge(study, **kwargs) -> None:
         consensus_map = _merge_qt(study, params)
         # Extract consensus features
         _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
-    elif params.method == 'nowarp':
-        consensus_map = _merge_kd_nowarp(study, params)
-        # Extract consensus features
-        _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
-    elif params.method == 'quality':
-        consensus_map = _merge_kd_strict(study, params)
-        # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
     elif params.method == 'kd_chunked':
         consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
         # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
@@ -446,19 +470,23 @@ def merge(study, **kwargs) -> None:
         consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
         # Note: _merge_qt_chunked populates consensus_df directly, no need to extract

-    # Enhanced post-clustering to merge over-segmented features (for
-
-
+    # Enhanced post-clustering to merge over-segmented features (for non-chunked methods)
+    # Chunked methods already perform their own cross-chunk consensus building
+    if params.method in ['qt', 'kd']:
+        __consensus_cleanup(study, params.rt_tol, params.mz_tol)

     # Perform adduct grouping
     _perform_adduct_grouping(study, params.rt_tol, params.mz_tol)

     # Identify coeluting consensus features by mass shifts and update adduct information
-
+    __identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
+
+    # Post-processing for chunked methods: merge partial consensus features
+    if params.method in ['qt_chunked', 'kd_chunked']:
+        _merge_partial_consensus_features(study, params.rt_tol, params.mz_tol)

-    #
-
-    _finalize_merge(study, params.link_ms2, params.min_samples)
+    # Finalize merge: filter by min_samples and add isotope/MS2 data
+    __finalize_merge(study, params.link_ms2, params.extract_ms1, params.min_samples)

     # Log completion without the misleading feature count
     elapsed = time.time() - start_time
@@ -494,10 +522,6 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
     params_oms.setValue("warp:mz_tol", params.mz_tol)
     params_oms.setValue("link:rt_tol", params.rt_tol)
     params_oms.setValue("link:mz_tol", params.mz_tol)
-    #params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
-    #params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
-    #params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
-    #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH

     grouper.setParameters(params_oms)
     grouper.group(temp_feature_maps, consensus_map)
@@ -505,92 +529,6 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


-def _generate_feature_maps_from_samples(study):
-    """
-    Generate feature maps using Study-level features_df instead of Sample-level loading.
-    This uses the study's existing features_df which is already loaded.
-
-    Args:
-        study: Study object containing features_df
-
-    Returns:
-        list: List of temporary FeatureMap objects built from Study-level data
-    """
-    import pyopenms as oms
-
-    temp_feature_maps = []
-
-    study.logger.info(f"Building feature maps using Study-level features_df from {len(study.samples_df)} samples")
-
-    # Use the features_df from the study that's already loaded
-    if not hasattr(study, 'features_df') or study.features_df is None or study.features_df.is_empty():
-        study.logger.warning("No features_df available - features must be loaded first")
-        return temp_feature_maps
-
-    # Group features by sample
-    study.logger.info(f"Processing {len(study.features_df)} features grouped by sample")
-
-    # Get unique sample names/indices
-    if 'sample_uid' in study.features_df.columns:
-        sample_groups = study.features_df.group_by('sample_uid')
-        study.logger.debug("Grouping features by 'sample_uid' column")
-    elif 'sample_id' in study.features_df.columns:
-        sample_groups = study.features_df.group_by('sample_id')
-        study.logger.debug("Grouping features by 'sample_id' column")
-    elif 'sample' in study.features_df.columns:
-        sample_groups = study.features_df.group_by('sample')
-        study.logger.debug("Grouping features by 'sample' column")
-    else:
-        study.logger.warning("No sample grouping column found in features_df")
-        study.logger.info(f"Available columns: {study.features_df.columns}")
-        return temp_feature_maps
-
-    # Process each sample group
-    processed_samples = 0
-    for sample_key, sample_features in sample_groups:
-        try:
-            feature_map = oms.FeatureMap()
-            feature_count = 0
-
-            # Build features from this sample's features
-            for row in sample_features.iter_rows(named=True):
-                try:
-                    feature = oms.Feature()
-
-                    # Set feature properties
-                    if row.get("feature_id") is not None:
-                        feature.setUniqueId(int(row["feature_id"]))
-                    if row.get("mz") is not None:
-                        feature.setMZ(float(row["mz"]))
-                    if row.get("rt") is not None:
-                        feature.setRT(float(row["rt"]))
-                    if row.get("inty") is not None:
-                        feature.setIntensity(float(row["inty"]))
-                    if row.get("quality") is not None:
-                        feature.setOverallQuality(float(row["quality"]))
-                    if row.get("charge") is not None:
-                        feature.setCharge(int(row["charge"]))
-
-                    feature_map.push_back(feature)
-                    feature_count += 1
-
-                except (ValueError, TypeError) as e:
-                    study.logger.warning(f"Skipping feature in sample {sample_key} due to conversion error: {e}")
-                    continue
-
-            temp_feature_maps.append(feature_map)
-            processed_samples += 1
-            study.logger.debug(f"Built feature map for sample {sample_key} with {feature_count} features")
-
-        except Exception as e:
-            study.logger.warning(f"Failed to process sample group {sample_key}: {e}")
-            # Add empty feature map for failed samples to maintain sample order
-            temp_feature_maps.append(oms.FeatureMap())
-
-    study.logger.info(f"Generated {len(temp_feature_maps)} feature maps from {processed_samples} samples using Study-level features_df")
-    return temp_feature_maps
-
-
 def _generate_feature_maps_on_demand(study):
     """
     Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
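For orientation, both the removed helper above and the surviving _generate_feature_maps_on_demand follow the same pattern: convert rows of features_df into pyopenms Feature objects and collect them in a FeatureMap per sample. A minimal sketch of that pattern using the same pyopenms calls (the helper name and argument shape here are illustrative, not part of the package):

    import pyopenms as oms

    def build_feature_map(rows):
        """Convert an iterable of feature dicts with 'rt', 'mz', 'inty' keys into an OpenMS FeatureMap."""
        fmap = oms.FeatureMap()
        for row in rows:
            feat = oms.Feature()
            feat.setRT(float(row["rt"]))            # retention time in seconds
            feat.setMZ(float(row["mz"]))            # m/z in Da
            feat.setIntensity(float(row["inty"]))   # peak intensity
            fmap.push_back(feat)
        return fmap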
@@ -610,9 +548,9 @@ def _generate_feature_maps_on_demand(study):
     use_sample_loading = True  # Default to Sample-level loading as requested

     # Use Sample-level loading if requested and samples_df is available
-    if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
-
-
+    #if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
+    #    study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
+    #    return _generate_feature_maps_from_samples(study)

     # Fallback to original features_df approach
     if study.features_df is None or len(study.features_df) == 0:
@@ -750,9 +688,6 @@ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
     params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
     params_oms.setValue("distance_MZ:unit", "Da")  # QT now uses Da like all other methods
     params_oms.setValue("ignore_charge", "true")
-    #params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
-    #params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
-    #params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
     params_oms.setValue("nr_partitions", params.nr_partitions)

     grouper.setParameters(params_oms)
@@ -761,534 +696,6 @@ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
     return consensus_map


-def _merge_kd_strict(study, params: merge_defaults) -> oms.ConsensusMap:
-    """
-    Quality merge: Standard KD algorithm with post-processing quality control.
-
-    This method combines the sensitivity of KD clustering with post-processing steps
-    to reduce oversegmentation while maintaining high-quality consensus features.
-    This is the recommended default method.
-
-    Post-processing features:
-    1. RT tolerance optimization (optional)
-    2. Secondary clustering for close features
-    3. Sample overlap validation
-    4. RT spread quality filtering
-    5. Chromatographic coherence validation
-
-    Additional parameters supported in params:
-    - optimize_rt_tol: bool - Enable RT tolerance optimization
-    - rt_tol_range: tuple - RT tolerance range for optimization (min, max)
-    - secondary_merge_rt_tol: float - Secondary merge RT tolerance (default: 0.5s)
-    - secondary_merge_mz_tol: float - Secondary merge m/z tolerance (default: 0.005)
-    - min_sample_overlap: float - Minimum sample overlap for merging (0.0-1.0, default: 0.8)
-    - max_rt_spread: float - Maximum RT spread allowed (default: 2x rt_tol)
-    - min_coherence: float - Minimum chromatographic coherence (default: 0.0, disabled)
-    """
-
-    # Check for RT tolerance optimization
-    optimize_rt_tol = getattr(params, 'optimize_rt_tol', False)
-
-    if optimize_rt_tol:
-        # Optimize RT tolerance first
-        optimal_rt_tol = _optimize_rt_tolerance(study, params)
-        study.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
-        # Create modified params with optimal RT tolerance
-        import copy
-        optimized_params = copy.deepcopy(params)
-        optimized_params.rt_tol = optimal_rt_tol
-    else:
-        optimized_params = params
-
-    # Phase 1: Standard KD clustering
-    study.logger.debug("Initial KD clustering")
-    consensus_map = _merge_kd(study, optimized_params)
-
-    # Phase 2: Post-processing quality control
-    study.logger.debug("Post-processing quality control")
-    consensus_map = _apply_kd_strict_postprocessing(study, consensus_map, optimized_params)
-
-    return consensus_map
-
-
-def _optimize_rt_tolerance(study, params: merge_defaults) -> float:
-    """
-    Optimize RT tolerance by testing different values and measuring oversegmentation.
-
-    Args:
-        study: Study object
-        params: Merge parameters
-
-    Returns:
-        Optimal RT tolerance value
-    """
-    rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
-    rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
-
-    study.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
-
-    # Generate test values
-    test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
-                    for i in range(rt_tol_steps)]
-
-    best_rt_tol = params.rt_tol
-    best_score = float('inf')
-
-    # Store original features for restoration
-    original_consensus_df = getattr(study, 'consensus_df', pl.DataFrame())
-    original_consensus_mapping_df = getattr(study, 'consensus_mapping_df', pl.DataFrame())
-
-    for test_rt_tol in test_rt_tols:
-        try:
-            # Create test parameters
-            import copy
-            test_params = copy.deepcopy(params)
-            test_params.rt_tol = test_rt_tol
-
-            # Run KD merge with test parameters
-            test_consensus_map = _merge_kd(study, test_params)
-
-            # Extract consensus features temporarily for analysis
-            _extract_consensus_features(study, test_consensus_map, test_params.min_samples)
-
-            if len(study.consensus_df) == 0:
-                continue
-
-            # Calculate oversegmentation metrics
-            oversegmentation_score = _calculate_oversegmentation_score(study, test_rt_tol)
-
-            study.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(study.consensus_df)} features, score: {oversegmentation_score:.3f}")
-
-            # Lower score is better (less oversegmentation)
-            if oversegmentation_score < best_score:
-                best_score = oversegmentation_score
-                best_rt_tol = test_rt_tol
-
-        except Exception as e:
-            study.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
-            continue
-
-    # Restore original consensus data
-    study.consensus_df = original_consensus_df
-    study.consensus_mapping_df = original_consensus_mapping_df
-
-    study.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
-    return best_rt_tol
-
-
-def _calculate_oversegmentation_score(study, rt_tol: float) -> float:
-    """
-    Calculate oversegmentation score based on feature density and RT spread metrics.
-    Lower scores indicate less oversegmentation.
-
-    Args:
-        study: Study object
-        rt_tol: RT tolerance used
-
-    Returns:
-        Oversegmentation score (lower = better)
-    """
-    if len(study.consensus_df) == 0:
-        return float('inf')
-
-    # Metric 1: Feature density (features per RT second)
-    rt_range = study.consensus_df['rt'].max() - study.consensus_df['rt'].min()
-    if rt_range <= 0:
-        return float('inf')
-
-    feature_density = len(study.consensus_df) / rt_range
-
-    # Metric 2: Average RT spread relative to tolerance
-    rt_spreads = (study.consensus_df['rt_max'] - study.consensus_df['rt_min'])
-    avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
-
-    # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
-    low_sample_features = len(study.consensus_df.filter(pl.col('number_samples') <= 5))
-    low_sample_ratio = low_sample_features / len(study.consensus_df)
-
-    # Metric 4: Number of features with excessive RT spread
-    excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
-    excessive_spread_ratio = excessive_spread_features / len(study.consensus_df)
-
-    # Combined score (weighted combination)
-    oversegmentation_score = (
-        0.4 * (feature_density / 10.0) +  # Normalize to reasonable scale
-        0.3 * avg_rt_spread_ratio +
-        0.2 * low_sample_ratio +
-        0.1 * excessive_spread_ratio
-    )
-
-    return oversegmentation_score
-
-
-def _apply_kd_strict_postprocessing(study, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
-    """
-    Apply post-processing quality control to KD consensus map.
-
-    Args:
-        consensus_map: Initial consensus map from KD
-        params: Merge parameters with kd-strict options
-
-    Returns:
-        Processed consensus map with reduced oversegmentation
-    """
-    if consensus_map.size() == 0:
-        study.logger.warning("Empty consensus map provided to post-processing")
-        return consensus_map
-
-    study.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
-
-    # Step 1: Extract initial consensus features
-    original_min_samples = params.min_samples
-    params.min_samples = 1  # Extract all features initially
-
-    _extract_consensus_features(study, consensus_map, params.min_samples)
-    initial_feature_count = len(study.consensus_df)
-
-    if initial_feature_count == 0:
-        study.logger.warning("No consensus features extracted for post-processing")
-        params.min_samples = original_min_samples
-        return consensus_map
-
-    # Step 2: Secondary clustering for close features
-    secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
-    secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
-
-    study.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
-    merged_features = _perform_secondary_clustering(study, secondary_merge_rt_tol, secondary_merge_mz_tol)
-
-    # Step 3: Sample overlap validation
-    min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
-    if min_sample_overlap > 0:
-        study.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
-        merged_features = _validate_sample_overlap(study, merged_features, min_sample_overlap)
-
-    # Step 4: RT spread quality filtering
-    if params.rt_tol is not None:
-        max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
-        if max_rt_spread is not None:
-            study.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
-            merged_features = _filter_rt_spread(study, merged_features, max_rt_spread)
-        else:
-            study.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
-    else:
-        study.logger.debug("Skipping RT spread filtering - rt_tol is None")
-
-    # Step 5: Chromatographic coherence filtering (optional)
-    min_coherence = getattr(params, 'min_coherence', 0.0)
-    if min_coherence > 0:
-        study.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
-        merged_features = _filter_coherence(study, merged_features, min_coherence)
-
-    # Step 6: Rebuild consensus_df with filtered features and preserve mapping
-    original_mapping_df = study.consensus_mapping_df.clone()  # Save original mapping
-    study.consensus_df = pl.DataFrame(merged_features, strict=False)
-
-    # Step 7: Apply original min_samples filter
-    params.min_samples = original_min_samples
-    if params.min_samples > 1:
-        l1 = len(study.consensus_df)
-        study.consensus_df = study.consensus_df.filter(
-            pl.col("number_samples") >= params.min_samples
-        )
-        filtered_count = l1 - len(study.consensus_df)
-        if filtered_count > 0:
-            study.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
-
-    # Step 8: Update consensus_mapping_df to match final consensus_df
-    if len(study.consensus_df) > 0 and len(original_mapping_df) > 0:
-        valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
-        study.consensus_mapping_df = original_mapping_df.filter(
-            pl.col('consensus_uid').is_in(list(valid_consensus_ids))
-        )
-    else:
-        study.consensus_mapping_df = pl.DataFrame()
-
-    final_feature_count = len(study.consensus_df)
-    reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
-
-    study.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
-
-    # Create a new consensus map for compatibility (the processed data is in consensus_df)
-    processed_consensus_map = oms.ConsensusMap()
-    return processed_consensus_map
-
-
-def _perform_secondary_clustering(study, rt_tol: float, mz_tol: float) -> list:
-    """
-    Perform secondary clustering to merge very close features.
-
-    Args:
-        rt_tol: RT tolerance for secondary clustering
-        mz_tol: m/z tolerance for secondary clustering
-
-    Returns:
-        List of merged consensus feature dictionaries
-    """
-    if len(study.consensus_df) == 0:
-        return []
-
-    # Convert consensus_df to list of dictionaries for clustering
-    consensus_features = []
-    for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
-        consensus_features.append(dict(row))
-
-    # Use Union-Find for efficient clustering
-    class UnionFind:
-        def __init__(study, n):
-            study.parent = list(range(n))
-            study.rank = [0] * n
-
-        def find(study, x):
-            if study.parent[x] != x:
-                study.parent[x] = study.find(study.parent[x])
-            return study.parent[x]
-
-        def union(study, x, y):
-            px, py = study.find(x), study.find(y)
-            if px == py:
-                return
-            if study.rank[px] < study.rank[py]:
-                px, py = py, px
-            study.parent[py] = px
-            if study.rank[px] == study.rank[py]:
-                study.rank[px] += 1
-
-    n_features = len(consensus_features)
-    uf = UnionFind(n_features)
-
-    # Find features to merge based on proximity
-    merge_count = 0
-    for i in range(n_features):
-        for j in range(i + 1, n_features):
-            feat_i = consensus_features[i]
-            feat_j = consensus_features[j]
-
-            rt_diff = abs(feat_i['rt'] - feat_j['rt'])
-            mz_diff = abs(feat_i['mz'] - feat_j['mz'])
-
-            if rt_diff <= rt_tol and mz_diff <= mz_tol:
-                uf.union(i, j)
-                merge_count += 1
-
-    # Group features by their root
-    groups_by_root = defaultdict(list)
-    for i in range(n_features):
-        root = uf.find(i)
-        groups_by_root[root].append(consensus_features[i])
-
-    # Merge features within each group
-    merged_features = []
-    for group in groups_by_root.values():
-        if len(group) == 1:
-            # Single feature - keep as is
-            merged_features.append(group[0])
-        else:
-            # Multiple features - merge them
-            merged_feature = _merge_feature_group(group)
-            merged_features.append(merged_feature)
-
-    study.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
-    return merged_features
-
-
-def _merge_feature_group(feature_group: list) -> dict:
-    """
-    Merge a group of similar consensus features into one.
-
-    Args:
-        feature_group: List of consensus feature dictionaries to merge
-
-    Returns:
-        Merged consensus feature dictionary
-    """
-    if not feature_group:
-        return {}
-
-    if len(feature_group) == 1:
-        return feature_group[0]
-
-    # Use the feature with highest sample count as base
-    base_feature = max(feature_group, key=lambda f: f.get('number_samples', 0))
-    merged = base_feature.copy()
-
-    # Aggregate numeric statistics
-    rt_values = [f['rt'] for f in feature_group if f.get('rt') is not None]
-    mz_values = [f['mz'] for f in feature_group if f.get('mz') is not None]
-    sample_counts = [f.get('number_samples', 0) for f in feature_group]
-    intensities = [f.get('inty_mean', 0) for f in feature_group if f.get('inty_mean') is not None]
-
-    # Update merged feature statistics
-    if rt_values:
-        merged['rt'] = float(np.mean(rt_values))
-        merged['rt_min'] = min([f.get('rt_min', f['rt']) for f in feature_group])
-        merged['rt_max'] = max([f.get('rt_max', f['rt']) for f in feature_group])
-        merged['rt_mean'] = float(np.mean(rt_values))
-
-    if mz_values:
-        merged['mz'] = float(np.mean(mz_values))
-        merged['mz_min'] = min([f.get('mz_min', f['mz']) for f in feature_group])
-        merged['mz_max'] = max([f.get('mz_max', f['mz']) for f in feature_group])
-        merged['mz_mean'] = float(np.mean(mz_values))
-
-    # Use maximum sample count (features might be detected in overlapping but different samples)
-    merged['number_samples'] = max(sample_counts)
-
-    # Use weighted average intensity (by sample count)
-    if intensities and sample_counts:
-        total_weight = sum(sample_counts)
-        if total_weight > 0:
-            weighted_intensity = sum(inty * count for inty, count in zip(intensities, sample_counts)) / total_weight
-            merged['inty_mean'] = float(weighted_intensity)
-
-    # Aggregate chromatographic quality metrics if available
-    coherence_values = [f.get('chrom_coherence_mean', 0) for f in feature_group if f.get('chrom_coherence_mean') is not None]
-    prominence_values = [f.get('chrom_prominence_mean', 0) for f in feature_group if f.get('chrom_prominence_mean') is not None]
-
-    if coherence_values:
-        merged['chrom_coherence_mean'] = float(np.mean(coherence_values))
-    if prominence_values:
-        merged['chrom_prominence_mean'] = float(np.mean(prominence_values))
-
-    # Merge MS2 counts
-    ms2_counts = [f.get('number_ms2', 0) for f in feature_group]
-    merged['number_ms2'] = sum(ms2_counts)
-
-    # Keep the best quality score
-    quality_scores = [f.get('quality', 1.0) for f in feature_group if f.get('quality') is not None]
-    if quality_scores:
-        merged['quality'] = max(quality_scores)
-
-    return merged
-
-
-def _validate_sample_overlap(study, features: list, min_overlap: float) -> list:
-    """
-    Validate that merged features have sufficient sample overlap.
-
-    Args:
-        features: List of consensus feature dictionaries
-        min_overlap: Minimum sample overlap ratio (0.0-1.0)
-
-    Returns:
-        List of validated features
-    """
-    # This is a placeholder for sample overlap validation
-    # Implementation would require access to which samples each feature appears in
-    # For now, we'll use a simple heuristic based on feature statistics
-
-    validated_features = []
-    for feature in features:
-        # Simple validation based on RT spread and sample count ratio
-        rt_spread = feature.get('rt_max', feature['rt']) - feature.get('rt_min', feature['rt'])
-        sample_count = feature.get('number_samples', 1)
-
-        # Features with very tight RT spread and high sample counts are more reliable
-        if rt_spread <= 2.0 or sample_count >= 10:  # More permissive validation
-            validated_features.append(feature)
-        else:
-            # Could implement more sophisticated sample overlap checking here
-            validated_features.append(feature)  # Keep for now
-
-    return validated_features
-
-
-def _filter_rt_spread(study, features: list, max_rt_spread: float) -> list:
-    """
-    Filter out features with excessive RT spread.
-
-    Args:
-        features: List of consensus feature dictionaries
-        max_rt_spread: Maximum allowed RT spread in seconds
-
-    Returns:
-        List of filtered features
-    """
-    filtered_features = []
-    filtered_count = 0
-
-    for feature in features:
-        rt_min = feature.get('rt_min', feature['rt'])
-        rt_max = feature.get('rt_max', feature['rt'])
-        rt_spread = rt_max - rt_min
-
-        if rt_spread <= max_rt_spread:
-            filtered_features.append(feature)
-        else:
-            filtered_count += 1
-
-    if filtered_count > 0:
-        study.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
-
-    return filtered_features
-
-
-def _filter_coherence(study, features: list, min_coherence: float) -> list:
-    """
-    Filter out features with low chromatographic coherence.
-
-    Args:
-        features: List of consensus feature dictionaries
-        min_coherence: Minimum chromatographic coherence score
-
-    Returns:
-        List of filtered features
-    """
-    filtered_features = []
-    filtered_count = 0
-
-    for feature in features:
-        coherence = feature.get('chrom_coherence_mean', 1.0)  # Default to high coherence if missing
-
-        if coherence >= min_coherence:
-            filtered_features.append(feature)
-        else:
-            filtered_count += 1
-
-    if filtered_count > 0:
-        study.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
-
-    return filtered_features
-
-
-def _merge_kd_nowarp(study, params: merge_defaults) -> oms.ConsensusMap:
-    """KD-tree based merge without RT warping"""
-
-    # Generate temporary feature maps on-demand from features_df
-    temp_feature_maps = _generate_feature_maps_on_demand(study)
-
-    consensus_map = oms.ConsensusMap()
-    file_descriptions = consensus_map.getColumnHeaders()
-
-    for i, feature_map in enumerate(temp_feature_maps):
-        file_description = file_descriptions.get(i, oms.ColumnHeader())
-        file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
-        file_description.size = feature_map.size()
-        file_description.unique_id = feature_map.getUniqueId()
-        file_descriptions[i] = file_description
-
-    consensus_map.setColumnHeaders(file_descriptions)
-
-    # Configure KD algorithm with warping disabled for memory efficiency
-    grouper = oms.FeatureGroupingAlgorithmKD()
-    params_oms = grouper.getParameters()
-
-    params_oms.setValue("mz_unit", "Da")
-    params_oms.setValue("nr_partitions", params.nr_partitions)
-    params_oms.setValue("warp:enabled", "false")  # Disabled for memory efficiency
-    params_oms.setValue("link:rt_tol", params.rt_tol)
-    params_oms.setValue("link:mz_tol", params.mz_tol)
-    params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
-    params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
-    params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
-    #params_oms.setValue("link:charge_merging", "Any")
-
-    grouper.setParameters(params_oms)
-    grouper.group(temp_feature_maps, consensus_map)
-
-    return consensus_map
-
-
 def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
     """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""

@@ -1462,7 +869,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:

     # Merge chunk results with proper cross-chunk consensus building
     # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
-
+    _dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)

     # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
     consensus_map = oms.ConsensusMap()
@@ -1637,14 +1044,14 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:

     # Merge chunk results with proper cross-chunk consensus building
     # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
-
+    _dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)

     # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
     consensus_map = oms.ConsensusMap()
     return consensus_map


-def
+def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
     """
     Scalable aggregation of chunk consensus maps into final consensus_df.

@@ -1672,7 +1079,7 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, ...):
         for row in study.features_df.iter_rows(named=True)
     }

-    features_lookup =
+    features_lookup = __merge_feature_lookup(study, study.features_df)

     # Extract all consensus features from chunks with their feature_uids
     all_chunk_consensus = []
@@ -1705,7 +1112,10 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, ...):
                 if feature_data:
                     feature_uids.append(feature_uid)
                     feature_data_list.append(feature_data)
-
+
+                    # Use feature_uid to lookup actual sample_uid instead of chunk position
+                    actual_sample_uid = feature_data['sample_uid']
+                    sample_uids.append(actual_sample_uid)

             if not feature_data_list:
                 # No retrievable feature metadata (possible stale map reference) -> skip
@@ -1725,13 +1135,6 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, ...):

         # Process the consensus features (now all in serialized format)
         for consensus_feature_data in consensus_features_data:
-            # ACCEPT ALL consensus features (size >=1) here.
-            # Reason: A feature that is globally present in many samples can still
-            # appear only once inside a given sample chunk. Early filtering at
-            # size>=2 causes irreversible loss and underestimates the final
-            # consensus count (observed ~296 vs 950 for KD). We defer filtering
-            # strictly to the final global min_samples.
-
             # For parallel processing, feature data is already extracted
             if isinstance(chunk_data, list):
                 # Extract feature_uids and data from serialized format for parallel processing
@@ -1749,11 +1152,14 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
                if feature_data:
                    feature_uids.append(feature_uid)
                    feature_data_list.append(feature_data)
-
+
+                    # Use feature_uid to lookup actual sample_uid instead of chunk position
+                    actual_sample_uid = feature_data['sample_uid']
+                    sample_uids.append(actual_sample_uid)

            if not feature_data_list:
                continue
-
+
            # Get RT/MZ from consensus feature data
            consensus_rt = consensus_feature_data['rt']
            consensus_mz = consensus_feature_data['mz']
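For orientation, the change above reads each member feature's sample_uid from its own serialized record instead of inferring it from the chunk position. The following sketch is illustrative only (it is not code from masster; the dictionary contents are invented) and shows why counting unique samples per consensus feature then becomes a direct lookup:

# Illustrative sketch: each serialized feature carries its own 'sample_uid',
# so a consensus row can count unique samples without knowing chunk layout.
features_lookup = {
    101: {"sample_uid": "S1", "rt": 120.2, "mz": 301.1432},
    202: {"sample_uid": "S2", "rt": 120.4, "mz": 301.1435},
    303: {"sample_uid": "S2", "rt": 120.3, "mz": 301.1437},
}

feature_uids, sample_uids = [], []
for feature_uid in (101, 202, 303):
    feature_data = features_lookup.get(feature_uid)
    if feature_data:
        feature_uids.append(feature_uid)
        # sample_uid comes from the feature itself, not from the chunk index
        sample_uids.append(feature_data["sample_uid"])

print(len(set(sample_uids)))  # 2 unique samples for this consensus feature
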
@@ -1818,163 +1224,301 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
            study.consensus_mapping_df = pl.DataFrame()
            return

-    #
-
-
-
-
-
+    # CROSS-CHUNK DECHUNKING ALGORITHMS
+    # Multiple algorithms available for combining chunk results
+
+    class HierarchicalAnchorMerger:
+        """
+        Hierarchical Anchor Merger: Comprehensive cross-chunk feature preservation.
+        Uses Union-Find clustering for transitive matching across multiple chunks.
        """
[... 40 removed lines (old 1828-1867) are not rendered in this diff view ...]
+        def __init__(self, rt_tol: float, mz_tol: float):
+            self.rt_tol = rt_tol
+            self.mz_tol = mz_tol
+
+        def merge(self, chunk_consensus_list: list) -> list:
+            """Fixed hierarchical merging with union-find clustering for complete feature preservation"""
+            if not chunk_consensus_list:
+                return []
+
+            study.logger.debug(f"FIXED HierarchicalAnchorMerger: processing {len(chunk_consensus_list)} chunk features")
+
+            # Union-Find data structure for transitive clustering
+            class UnionFind:
+                def __init__(self, n):
+                    self.parent = list(range(n))
+                    self.rank = [0] * n
+
+                def find(self, x):
+                    if self.parent[x] != x:
+                        self.parent[x] = self.find(self.parent[x])  # Path compression
+                    return self.parent[x]
+
+                def union(self, x, y):
+                    px, py = self.find(x), self.find(y)
+                    if px == py:
+                        return False  # Already in same component
+                    # Union by rank for balanced trees
+                    if self.rank[px] < self.rank[py]:
+                        px, py = py, px
+                    self.parent[py] = px
+                    if self.rank[px] == self.rank[py]:
+                        self.rank[px] += 1
+                    return True  # Union was performed
+
+            n_features = len(chunk_consensus_list)
+            uf = UnionFind(n_features)
+            merges_made = 0
+
+            # Optimized cross-chunk feature matching using KD-tree spatial indexing
+
+            # Proper dimensional scaling for RT vs m/z
+            rt_scale = 1.0  # RT in seconds (1-30 min range)
+            mz_scale = 100.0  # m/z in Da (100-1000 range) - scale to match RT magnitude
+
+            # Build spatial index with scaled coordinates
+            points = np.array([[f['rt'] * rt_scale, f['mz'] * mz_scale] for f in chunk_consensus_list])
+            tree = cKDTree(points, balanced_tree=True, compact_nodes=True)
+
+            # Calculate proper Euclidean radius in scaled space
+            scaled_rt_tol = self.rt_tol * rt_scale
+            scaled_mz_tol = self.mz_tol * mz_scale
+            radius = np.sqrt(scaled_rt_tol**2 + scaled_mz_tol**2)
+
+            # Efficient neighbor search for feature matching
+            for i in range(n_features):
+                feature_i = chunk_consensus_list[i]
+                chunk_i = feature_i.get('chunk_idx', -1)
+
+                # Query spatial index for nearby features
+                neighbor_indices = tree.query_ball_point(points[i], r=radius, p=2)
+
+                for j in neighbor_indices:
+                    if i >= j:  # Skip duplicates and self
                        continue
[... 43 removed lines (old 1869-1911) are not rendered in this diff view ...]
+
+                    feature_j = chunk_consensus_list[j]
+                    chunk_j = feature_j.get('chunk_idx', -1)
+
+                    # Skip features from same chunk (already clustered within chunk)
+                    if chunk_i == chunk_j:
+                        continue
+
+                    # Verify with precise original tolerances (more accurate than scaled)
+                    rt_diff = abs(feature_i['rt'] - feature_j['rt'])
+                    mz_diff = abs(feature_i['mz'] - feature_j['mz'])
+
+                    if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
+                        if uf.union(i, j):  # Merge if not already connected
+                            merges_made += 1
+
+            study.logger.debug(f"FIXED HierarchicalAnchorMerger: made {merges_made} cross-chunk merges")
+
+            # Group features by their connected component
+            clusters = {}
+            for i in range(n_features):
+                root = uf.find(i)
+                if root not in clusters:
+                    clusters[root] = []
+                clusters[root].append(chunk_consensus_list[i])
+
+            # Merge each cluster into a single consensus feature
+            result = []
+            for cluster_features in clusters.values():
+                merged = self._merge_cluster(cluster_features)
+                result.append(merged)
+
+            study.logger.debug(f"FIXED HierarchicalAnchorMerger: output {len(result)} merged features (from {n_features} inputs)")
+
+            # VERIFICATION: Ensure we haven't lost features
+            if len(result) > len(chunk_consensus_list):
+                study.logger.warning(f"FIXED HierarchicalAnchorMerger: More outputs than inputs ({len(result)} > {n_features})")
+
+            return result
+
+        def _merge_cluster(self, cluster: list) -> dict:
+            """Merge cluster using sample-weighted consensus with robust error handling"""
+            if len(cluster) == 1:
+                return cluster[0]  # No merging needed for single feature
+
+            # Calculate weights robustly to prevent division by zero
+            weights = []
+            for c in cluster:
+                sample_count = c.get('sample_count', 0)
+                # Use minimum weight of 1 to prevent zero weights
+                weights.append(max(sample_count, 1))
+
+            total_weight = sum(weights)
+            # Fallback for edge cases
+            if total_weight == 0:
+                total_weight = len(cluster)
+                weights = [1] * len(cluster)
+
+            # Weighted consensus for RT/mz coordinates
+            merged = {
+                'consensus_id': cluster[0]['consensus_id'],  # Use first feature's ID
+                'chunk_indices': [c.get('chunk_idx', 0) for c in cluster],
+                'mz': sum(c['mz'] * w for c, w in zip(cluster, weights)) / total_weight,
+                'rt': sum(c['rt'] * w for c, w in zip(cluster, weights)) / total_weight,
+                'intensity': sum(c.get('intensity', 0) for c in cluster),
+                'quality': sum(c.get('quality', 1) * w for c, w in zip(cluster, weights)) / total_weight,
+                'feature_uids': [],
+                'feature_data_list': [],
+                'sample_uids': [],
+                'sample_count': 0
+            }
+
+            # Aggregate all features and samples from all chunks
+            all_feature_uids = []
+            all_feature_data = []
+            all_sample_uids = []
+
+            for chunk in cluster:
+                # Collect feature UIDs
+                chunk_feature_uids = chunk.get('feature_uids', [])
+                all_feature_uids.extend(chunk_feature_uids)
+
+                # Collect feature data
+                chunk_feature_data = chunk.get('feature_data_list', [])
+                all_feature_data.extend(chunk_feature_data)
+
+                # Collect sample UIDs
+                chunk_sample_uids = chunk.get('sample_uids', [])
+                all_sample_uids.extend(chunk_sample_uids)
+
+            # Remove duplicates properly and count unique samples
+            merged['feature_uids'] = list(set(all_feature_uids))
+            merged['feature_data_list'] = all_feature_data  # Keep all feature data
+            merged['sample_uids'] = list(set(all_sample_uids))  # Unique sample UIDs only
+            merged['sample_count'] = len(merged['sample_uids'])  # Count of unique samples
+
+            return merged
+
+    class KDTreeSpatialMerger:
        """
[... 45 removed lines (old 1913-1957) are not rendered in this diff view ...]
+        KD-Tree Spatial Merger: Optimized for high-sample features.
+        """
+        def __init__(self, rt_tol: float, mz_tol: float):
+            self.rt_tol = rt_tol
+            self.mz_tol = mz_tol
+
+        def merge(self, chunk_consensus_list: list) -> list:
+            """KD-tree based spatial merging"""
+            if not chunk_consensus_list:
+                return []
+
+            try:
+                from scipy.spatial import cKDTree
+                import numpy as np
+            except ImportError:
+                # Fallback to simple clustering if scipy not available
+                return self._fallback_merge(chunk_consensus_list)
+
+            # Build spatial index
+            points = np.array([[c['rt'], c['mz']] for c in chunk_consensus_list])
+            tree = cKDTree(points)
+
+            # Scale tolerances for KD-tree query
+            rt_scale = 1.0 / self.rt_tol if self.rt_tol > 0 else 1.0
+            mz_scale = 1.0 / self.mz_tol if self.mz_tol > 0 else 1.0
+            scaled_points = points * np.array([rt_scale, mz_scale])
+            scaled_tree = cKDTree(scaled_points)
+
+            clusters = []
+            used = set()
+
+            # Priority processing for high-sample features
+            high_sample_indices = [i for i, c in enumerate(chunk_consensus_list) if c['sample_count'] >= 100]
+            remaining_indices = [i for i in range(len(chunk_consensus_list)) if i not in high_sample_indices]
+
+            for idx in high_sample_indices + remaining_indices:
+                if idx in used:
+                    continue
+
+                # Find neighbors in scaled space
+                neighbors = scaled_tree.query_ball_point(scaled_points[idx], r=1.0)
+                cluster_indices = [i for i in neighbors if i not in used and i != idx]
+                cluster_indices.append(idx)
+
+                if cluster_indices:
+                    cluster = [chunk_consensus_list[i] for i in cluster_indices]
+                    clusters.append(self._merge_cluster(cluster))
+                    used.update(cluster_indices)
+
+            return clusters
+
+        def _fallback_merge(self, chunk_consensus_list: list) -> list:
+            """Simple distance-based fallback when scipy unavailable"""
+            clusters = []
+            used = set()
+
+            for i, anchor in enumerate(chunk_consensus_list):
+                if i in used:
+                    continue
+
+                cluster = [anchor]
+                used.add(i)
+
+                for j, candidate in enumerate(chunk_consensus_list):
+                    if j in used or j == i:
                        continue
[... 19 removed lines (old 1959-1977) are not rendered in this diff view ...]
+
+                    rt_diff = abs(candidate['rt'] - anchor['rt'])
+                    mz_diff = abs(candidate['mz'] - anchor['mz'])
+
+                    if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
+                        cluster.append(candidate)
+                        used.add(j)
+
+                clusters.append(self._merge_cluster(cluster))
+
+            return clusters
+
+        def _merge_cluster(self, cluster: list) -> dict:
+            """Merge cluster with intensity-weighted consensus"""
+            if len(cluster) == 1:
+                return cluster[0]
+
+            # Weight by intensity for spatial accuracy
+            total_intensity = sum(c['intensity'] for c in cluster)
+
+            merged = {
+                'consensus_id': cluster[0]['consensus_id'],
+                'chunk_indices': [c['chunk_idx'] for c in cluster],
+                'mz': sum(c['mz'] * c['intensity'] for c in cluster) / total_intensity,
+                'rt': sum(c['rt'] * c['intensity'] for c in cluster) / total_intensity,
+                'intensity': total_intensity,
+                'quality': sum(c['quality'] for c in cluster) / len(cluster),
+                'feature_uids': [],
+                'feature_data_list': [],
+                'sample_uids': [],
+                'sample_count': 0
+            }
+
+            # Aggregate features
+            for chunk in cluster:
+                merged['feature_uids'].extend(chunk['feature_uids'])
+                merged['feature_data_list'].extend(chunk['feature_data_list'])
+                merged['sample_uids'].extend(chunk['sample_uids'])
+
+            merged['feature_uids'] = list(set(merged['feature_uids']))
+            merged['sample_count'] = len(set(merged['sample_uids']))
+
+            return merged
+
+    # SELECT DECHUNKING ALGORITHM BASED ON PARAMETER
+    if params.dechunking == "hierarchical":
+        merger = HierarchicalAnchorMerger(params.rt_tol, params.mz_tol)
+        final_consensus = merger.merge(all_chunk_consensus)
+    elif params.dechunking == "kdtree":
+        merger = KDTreeSpatialMerger(params.rt_tol, params.mz_tol)
+        final_consensus = merger.merge(all_chunk_consensus)
+    else:
+        raise ValueError(f"Invalid dechunking method '{params.dechunking}'. Must be one of: ['hierarchical', 'kdtree']")
+
+    # --- Stage 1: Cross-chunk clustering using selected dechunking algorithm ---
+    # New algorithms return final consensus features, no further refinement needed
+    # Convert each merged consensus feature to a "group" of one feature for compatibility
+    refined_groups = [[feature] for feature in final_consensus]
    consensus_metadata = []
    consensus_mapping_list = []
    consensus_uid_counter = 0
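For orientation, the HierarchicalAnchorMerger added above relies on union-find to make RT/m/z matches transitive across chunks: if feature A matches B and B matches C, all three land in one cluster even when A and C never matched directly. The sketch below is illustrative only (not masster code; tolerances and feature tuples are invented) and demonstrates that behaviour in isolation:

# Illustrative sketch: transitive RT/m-z clustering with union-find.
rt_tol, mz_tol = 0.5, 0.01  # example tolerances (seconds, Da)
feats = [  # (rt, mz, chunk_idx)
    (100.0, 300.100, 0),
    (100.4, 300.105, 1),   # matches feats[0]
    (100.8, 300.109, 2),   # matches feats[1] but not feats[0]
    (250.0, 500.200, 0),
]

parent = list(range(len(feats)))

def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

def union(a, b):
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[rb] = ra

for i in range(len(feats)):
    for j in range(i + 1, len(feats)):
        if feats[i][2] == feats[j][2]:
            continue  # same chunk: already clustered within the chunk
        if abs(feats[i][0] - feats[j][0]) <= rt_tol and abs(feats[i][1] - feats[j][1]) <= mz_tol:
            union(i, j)

clusters = {}
for i in range(len(feats)):
    clusters.setdefault(find(i), []).append(i)
print(sorted(clusters.values()))  # [[0, 1, 2], [3]]

The KD-tree in the real class only accelerates the neighbour search; the clustering decision is still the per-axis tolerance check shown here.
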
@@ -2011,7 +1555,6 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul

        number_samples = len(sample_uids_acc)

-        # NOTE: Don't filter by min_samples here - let _finalize_merge handle it
        # This allows proper cross-chunk consensus building before final filtering

        metadata = _calculate_consensus_statistics(
@@ -2028,13 +1571,29 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
            cached_valid_adducts=cached_valid_adducts,
        )

-        # Validate RT spread
+        # Validate RT and m/z spread don't exceed tolerance limits
        rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
-
+        mz_spread = metadata.get('mz_max', 0) - metadata.get('mz_min', 0)
+        max_allowed_rt_spread = params.rt_tol * 2  # Allow 2x tolerance for chunked method
+        max_allowed_mz_spread = params.mz_tol * 2  # Enforce strict m/z spread limit
+
+        skip_feature = False
+        skip_reason = ""

-        if rt_spread >
-
-
+        if rt_spread > max_allowed_rt_spread:
+            skip_feature = True
+            skip_reason = f"RT spread {rt_spread:.3f}s > {max_allowed_rt_spread:.3f}s"
+
+        if mz_spread > max_allowed_mz_spread:
+            skip_feature = True
+            if skip_reason:
+                skip_reason += f" AND m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
+            else:
+                skip_reason = f"m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
+
+        if skip_feature:
+            # Skip consensus features with excessive spread
+            study.logger.debug(f"Skipping consensus feature {consensus_uid_counter}: {skip_reason}")
            consensus_uid_counter += 1
            continue

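The spread gate added above rejects a merged consensus feature when its members span more than twice the configured tolerance in RT or m/z. A small hypothetical sketch of that check (the helper name and all numbers are invented for illustration, not taken from masster):

# Illustrative sketch: reject consensus features whose member spread is too wide.
def exceeds_spread(rt_values, mz_values, rt_tol, mz_tol, factor=2.0):
    rt_spread = max(rt_values) - min(rt_values)
    mz_spread = max(mz_values) - min(mz_values)
    return rt_spread > factor * rt_tol or mz_spread > factor * mz_tol

# rt_tol = 0.5 s and mz_tol = 0.01 Da allow spreads of 1.0 s and 0.02 Da
print(exceeds_spread([100.0, 100.6], [300.100, 300.104], rt_tol=0.5, mz_tol=0.01))  # False
print(exceeds_spread([100.0, 101.5], [300.100, 300.104], rt_tol=0.5, mz_tol=0.01))  # True (RT spread 1.5 s)
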
@@ -2043,6 +1602,7 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
        # Build mapping rows (deduplicated)
        for fid, fd in feature_data_acc.items():
            samp_uid = fd.get('sample_uid') or fd.get('sample_id') or fd.get('sample')
+
            # If absent we attempt to derive from original group sample_uids pairing
            # but most feature_data rows should include sample_uid already.
            if samp_uid is None:
@@ -2238,80 +1798,6 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
    }


-def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) -> list:
-    """
-    Cluster consensus features from different chunks based on RT and m/z similarity.
-
-    Args:
-        features: List of feature dictionaries with 'mz', 'rt', 'id' keys
-        rt_tol: RT tolerance in seconds
-        mz_tol: m/z tolerance in Da
-
-    Returns:
-        List of groups, where each group is a list of feature dictionaries
-    """
-    if not features:
-        return []
-
-    # Use Union-Find for efficient clustering
-    class UnionFind:
-        def __init__(study, n):
-            study.parent = list(range(n))
-            study.rank = [0] * n
-
-        def find(study, x):
-            if study.parent[x] != x:
-                study.parent[x] = study.find(study.parent[x])
-            return study.parent[x]
-
-        def union(study, x, y):
-            px, py = study.find(x), study.find(y)
-            if px == py:
-                return
-            if study.rank[px] < study.rank[py]:
-                px, py = py, px
-            study.parent[py] = px
-            if study.rank[px] == study.rank[py]:
-                study.rank[px] += 1
-
-    n_features = len(features)
-    uf = UnionFind(n_features)
-
-    # Build distance matrix and cluster features within tolerance
-    for i in range(n_features):
-        for j in range(i + 1, n_features):
-            feat_i = features[i]
-            feat_j = features[j]
-
-            # Skip if features are from the same chunk (they're already processed)
-            if feat_i['chunk_idx'] == feat_j['chunk_idx']:
-                continue
-
-            mz_diff = abs(feat_i['mz'] - feat_j['mz'])
-            rt_diff = abs(feat_i['rt'] - feat_j['rt'])
-
-            # Cluster if within tolerance
-            if mz_diff <= mz_tol and rt_diff <= rt_tol:
-                uf.union(i, j)
-
-    # Extract groups
-    groups_by_root = {}
-    for i in range(n_features):
-        root = uf.find(i)
-        if root not in groups_by_root:
-            groups_by_root[root] = []
-        groups_by_root[root].append(features[i])
-
-    return list(groups_by_root.values())
-
-
-def _reset_consensus_data(study):
-    """Reset consensus-related DataFrames at the start of merge."""
-    study.consensus_df = pl.DataFrame()
-    study.consensus_ms2 = pl.DataFrame()
-    study.consensus_mapping_df = pl.DataFrame()
-
-
def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
    """Extract consensus features and build metadata."""
    # create a dict to map uid to feature_uid using study.features_df
@@ -2324,7 +1810,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
    study.logger.debug(f"Found {imax} feature groups by clustering.")

    # Pre-build fast lookup tables for features_df data using optimized approach
-    features_lookup = _optimized_feature_lookup(study, study.features_df)
+    features_lookup = __merge_feature_lookup(study, study.features_df)

    # create a list to store the consensus mapping
    consensus_mapping = []
@@ -2752,7 +2238,11 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
        pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
    )

-
+    # Log final counts
+    study.logger.info(
+        f"Extracted {len(study.consensus_df)} consensus features with "
+        f"at least {min_samples} samples."
+    )


def _perform_adduct_grouping(study, rt_tol, mz_tol):
@@ -2775,7 +2265,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
    )

    # Use optimized adduct grouping
-    adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
+    adduct_group_list, adduct_of_list = __merge_adduct_grouping(
        study, consensus_data, rt_tol, mz_tol
    )

@@ -2802,70 +2292,186 @@ def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> i
    if len(study.consensus_df) < 2:
        return 0

-    # Extract consensus feature
-
-
-
-
-
-            'rt': row['rt']
-        })
-
-    # Build spatial index using bins
-    rt_bin_size = rt_tol / 2
-    mz_bin_size = mz_tol / 2
-
-    bins = defaultdict(list)
-    for feature in consensus_data:
-        rt_bin = int(feature['rt'] / rt_bin_size)
-        mz_bin = int(feature['mz'] / mz_bin_size)
-        bins[(rt_bin, mz_bin)].append(feature)
+    # Extract consensus feature coordinates efficiently
+    feature_coords = study.consensus_df.select([
+        pl.col("consensus_uid"),
+        pl.col("mz"),
+        pl.col("rt")
+    ]).to_numpy()

-
+    n_features = len(feature_coords)
+    processed = [False] * n_features
    tight_clusters_count = 0

-
-
+    # Use vectorized distance calculations for efficiency
+    for i in range(n_features):
+        if processed[i]:
            continue

-        #
-
-
-
-        # Check 8 neighboring bins
-        for drt in [-1, 0, 1]:
-            for dmz in [-1, 0, 1]:
-                if drt == 0 and dmz == 0:
-                    continue
-                neighbor_key = (rt_bin + drt, mz_bin + dmz)
-                if neighbor_key in bins:
-                    all_nearby_features.extend(bins[neighbor_key])
+        # Find all features within tolerance of feature i
+        cluster_members = [i]
+        rt_i, mz_i = feature_coords[i][2], feature_coords[i][1]

-
-
-        for feature in all_nearby_features:
-            if feature['consensus_uid'] in processed_features:
+        for j in range(i + 1, n_features):
+            if processed[j]:
                continue

-
-
-
-
-
-            if rt_diff <= rt_tol and mz_diff <= mz_tol:
-                valid_cluster_features.append(feature)
-                break
+            rt_j, mz_j = feature_coords[j][2], feature_coords[j][1]
+
+            if abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol:
+                cluster_members.append(j)

-        #
-        if len(
+        # Mark cluster as tight if it has 2+ members
+        if len(cluster_members) >= 2:
            tight_clusters_count += 1
-            for
-
+            for idx in cluster_members:
+                processed[idx] = True

    return tight_clusters_count


-def _consensus_cleanup(study, rt_tol, mz_tol):
+def _merge_partial_consensus_features(study, rt_tol, mz_tol):
+    """
+    Merge partial consensus features that likely represent the same compound but were
+    split across chunks. This is specifically for chunked methods.
+    """
+    if len(study.consensus_df) == 0:
+        return
+
+    initial_count = len(study.consensus_df)
+    study.logger.debug(f"Post-processing chunked results: merging partial consensus features from {initial_count} features")
+
+    # Convert to list of dictionaries for easier processing
+    consensus_features = []
+    for row in study.consensus_df.iter_rows(named=True):
+        consensus_features.append({
+            'consensus_uid': row['consensus_uid'],
+            'rt': row['rt'],
+            'mz': row['mz'],
+            'number_samples': row.get('number_samples', 0),
+            'inty_mean': row.get('inty_mean', 0.0)
+        })
+
+    # Use Union-Find to group features that should be merged
+    class UnionFind:
+        def __init__(self, n):
+            self.parent = list(range(n))
+
+        def find(self, x):
+            if self.parent[x] != x:
+                self.parent[x] = self.find(self.parent[x])
+            return self.parent[x]
+
+        def union(self, x, y):
+            px, py = self.find(x), self.find(y)
+            if px != py:
+                self.parent[py] = px
+
+    n_features = len(consensus_features)
+    uf = UnionFind(n_features)
+
+    # Find features that should be merged using original tolerances
+    for i in range(n_features):
+        for j in range(i + 1, n_features):
+            feature_a = consensus_features[i]
+            feature_b = consensus_features[j]
+
+            rt_diff = abs(feature_a['rt'] - feature_b['rt'])
+            mz_diff = abs(feature_a['mz'] - feature_b['mz'])
+
+            # Merge if within tolerance
+            if rt_diff <= rt_tol and mz_diff <= mz_tol:
+                uf.union(i, j)
+
+    # Group features by their root
+    groups = {}
+    for i, feature in enumerate(consensus_features):
+        root = uf.find(i)
+        if root not in groups:
+            groups[root] = []
+        groups[root].append(consensus_features[i])
+
+    # Create merged features
+    merged_features = []
+    merged_mapping_data = []
+    uids_to_remove = set()
+
+    for group in groups.values():
+        if len(group) < 2:
+            # Single feature, keep as is
+            continue
+        else:
+            # Multiple features, merge them
+            # Find best representative feature (highest sample count, then intensity)
+            best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
+
+            # Calculate merged properties
+            total_samples = sum(f['number_samples'] for f in group)
+            weighted_rt = sum(f['rt'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['rt']
+            weighted_mz = sum(f['mz'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['mz']
+            mean_intensity = sum(f['inty_mean'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['inty_mean']
+
+            # Keep the best feature's UID but update its properties
+            merged_features.append({
+                'consensus_uid': best_feature['consensus_uid'],
+                'rt': weighted_rt,
+                'mz': weighted_mz,
+                'number_samples': total_samples,
+                'inty_mean': mean_intensity
+            })
+
+            # Mark other features for removal
+            for f in group:
+                if f['consensus_uid'] != best_feature['consensus_uid']:
+                    uids_to_remove.add(f['consensus_uid'])
+
+    if merged_features:
+        study.logger.debug(f"Merging {len(merged_features)} groups of partial consensus features")
+
+        # Update consensus_df with merged features
+        for merged_feature in merged_features:
+            study.consensus_df = study.consensus_df.with_columns([
+                pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
+                .then(pl.lit(merged_feature['rt']))
+                .otherwise(pl.col('rt'))
+                .alias('rt'),
+
+                pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
+                .then(pl.lit(merged_feature['mz']))
+                .otherwise(pl.col('mz'))
+                .alias('mz'),
+
+                pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
+                .then(pl.lit(merged_feature['number_samples']))
+                .otherwise(pl.col('number_samples'))
+                .alias('number_samples'),
+
+                pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
+                .then(pl.lit(merged_feature['inty_mean']))
+                .otherwise(pl.col('inty_mean'))
+                .alias('inty_mean')
+            ])
+
+        # Remove duplicate features
+        if uids_to_remove:
+            study.consensus_df = study.consensus_df.filter(
+                ~pl.col('consensus_uid').is_in(list(uids_to_remove))
+            )
+
+            # Also update consensus_mapping_df - reassign mappings from removed UIDs
+            if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
+                study.consensus_mapping_df = study.consensus_mapping_df.with_columns(
+                    pl.when(pl.col('consensus_uid').is_in(list(uids_to_remove)))
+                    .then(pl.lit(None))  # Will be handled by subsequent operations
+                    .otherwise(pl.col('consensus_uid'))
+                    .alias('consensus_uid')
+                )
+
+    final_count = len(study.consensus_df)
+    study.logger.debug(f"Partial consensus merging: {initial_count} → {final_count} features")
+
+
+def __consensus_cleanup(study, rt_tol, mz_tol):
    """
    Consensus cleanup to merge over-segmented consensus features and remove isotopic features.

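The new _merge_partial_consensus_features keeps the member with the most samples as the surviving row and replaces its RT, m/z and mean intensity with sample-count-weighted averages. The following standalone sketch reproduces only that arithmetic; it is illustrative, with invented values, and is not code from the package:

# Illustrative sketch: sample-count-weighted merge of two split consensus rows.
group = [
    {"consensus_uid": 7, "rt": 120.0, "mz": 301.1430, "number_samples": 30, "inty_mean": 1.0e5},
    {"consensus_uid": 9, "rt": 120.6, "mz": 301.1436, "number_samples": 10, "inty_mean": 4.0e4},
]

best = max(group, key=lambda f: (f["number_samples"], f["inty_mean"]))
total = sum(f["number_samples"] for f in group)

merged = {
    "consensus_uid": best["consensus_uid"],  # the representative row keeps its UID
    "rt": sum(f["rt"] * f["number_samples"] for f in group) / total,
    "mz": sum(f["mz"] * f["number_samples"] for f in group) / total,
    "number_samples": total,
    "inty_mean": sum(f["inty_mean"] * f["number_samples"] for f in group) / total,
}
print(merged["consensus_uid"], round(merged["rt"], 3), merged["number_samples"])
# 7 120.15 40
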
@@ -3091,7 +2697,7 @@ def _consensus_cleanup(study, rt_tol, mz_tol):
    study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")


-def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
+def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
    """
    Identify coeluting consensus features by characteristic mass shifts between adducts
    and update their adduct information accordingly.
@@ -3378,7 +2984,7 @@ def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
    study.logger.debug("No consensus features updated based on mass shift analysis")


-def _finalize_merge(study, link_ms2, min_samples):
+def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
    """Complete the merge process with final calculations and cleanup."""
    import polars as pl

@@ -3438,12 +3044,13 @@ def _finalize_merge(study, link_ms2, min_samples):
    )

    # add iso data from raw files.
-    study.find_iso()
    if link_ms2:
        study.find_ms2()
+    if extract_ms1:
+        study.find_iso()


-def _optimized_feature_lookup(study_obj, features_df):
+def __merge_feature_lookup(study_obj, features_df):
    """
    Optimized feature lookup creation using Polars operations.
    """
@@ -3452,7 +3059,7 @@ def _optimized_feature_lookup(study_obj, features_df):

    # Use Polars select for faster conversion
    feature_columns = [
-        "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
+        "feature_uid", "sample_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
        "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
        "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
        "ms2_scans", "adduct", "adduct_mass"
@@ -3476,12 +3083,12 @@ def _optimized_feature_lookup(study_obj, features_df):
    return features_lookup


-def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
+def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
    """
    Optimized O(n log n) adduct grouping using spatial indexing.

    Args:
-
+        study: Study object with logger
        consensus_data: List of consensus feature dictionaries
        rt_tol: RT tolerance in minutes
        mz_tol: m/z tolerance in Da
@@ -3494,9 +3101,9 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):

    n_features = len(consensus_data)
    if n_features > 10000:
-
+        study.logger.info(f"Adduct grouping for {n_features} consensus features...")
    else:
-
+        study.logger.debug(f"Adduct grouping for {n_features} consensus features...")

    # Build spatial index using RT and neutral mass as coordinates
    features_by_mass = defaultdict(list)
@@ -3567,14 +3174,14 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
    groups_by_root = defaultdict(list)
    for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
        root = uf.find(i)
-        groups_by_root[root].append(
+        groups_by_root[root].append(valid_features[i])

    groups = {}
    group_id = 1
    assigned_groups = {}

    for group_members in groups_by_root.values():
-        member_uids = [uid for uid, _, _, _, _ in group_members]
+        member_uids = [uid for uid, _, _, _, _, _ in group_members]

        for uid in member_uids:
            assigned_groups[uid] = group_id
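The two changes above store the full valid_features tuples in groups_by_root and add an extra placeholder when unpacking, so the uid is still read correctly from 6-element tuples. A minimal sketch of why the unpacking arity must match the stored tuples (all data invented for illustration, not taken from masster):

# Illustrative sketch: unpacking must match the tuple arity stored per feature.
valid_features = [
    # (uid, rt, neutral_mass, intensity, adduct, extra)
    (1, 120.1, 300.1002, 5.0e5, "[M+H]+", None),
    (2, 120.2, 300.1005, 2.0e5, "[M+Na]+", None),
]

groups_by_root = {0: [valid_features[0], valid_features[1]]}

for group_members in groups_by_root.values():
    # one underscore per unused field; 6-tuples need 5 placeholders after uid
    member_uids = [uid for uid, _, _, _, _, _ in group_members]
    print(member_uids)  # [1, 2]
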
@@ -3632,8 +3239,8 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
        adduct_of_list.append(adduct_of)

    if n_features > 10000:
-
+        study.logger.info("Adduct grouping completed.")
    else:
-
+        study.logger.debug("Adduct grouping completed.")

    return adduct_group_list, adduct_of_list