masster 0.4.14__py3-none-any.whl → 0.4.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

masster/study/merge.py ADDED
@@ -0,0 +1,2145 @@
1
+ """
2
+ Unified merge module for the Study class.
3
+ Supports multiple merge methods: 'quality', 'sensitivity', 'qt', 'nowarp', 'chunked' (legacy aliases 'kd', 'kd-nowarp', 'kd-strict' are remapped)
4
+ """
5
+
6
+ import time
7
+ import numpy as np
8
+ from collections import defaultdict
9
+ from datetime import datetime
10
+ from tqdm import tqdm
11
+ import pyopenms as oms
12
+ import polars as pl
13
+ from masster.study.defaults import merge_defaults
14
+
15
+
16
+ def merge(self, **kwargs) -> None:
17
+ """
18
+ Group features across samples into consensus features using various algorithms.
19
+
20
+ This function provides a unified interface to multiple feature grouping algorithms,
21
+ each optimized for different dataset sizes and analysis requirements.
22
+
23
+ Parameters
24
+ ----------
25
+ **kwargs : dict
26
+ Parameters from merge_defaults class:
27
+ - method : str, default 'quality'
28
+ Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'chunked', 'quality'
29
+ - min_samples : int, default 10
30
+ Minimum number of samples for consensus feature
31
+ - rt_tol : float, default 2.0
32
+ RT tolerance in seconds
33
+ - mz_tol : float, default 0.01
34
+ m/z tolerance in Da (Daltons) for all methods
35
+ - chunk_size : int, default 500
36
+ Chunk size for 'chunked' method
37
+ - nr_partitions : int, default 500
38
+ Number of partitions in m/z dimension for KD algorithms
39
+ - min_rel_cc_size : float, default 0.3
40
+ Minimum relative connected component size for conflict resolution
41
+ - max_pairwise_log_fc : float, default 0.5
42
+ Maximum pairwise log fold change for conflict resolution
43
+ - max_nr_conflicts : int, default 0
44
+ Maximum number of conflicts allowed in consensus feature
45
+ - link_ms2 : bool, default True
46
+ Whether to link MS2 spectra to consensus features
47
+
48
+ Algorithm Guidelines
49
+ -------------------
50
+ - Quality: KD with post-processing quality control to reduce oversegmentation (RECOMMENDED DEFAULT)
51
+ Includes RT tolerance optimization, secondary clustering, and quality filtering
52
+ - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
53
+ - QT: Thorough but slow O(n²), good for <1000 samples
54
+ - NoWarp: Memory efficient KD without RT warping for large datasets
55
+ - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
56
+ Uses optimized partitioning for better memory management while maintaining
57
+ full cross-sample consensus feature detection.
58
+ """
59
+ start_time = time.time()
60
+
61
+ # Initialize with defaults and override with kwargs
62
+ params = merge_defaults()
63
+
64
+ # Filter and apply only valid parameters
65
+ valid_params = set(params.list_parameters())
66
+ for key, value in kwargs.items():
67
+ if key in valid_params:
68
+ setattr(params, key, value)
69
+ else:
70
+ self.logger.warning(f"Unknown parameter '{key}' ignored")
71
+
72
+ # Backward compatibility: Map old method names to new names
73
+ method_mapping = {
74
+ 'kd': 'sensitivity',
75
+ 'kd-nowarp': 'nowarp',
76
+ 'kd_nowarp': 'nowarp',
77
+ 'kd-strict': 'quality',
78
+ 'kd_strict': 'quality',
79
+ 'kdstrict': 'quality'
80
+ }
81
+
82
+ if params.method in method_mapping:
83
+ old_method = params.method
84
+ params.method = method_mapping[old_method]
85
+ self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
86
+
87
+ # Validate method
88
+ if params.method not in ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']:
89
+ raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'chunked', 'quality']")
90
+
91
+ # Persist last used params for diagnostics
92
+ try:
93
+ self._merge_params_last = params.to_dict()
94
+ except Exception:
95
+ self._merge_params_last = {}
96
+
97
+ # Store merge parameters in history
98
+ try:
99
+ if hasattr(self, 'store_history'):
100
+ self.store_history(['merge'], params.to_dict())
101
+ else:
102
+ self.logger.warning("History storage not available - parameters not saved to history")
103
+ except Exception as e:
104
+ self.logger.warning(f"Failed to store merge parameters in history: {e}")
105
+
106
+ # Ensure feature maps are available for merging (regenerate if needed)
107
+ if len(self.features_maps) < len(self.samples_df):
108
+ self.features_maps = []
109
+ self.load_features()
110
+
111
+ self.logger.info(
112
+ f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da, min_rel_cc_size={params.min_rel_cc_size}, max_pairwise_log_fc={params.max_pairwise_log_fc}, max_nr_conflicts={params.max_nr_conflicts}"
113
+ )
114
+
115
+ # Initialize
116
+ self._reset_consensus_data()
117
+
118
+ # Cache adducts for performance (avoid repeated _get_adducts() calls)
119
+ cached_adducts_df = None
120
+ cached_valid_adducts = None
121
+ try:
122
+ cached_adducts_df = self._get_adducts()
123
+ if not cached_adducts_df.is_empty():
124
+ cached_valid_adducts = set(cached_adducts_df["name"].to_list())
125
+ else:
126
+ cached_valid_adducts = set()
127
+ except Exception as e:
128
+ self.logger.warning(f"Could not retrieve study adducts: {e}")
129
+ cached_valid_adducts = set()
130
+
131
+ # Always allow '?' adducts
132
+ cached_valid_adducts.add("?")
133
+
134
+ # Route to algorithm implementation
135
+ if params.method == 'sensitivity':
136
+ consensus_map = _merge_kd(self, params)
137
+ # Extract consensus features
138
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
139
+ elif params.method == 'qt':
140
+ consensus_map = _merge_qt(self, params)
141
+ # Extract consensus features
142
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
143
+ elif params.method == 'nowarp':
144
+ consensus_map = _merge_kd_nowarp(self, params)
145
+ # Extract consensus features
146
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
147
+ elif params.method == 'quality':
148
+ consensus_map = _merge_kd_strict(self, params)
149
+ # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
150
+ elif params.method == 'chunked':
151
+ consensus_map = _merge_chunked(self, params, cached_adducts_df, cached_valid_adducts)
152
+ # Note: _merge_chunked populates consensus_df directly, no need to extract
153
+
154
+ # Perform adduct grouping
155
+ self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
156
+
157
+ # Link MS2 if requested
158
+ if params.link_ms2:
159
+ self._finalize_merge(params.link_ms2, params.min_samples)
160
+
161
+ # Log completion without the misleading feature count
162
+ elapsed = time.time() - start_time
163
+ self.logger.debug(f"Merge process completed in {elapsed:.1f}s")
164
+
165
+
166
+ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
167
+ """KD-tree based merge (fast, recommended)"""
168
+
169
+ consensus_map = oms.ConsensusMap()
170
+ file_descriptions = consensus_map.getColumnHeaders()
171
+
172
+ for i, feature_map in enumerate(self.features_maps):
173
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
174
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
175
+ file_description.size = feature_map.size()
176
+ file_description.unique_id = feature_map.getUniqueId()
177
+ file_descriptions[i] = file_description
178
+
179
+ consensus_map.setColumnHeaders(file_descriptions)
180
+
181
+ # Configure KD algorithm
182
+ grouper = oms.FeatureGroupingAlgorithmKD()
183
+ params_oms = grouper.getParameters()
184
+
185
+ params_oms.setValue("mz_unit", "Da")
186
+ params_oms.setValue("nr_partitions", params.nr_partitions)
187
+ params_oms.setValue("warp:enabled", "true")
188
+ params_oms.setValue("warp:rt_tol", params.rt_tol)
189
+ params_oms.setValue("warp:mz_tol", params.mz_tol)
190
+ params_oms.setValue("link:rt_tol", params.rt_tol)
191
+ params_oms.setValue("link:mz_tol", params.mz_tol)
192
+ params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
193
+ params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
194
+ params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
195
+ #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
196
+
197
+ grouper.setParameters(params_oms)
198
+ grouper.group(self.features_maps, consensus_map)
199
+
200
+ return consensus_map
201
+
202
+
203
+ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
204
+ """QT (Quality Threshold) based merge"""
205
+
206
+ n_samples = len(self.features_maps)
207
+ if n_samples > 1000:
208
+ self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
209
+
210
+ consensus_map = oms.ConsensusMap()
211
+ file_descriptions = consensus_map.getColumnHeaders()
212
+
213
+ for i, feature_map in enumerate(self.features_maps):
214
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
215
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
216
+ file_description.size = feature_map.size()
217
+ file_description.unique_id = feature_map.getUniqueId()
218
+ file_descriptions[i] = file_description
219
+
220
+ consensus_map.setColumnHeaders(file_descriptions)
221
+
222
+ # Configure QT algorithm
223
+ grouper = oms.FeatureGroupingAlgorithmQT()
224
+ params_oms = grouper.getParameters()
225
+
226
+ params_oms.setValue("distance_RT:max_difference", params.rt_tol)
227
+ params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
228
+ params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
229
+ params_oms.setValue("ignore_charge", "true")
230
+ params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
231
+ params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
232
+ params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
233
+ params_oms.setValue("nr_partitions", params.nr_partitions)
234
+
235
+ grouper.setParameters(params_oms)
236
+ grouper.group(self.features_maps, consensus_map)
237
+
238
+ return consensus_map
239
+
240
+
241
+ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
242
+ """
243
+ Quality merge: Standard KD algorithm with post-processing quality control.
244
+
245
+ This method combines the sensitivity of KD clustering with post-processing steps
246
+ to reduce oversegmentation while maintaining high-quality consensus features.
247
+ This is the recommended default method.
248
+
249
+ Post-processing features:
250
+ 1. RT tolerance optimization (optional)
251
+ 2. Secondary clustering for close features
252
+ 3. Sample overlap validation
253
+ 4. RT spread quality filtering
254
+ 5. Chromatographic coherence validation
255
+
256
+ Additional parameters supported in params:
257
+ - optimize_rt_tol: bool - Enable RT tolerance optimization
258
+ - rt_tol_range: tuple - RT tolerance range for optimization (min, max)
259
+ - secondary_merge_rt_tol: float - Secondary merge RT tolerance (default: 0.5s)
260
+ - secondary_merge_mz_tol: float - Secondary merge m/z tolerance (default: 0.005)
261
+ - min_sample_overlap: float - Minimum sample overlap for merging (0.0-1.0, default: 0.8)
262
+ - max_rt_spread: float - Maximum RT spread allowed (default: 2x rt_tol)
263
+ - min_coherence: float - Minimum chromatographic coherence (default: 0.0, disabled)
264
+ """
265
+
266
+ # Check for RT tolerance optimization
267
+ optimize_rt_tol = getattr(params, 'optimize_rt_tol', False)
268
+
269
+ if optimize_rt_tol:
270
+ # Optimize RT tolerance first
271
+ optimal_rt_tol = _optimize_rt_tolerance(self, params)
272
+ self.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
273
+ # Create modified params with optimal RT tolerance
274
+ import copy
275
+ optimized_params = copy.deepcopy(params)
276
+ optimized_params.rt_tol = optimal_rt_tol
277
+ else:
278
+ optimized_params = params
279
+
280
+ # Phase 1: Standard KD clustering
281
+ self.logger.info("Initial KD clustering")
282
+ consensus_map = _merge_kd(self, optimized_params)
283
+
284
+ # Phase 2: Post-processing quality control
285
+ self.logger.info("Post-processing quality control")
286
+ consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
287
+
288
+ return consensus_map
289
+
290
+
291
+ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
292
+ """
293
+ Optimize RT tolerance by testing different values and measuring oversegmentation.
294
+
295
+ Args:
296
+ self: Study object
297
+ params: Merge parameters
298
+
299
+ Returns:
300
+ Optimal RT tolerance value
301
+ """
302
+ rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
303
+ rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
304
+
305
+ self.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
306
+
307
+ # Generate test values
308
+ test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
309
+ for i in range(rt_tol_steps)]
310
+
311
+ best_rt_tol = params.rt_tol
312
+ best_score = float('inf')
313
+
314
+ # Store original features for restoration
315
+ original_consensus_df = getattr(self, 'consensus_df', pl.DataFrame())
316
+ original_consensus_mapping_df = getattr(self, 'consensus_mapping_df', pl.DataFrame())
317
+
318
+ for test_rt_tol in test_rt_tols:
319
+ try:
320
+ # Create test parameters
321
+ import copy
322
+ test_params = copy.deepcopy(params)
323
+ test_params.rt_tol = test_rt_tol
324
+
325
+ # Run KD merge with test parameters
326
+ test_consensus_map = _merge_kd(self, test_params)
327
+
328
+ # Extract consensus features temporarily for analysis
329
+ self._extract_consensus_features(test_consensus_map, test_params.min_samples)
330
+
331
+ if len(self.consensus_df) == 0:
332
+ continue
333
+
334
+ # Calculate oversegmentation metrics
335
+ oversegmentation_score = _calculate_oversegmentation_score(self, test_rt_tol)
336
+
337
+ self.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(self.consensus_df)} features, score: {oversegmentation_score:.3f}")
338
+
339
+ # Lower score is better (less oversegmentation)
340
+ if oversegmentation_score < best_score:
341
+ best_score = oversegmentation_score
342
+ best_rt_tol = test_rt_tol
343
+
344
+ except Exception as e:
345
+ self.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
346
+ continue
347
+
348
+ # Restore original consensus data
349
+ self.consensus_df = original_consensus_df
350
+ self.consensus_mapping_df = original_consensus_mapping_df
351
+
352
+ self.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
353
+ return best_rt_tol
354
+
355
+
356
+ def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
357
+ """
358
+ Calculate oversegmentation score based on feature density and RT spread metrics.
359
+ Lower scores indicate less oversegmentation.
360
+
361
+ Args:
362
+ self: Study object
363
+ rt_tol: RT tolerance used
364
+
365
+ Returns:
366
+ Oversegmentation score (lower = better)
367
+ """
368
+ if len(self.consensus_df) == 0:
369
+ return float('inf')
370
+
371
+ # Metric 1: Feature density (features per RT second)
372
+ rt_range = self.consensus_df['rt'].max() - self.consensus_df['rt'].min()
373
+ if rt_range <= 0:
374
+ return float('inf')
375
+
376
+ feature_density = len(self.consensus_df) / rt_range
377
+
378
+ # Metric 2: Average RT spread relative to tolerance
379
+ rt_spreads = (self.consensus_df['rt_max'] - self.consensus_df['rt_min'])
380
+ avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
381
+
382
+ # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
383
+ low_sample_features = len(self.consensus_df.filter(pl.col('number_samples') <= 5))
384
+ low_sample_ratio = low_sample_features / len(self.consensus_df)
385
+
386
+ # Metric 4: Number of features with excessive RT spread
387
+ excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
388
+ excessive_spread_ratio = excessive_spread_features / len(self.consensus_df)
389
+
390
+ # Combined score (weighted combination)
391
+ oversegmentation_score = (
392
+ 0.4 * (feature_density / 10.0) + # Normalize to reasonable scale
393
+ 0.3 * avg_rt_spread_ratio +
394
+ 0.2 * low_sample_ratio +
395
+ 0.1 * excessive_spread_ratio
396
+ )
397
+
398
+ return oversegmentation_score
399
+
400
+
401
+ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
402
+ """
403
+ Apply post-processing quality control to KD consensus map.
404
+
405
+ Args:
406
+ consensus_map: Initial consensus map from KD
407
+ params: Merge parameters with kd-strict options
408
+
409
+ Returns:
410
+ Processed consensus map with reduced oversegmentation
411
+ """
412
+ if consensus_map.size() == 0:
413
+ self.logger.warning("Empty consensus map provided to post-processing")
414
+ return consensus_map
415
+
416
+ self.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
417
+
418
+ # Step 1: Extract initial consensus features
419
+ original_min_samples = params.min_samples
420
+ params.min_samples = 1 # Extract all features initially
421
+
422
+ self._extract_consensus_features(consensus_map, params.min_samples)
423
+ initial_feature_count = len(self.consensus_df)
424
+
425
+ if initial_feature_count == 0:
426
+ self.logger.warning("No consensus features extracted for post-processing")
427
+ params.min_samples = original_min_samples
428
+ return consensus_map
429
+
430
+ # Step 2: Secondary clustering for close features
431
+ secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
432
+ secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
433
+
434
+ self.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
435
+ merged_features = _perform_secondary_clustering(self, secondary_merge_rt_tol, secondary_merge_mz_tol)
436
+
437
+ # Step 3: Sample overlap validation
438
+ min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
439
+ if min_sample_overlap > 0:
440
+ self.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
441
+ merged_features = _validate_sample_overlap(self, merged_features, min_sample_overlap)
442
+
443
+ # Step 4: RT spread quality filtering
444
+ if params.rt_tol is not None:
445
+ max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
446
+ if max_rt_spread is not None:
447
+ self.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
448
+ merged_features = _filter_rt_spread(self, merged_features, max_rt_spread)
449
+ else:
450
+ self.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
451
+ else:
452
+ self.logger.debug("Skipping RT spread filtering - rt_tol is None")
453
+
454
+ # Step 5: Chromatographic coherence filtering (optional)
455
+ min_coherence = getattr(params, 'min_coherence', 0.0)
456
+ if min_coherence > 0:
457
+ self.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
458
+ merged_features = _filter_coherence(self, merged_features, min_coherence)
459
+
460
+ # Step 6: Rebuild consensus_df with filtered features and preserve mapping
461
+ original_mapping_df = self.consensus_mapping_df.clone() # Save original mapping
462
+ self.consensus_df = pl.DataFrame(merged_features, strict=False)
463
+
464
+ # Step 7: Apply original min_samples filter
465
+ params.min_samples = original_min_samples
466
+ if params.min_samples > 1:
467
+ l1 = len(self.consensus_df)
468
+ self.consensus_df = self.consensus_df.filter(
469
+ pl.col("number_samples") >= params.min_samples
470
+ )
471
+ filtered_count = l1 - len(self.consensus_df)
472
+ if filtered_count > 0:
473
+ self.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
474
+
475
+ # Step 8: Update consensus_mapping_df to match final consensus_df
476
+ if len(self.consensus_df) > 0 and len(original_mapping_df) > 0:
477
+ valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
478
+ self.consensus_mapping_df = original_mapping_df.filter(
479
+ pl.col('consensus_uid').is_in(list(valid_consensus_ids))
480
+ )
481
+ else:
482
+ self.consensus_mapping_df = pl.DataFrame()
483
+
484
+ final_feature_count = len(self.consensus_df)
485
+ reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
486
+
487
+ self.logger.info(f"Post-processing complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
488
+
489
+ # Create a new consensus map for compatibility (the processed data is in consensus_df)
490
+ processed_consensus_map = oms.ConsensusMap()
491
+ return processed_consensus_map
492
+
493
+
494
+ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
495
+ """
496
+ Perform secondary clustering to merge very close features.
497
+
498
+ Args:
499
+ rt_tol: RT tolerance for secondary clustering
500
+ mz_tol: m/z tolerance for secondary clustering
501
+
502
+ Returns:
503
+ List of merged consensus feature dictionaries
504
+ """
505
+ if len(self.consensus_df) == 0:
506
+ return []
507
+
508
+ # Convert consensus_df to list of dictionaries for clustering
509
+ consensus_features = []
510
+ for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
511
+ consensus_features.append(dict(row))
512
+
513
+ # Use Union-Find for efficient clustering
514
+ class UnionFind:
515
+ def __init__(self, n):
516
+ self.parent = list(range(n))
517
+ self.rank = [0] * n
518
+
519
+ def find(self, x):
520
+ if self.parent[x] != x:
521
+ self.parent[x] = self.find(self.parent[x])
522
+ return self.parent[x]
523
+
524
+ def union(self, x, y):
525
+ px, py = self.find(x), self.find(y)
526
+ if px == py:
527
+ return
528
+ if self.rank[px] < self.rank[py]:
529
+ px, py = py, px
530
+ self.parent[py] = px
531
+ if self.rank[px] == self.rank[py]:
532
+ self.rank[px] += 1
533
+
534
+ n_features = len(consensus_features)
535
+ uf = UnionFind(n_features)
536
+
537
+ # Find features to merge based on proximity
538
+ merge_count = 0
539
+ for i in range(n_features):
540
+ for j in range(i + 1, n_features):
541
+ feat_i = consensus_features[i]
542
+ feat_j = consensus_features[j]
543
+
544
+ rt_diff = abs(feat_i['rt'] - feat_j['rt'])
545
+ mz_diff = abs(feat_i['mz'] - feat_j['mz'])
546
+
547
+ if rt_diff <= rt_tol and mz_diff <= mz_tol:
548
+ uf.union(i, j)
549
+ merge_count += 1
550
+
551
+ # Group features by their root
552
+ groups_by_root = defaultdict(list)
553
+ for i in range(n_features):
554
+ root = uf.find(i)
555
+ groups_by_root[root].append(consensus_features[i])
556
+
557
+ # Merge features within each group
558
+ merged_features = []
559
+ for group in groups_by_root.values():
560
+ if len(group) == 1:
561
+ # Single feature - keep as is
562
+ merged_features.append(group[0])
563
+ else:
564
+ # Multiple features - merge them
565
+ merged_feature = _merge_feature_group(group)
566
+ merged_features.append(merged_feature)
567
+
568
+ self.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
569
+ return merged_features
570
+
571
+
572
+ def _merge_feature_group(feature_group: list) -> dict:
573
+ """
574
+ Merge a group of similar consensus features into one.
575
+
576
+ Args:
577
+ feature_group: List of consensus feature dictionaries to merge
578
+
579
+ Returns:
580
+ Merged consensus feature dictionary
581
+ """
582
+ if not feature_group:
583
+ return {}
584
+
585
+ if len(feature_group) == 1:
586
+ return feature_group[0]
587
+
588
+ # Use the feature with highest sample count as base
589
+ base_feature = max(feature_group, key=lambda f: f.get('number_samples', 0))
590
+ merged = base_feature.copy()
591
+
592
+ # Aggregate numeric statistics
593
+ rt_values = [f['rt'] for f in feature_group if f.get('rt') is not None]
594
+ mz_values = [f['mz'] for f in feature_group if f.get('mz') is not None]
595
+ sample_counts = [f.get('number_samples', 0) for f in feature_group]
596
+ intensities = [f.get('inty_mean', 0) for f in feature_group if f.get('inty_mean') is not None]
597
+
598
+ # Update merged feature statistics
599
+ if rt_values:
600
+ merged['rt'] = float(np.mean(rt_values))
601
+ merged['rt_min'] = min([f.get('rt_min', f['rt']) for f in feature_group])
602
+ merged['rt_max'] = max([f.get('rt_max', f['rt']) for f in feature_group])
603
+ merged['rt_mean'] = float(np.mean(rt_values))
604
+
605
+ if mz_values:
606
+ merged['mz'] = float(np.mean(mz_values))
607
+ merged['mz_min'] = min([f.get('mz_min', f['mz']) for f in feature_group])
608
+ merged['mz_max'] = max([f.get('mz_max', f['mz']) for f in feature_group])
609
+ merged['mz_mean'] = float(np.mean(mz_values))
610
+
611
+ # Use maximum sample count (features might be detected in overlapping but different samples)
612
+ merged['number_samples'] = max(sample_counts)
613
+
614
+ # Use weighted average intensity (by sample count)
615
+ if intensities and sample_counts:
616
+ total_weight = sum(sample_counts)
617
+ if total_weight > 0:
618
+ weighted_intensity = sum(inty * count for inty, count in zip(intensities, sample_counts)) / total_weight
619
+ merged['inty_mean'] = float(weighted_intensity)
620
+
621
+ # Aggregate chromatographic quality metrics if available
622
+ coherence_values = [f.get('chrom_coherence_mean', 0) for f in feature_group if f.get('chrom_coherence_mean') is not None]
623
+ prominence_values = [f.get('chrom_prominence_mean', 0) for f in feature_group if f.get('chrom_prominence_mean') is not None]
624
+
625
+ if coherence_values:
626
+ merged['chrom_coherence_mean'] = float(np.mean(coherence_values))
627
+ if prominence_values:
628
+ merged['chrom_prominence_mean'] = float(np.mean(prominence_values))
629
+
630
+ # Merge MS2 counts
631
+ ms2_counts = [f.get('number_ms2', 0) for f in feature_group]
632
+ merged['number_ms2'] = sum(ms2_counts)
633
+
634
+ # Keep the best quality score
635
+ quality_scores = [f.get('quality', 1.0) for f in feature_group if f.get('quality') is not None]
636
+ if quality_scores:
637
+ merged['quality'] = max(quality_scores)
638
+
639
+ return merged
640
+
641
+
642
+ def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
643
+ """
644
+ Validate that merged features have sufficient sample overlap.
645
+
646
+ Args:
647
+ features: List of consensus feature dictionaries
648
+ min_overlap: Minimum sample overlap ratio (0.0-1.0)
649
+
650
+ Returns:
651
+ List of validated features
652
+ """
653
+ # This is a placeholder for sample overlap validation
654
+ # Implementation would require access to which samples each feature appears in
655
+ # For now, we'll use a simple heuristic based on feature statistics
656
+
657
+ validated_features = []
658
+ for feature in features:
659
+ # Simple validation based on RT spread and sample count ratio
660
+ rt_spread = feature.get('rt_max', feature['rt']) - feature.get('rt_min', feature['rt'])
661
+ sample_count = feature.get('number_samples', 1)
662
+
663
+ # Features with very tight RT spread and high sample counts are more reliable
664
+ if rt_spread <= 2.0 or sample_count >= 10: # More permissive validation
665
+ validated_features.append(feature)
666
+ else:
667
+ # Could implement more sophisticated sample overlap checking here
668
+ validated_features.append(feature) # Keep for now
669
+
670
+ return validated_features
671
+
672
+
673
+ def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
674
+ """
675
+ Filter out features with excessive RT spread.
676
+
677
+ Args:
678
+ features: List of consensus feature dictionaries
679
+ max_rt_spread: Maximum allowed RT spread in seconds
680
+
681
+ Returns:
682
+ List of filtered features
683
+ """
684
+ filtered_features = []
685
+ filtered_count = 0
686
+
687
+ for feature in features:
688
+ rt_min = feature.get('rt_min', feature['rt'])
689
+ rt_max = feature.get('rt_max', feature['rt'])
690
+ rt_spread = rt_max - rt_min
691
+
692
+ if rt_spread <= max_rt_spread:
693
+ filtered_features.append(feature)
694
+ else:
695
+ filtered_count += 1
696
+
697
+ if filtered_count > 0:
698
+ self.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
699
+
700
+ return filtered_features
701
+
702
+
703
+ def _filter_coherence(self, features: list, min_coherence: float) -> list:
704
+ """
705
+ Filter out features with low chromatographic coherence.
706
+
707
+ Args:
708
+ features: List of consensus feature dictionaries
709
+ min_coherence: Minimum chromatographic coherence score
710
+
711
+ Returns:
712
+ List of filtered features
713
+ """
714
+ filtered_features = []
715
+ filtered_count = 0
716
+
717
+ for feature in features:
718
+ coherence = feature.get('chrom_coherence_mean', 1.0) # Default to high coherence if missing
719
+
720
+ if coherence >= min_coherence:
721
+ filtered_features.append(feature)
722
+ else:
723
+ filtered_count += 1
724
+
725
+ if filtered_count > 0:
726
+ self.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
727
+
728
+ return filtered_features
729
+
730
+
731
+ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
732
+ """KD-tree based merge without RT warping"""
733
+
734
+ consensus_map = oms.ConsensusMap()
735
+ file_descriptions = consensus_map.getColumnHeaders()
736
+
737
+ for i, feature_map in enumerate(self.features_maps):
738
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
739
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
740
+ file_description.size = feature_map.size()
741
+ file_description.unique_id = feature_map.getUniqueId()
742
+ file_descriptions[i] = file_description
743
+
744
+ consensus_map.setColumnHeaders(file_descriptions)
745
+
746
+ # Configure KD algorithm with warping disabled for memory efficiency
747
+ grouper = oms.FeatureGroupingAlgorithmKD()
748
+ params_oms = grouper.getParameters()
749
+
750
+ params_oms.setValue("mz_unit", "Da")
751
+ params_oms.setValue("nr_partitions", params.nr_partitions)
752
+ params_oms.setValue("warp:enabled", "false") # Disabled for memory efficiency
753
+ params_oms.setValue("link:rt_tol", params.rt_tol)
754
+ params_oms.setValue("link:mz_tol", params.mz_tol)
755
+ params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
756
+ params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
757
+ params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
758
+ #params_oms.setValue("link:charge_merging", "Any")
759
+
760
+ grouper.setParameters(params_oms)
761
+ grouper.group(self.features_maps, consensus_map)
762
+
763
+ return consensus_map
764
+
765
+
766
+ def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
767
+ """Chunked merge with proper cross-chunk consensus building"""
768
+
769
+ n_samples = len(self.features_maps)
770
+ if n_samples <= params.chunk_size:
771
+ self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
772
+ consensus_map = _merge_kd(self, params)
773
+ # Extract consensus features to populate consensus_df for chunked method consistency
774
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
775
+ return consensus_map
776
+
777
+ # Process in chunks
778
+ chunks = []
779
+ for i in range(0, n_samples, params.chunk_size):
780
+ chunk_end = min(i + params.chunk_size, n_samples)
781
+ chunks.append((i, self.features_maps[i:chunk_end]))
782
+
783
+ self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
784
+
785
+ # Process each chunk to create chunk consensus maps
786
+ chunk_consensus_maps = []
787
+
788
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
789
+ chunk_consensus_map = oms.ConsensusMap()
790
+
791
+ # Set up file descriptions for chunk
792
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
793
+ for j, feature_map in enumerate(chunk_maps):
794
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
795
+ file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
796
+ file_description.size = feature_map.size()
797
+ file_description.unique_id = feature_map.getUniqueId()
798
+ file_descriptions[j] = file_description
799
+
800
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
801
+
802
+ # Use KD algorithm for chunk
803
+ grouper = oms.FeatureGroupingAlgorithmKD()
804
+ chunk_params = grouper.getParameters()
805
+ chunk_params.setValue("mz_unit", "Da")
806
+ chunk_params.setValue("nr_partitions", params.nr_partitions)
807
+ chunk_params.setValue("warp:enabled", "true")
808
+ chunk_params.setValue("warp:rt_tol", params.rt_tol)
809
+ chunk_params.setValue("warp:mz_tol", params.mz_tol)
810
+ chunk_params.setValue("link:rt_tol", params.rt_tol)
811
+ chunk_params.setValue("link:mz_tol", params.mz_tol)
812
+ chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
813
+ chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
814
+ chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
815
+
816
+ grouper.setParameters(chunk_params)
817
+ grouper.group(chunk_maps, chunk_consensus_map)
818
+
819
+ chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
820
+
821
+ # Merge chunk results with proper cross-chunk consensus building
822
+ _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
823
+
824
+ # Create a dummy consensus map for compatibility (since other functions expect it)
825
+ consensus_map = oms.ConsensusMap()
826
+ return consensus_map
827
+
828
+
829
+ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
830
+ """
831
+ Scalable aggregation of chunk consensus maps into final consensus_df.
832
+
833
+ This function implements cross-chunk consensus building by:
834
+ 1. Extracting feature_uids from each chunk consensus map
835
+ 2. Aggregating features close in RT/m/z across chunks
836
+ 3. Building consensus_df and consensus_mapping_df directly
837
+ """
838
+
839
+ if len(chunk_consensus_maps) == 1:
840
+ # Single chunk case - just extract using the true global min_samples.
841
+ # No need for permissive threshold because we are not discarding singletons pre-aggregation.
842
+ self._extract_consensus_features(
843
+ chunk_consensus_maps[0][1],
844
+ params.min_samples,
845
+ cached_adducts_df,
846
+ cached_valid_adducts,
847
+ )
848
+ return
849
+
850
+ # Build feature_uid to feature_data lookup for fast access
851
+ feature_uid_map = {
852
+ row["feature_id"]: row["feature_uid"]
853
+ for row in self.features_df.iter_rows(named=True)
854
+ }
855
+
856
+ features_lookup = _optimized_feature_lookup(self, self.features_df)
857
+
858
+ # Extract all consensus features from chunks with their feature_uids
859
+ all_chunk_consensus = []
860
+ consensus_id_counter = 0
861
+
862
+ for chunk_idx, (chunk_start_idx, chunk_consensus_map) in enumerate(chunk_consensus_maps):
863
+ for consensus_feature in chunk_consensus_map:
864
+ # ACCEPT ALL consensus features (size >=1) here.
865
+ # Reason: A feature that is globally present in many samples can still
866
+ # appear only once inside a given sample chunk. Early filtering at
867
+ # size>=2 causes irreversible loss and underestimates the final
868
+ # consensus count (observed ~296 vs 950 for KD). We defer filtering
869
+ # strictly to the final global min_samples.
870
+
871
+ # Extract feature_uids from this consensus feature
872
+ feature_uids = []
873
+ feature_data_list = []
874
+ sample_uids = []
875
+
876
+ for feature_handle in consensus_feature.getFeatureList():
877
+ fuid = str(feature_handle.getUniqueId())
878
+ if fuid not in feature_uid_map:
879
+ continue
880
+
881
+ feature_uid = feature_uid_map[fuid]
882
+ feature_data = features_lookup.get(feature_uid)
883
+ if feature_data:
884
+ feature_uids.append(feature_uid)
885
+ feature_data_list.append(feature_data)
886
+ sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
887
+
888
+ if not feature_data_list:
889
+ # No retrievable feature metadata (possible stale map reference) -> skip
890
+ continue
+
+ # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
891
+ rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
892
+ mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
893
+ if rt_vals_local:
894
+ rt_min_local = min(rt_vals_local)
895
+ rt_max_local = max(rt_vals_local)
896
+ else:
897
+ rt_min_local = rt_max_local = consensus_feature.getRT()
898
+ if mz_vals_local:
899
+ mz_min_local = min(mz_vals_local)
900
+ mz_max_local = max(mz_vals_local)
901
+ else:
902
+ mz_min_local = mz_max_local = consensus_feature.getMZ()
903
+
904
+ # Store chunk consensus with feature tracking
905
+ chunk_consensus_data = {
906
+ 'consensus_id': consensus_id_counter,
907
+ 'chunk_idx': chunk_idx,
908
+ 'chunk_start_idx': chunk_start_idx,
909
+ 'mz': consensus_feature.getMZ(),
910
+ 'rt': consensus_feature.getRT(),
911
+ 'mz_min': mz_min_local,
912
+ 'mz_max': mz_max_local,
913
+ 'rt_min': rt_min_local,
914
+ 'rt_max': rt_max_local,
915
+ 'intensity': consensus_feature.getIntensity(),
916
+ 'quality': consensus_feature.getQuality(),
917
+ 'feature_uids': feature_uids,
918
+ 'feature_data_list': feature_data_list,
919
+ 'sample_uids': sample_uids,
920
+ 'sample_count': len(feature_data_list)
921
+ }
922
+
923
+ all_chunk_consensus.append(chunk_consensus_data)
924
+ consensus_id_counter += 1
925
+
926
+ if not all_chunk_consensus:
927
+ # No valid consensus features found
928
+ self.consensus_df = pl.DataFrame()
929
+ self.consensus_mapping_df = pl.DataFrame()
930
+ return
931
+
932
+ # Perform cross-chunk clustering using optimized spatial indexing
933
+ def _cluster_chunk_consensus(chunk_consensus_list: list, rt_tol: float, mz_tol: float) -> list:
934
+ """Cluster chunk consensus features using interval overlap (no over-relaxation).
935
+
936
+ A union is formed if either centroids are within tolerance OR their RT / m/z
937
+ intervals (expanded by tolerance) overlap, and they originate from different chunks.
938
+ """
939
+ if not chunk_consensus_list:
940
+ return []
941
+
942
+ n_features = len(chunk_consensus_list)
943
+
944
+ # Spatial bins using strict tolerances (improves candidate reduction without recall loss)
945
+ rt_bin_size = rt_tol if rt_tol > 0 else 1.0
946
+ mz_bin_size = mz_tol if mz_tol > 0 else 0.01
947
+ features_by_bin = defaultdict(list)
948
+
949
+ for i, cf in enumerate(chunk_consensus_list):
950
+ rt_bin = int(cf['rt'] / rt_bin_size)
951
+ mz_bin = int(cf['mz'] / mz_bin_size)
952
+ features_by_bin[(rt_bin, mz_bin)].append(i)
953
+
954
+ class UF:
955
+ def __init__(self, n):
956
+ self.p = list(range(n))
957
+ self.r = [0]*n
958
+ def find(self, x):
959
+ if self.p[x] != x:
960
+ self.p[x] = self.find(self.p[x])
961
+ return self.p[x]
962
+ def union(self, a,b):
963
+ pa, pb = self.find(a), self.find(b)
964
+ if pa == pb:
965
+ return
966
+ if self.r[pa] < self.r[pb]:
967
+ pa, pb = pb, pa
968
+ self.p[pb] = pa
969
+ if self.r[pa] == self.r[pb]:
970
+ self.r[pa] += 1
971
+
972
+ uf = UF(n_features)
973
+ checked = set()
974
+ for (rtb, mzb), idxs in features_by_bin.items():
975
+ for dr in (-1,0,1):
976
+ for dm in (-1,0,1):
977
+ neigh = (rtb+dr, mzb+dm)
978
+ if neigh not in features_by_bin:
979
+ continue
980
+ for i in idxs:
981
+ for j in features_by_bin[neigh]:
982
+ if i >= j:
983
+ continue
984
+ pair = (i,j)
985
+ if pair in checked:
986
+ continue
987
+ checked.add(pair)
988
+ a = chunk_consensus_list[i]
989
+ b = chunk_consensus_list[j]
990
+ if a['chunk_idx'] == b['chunk_idx']:
991
+ continue
992
+
993
+ # Primary check: centroid distance (strict)
994
+ centroid_close = (abs(a['rt']-b['rt']) <= rt_tol and abs(a['mz']-b['mz']) <= mz_tol)
995
+
996
+ # Secondary check: interval overlap (more conservative)
997
+ # Only allow interval overlap if centroids are reasonably close (within 2x tolerance)
998
+ centroids_reasonable = (abs(a['rt']-b['rt']) <= 2 * rt_tol and abs(a['mz']-b['mz']) <= 2 * mz_tol)
999
+ if centroids_reasonable:
1000
+ rt_overlap = (a['rt_min'] - rt_tol/2) <= (b['rt_max'] + rt_tol/2) and (b['rt_min'] - rt_tol/2) <= (a['rt_max'] + rt_tol/2)
1001
+ mz_overlap = (a['mz_min'] - mz_tol/2) <= (b['mz_max'] + mz_tol/2) and (b['mz_min'] - mz_tol/2) <= (a['mz_max'] + mz_tol/2)
1002
+ else:
1003
+ rt_overlap = mz_overlap = False
1004
+
1005
+ if centroid_close or (rt_overlap and mz_overlap):
1006
+ uf.union(i,j)
1007
+
1008
+ groups_by_root = defaultdict(list)
1009
+ for i in range(n_features):
1010
+ groups_by_root[uf.find(i)].append(chunk_consensus_list[i])
1011
+ return list(groups_by_root.values())
1012
+ # (Obsolete relaxed + centroid stitching code removed.)
1013
+
1014
+ # --- Stage 1: initial cross-chunk clustering of chunk consensus features ---
1015
+ initial_groups = _cluster_chunk_consensus(all_chunk_consensus, params.rt_tol, params.mz_tol)
1016
+
1017
+ # --- Stage 2: centroid refinement (lightweight second pass) ---
1018
+ def _refine_groups(groups: list, rt_tol: float, mz_tol: float) -> list:
1019
+ """Refine groups by clustering group centroids (single-link) under same tolerances.
1020
+
1021
+ This reconciles borderline splits left after interval-overlap clustering without
1022
+ re-introducing broad over-merging. Works on group centroids only (low cost).
1023
+ """
1024
+ if len(groups) <= 1:
1025
+ return groups
1026
+ # Build centroid list
1027
+ centroids = [] # (idx, rt, mz)
1028
+ for gi, g in enumerate(groups):
1029
+ if not g:
1030
+ continue
1031
+ rt_vals = [cf['rt'] for cf in g]
1032
+ mz_vals = [cf['mz'] for cf in g]
1033
+ if not rt_vals or not mz_vals:
1034
+ continue
1035
+ centroids.append((gi, float(np.mean(rt_vals)), float(np.mean(mz_vals))))
1036
+ if len(centroids) <= 1:
1037
+ return groups
1038
+
1039
+ # Spatial binning for centroid clustering
1040
+ rt_bin = rt_tol if rt_tol > 0 else 1.0
1041
+ mz_bin = mz_tol if mz_tol > 0 else 0.01
1042
+ bins = defaultdict(list)
1043
+ for idx, rt_c, mz_c in centroids:
1044
+ bins[(int(rt_c/rt_bin), int(mz_c/mz_bin))].append((idx, rt_c, mz_c))
1045
+
1046
+ # Union-Find over group indices
1047
+ parent = list(range(len(groups)))
1048
+ rank = [0]*len(groups)
1049
+ def find(x):
1050
+ if parent[x] != x:
1051
+ parent[x] = find(parent[x])
1052
+ return parent[x]
1053
+ def union(a,b):
1054
+ pa, pb = find(a), find(b)
1055
+ if pa == pb:
1056
+ return
1057
+ if rank[pa] < rank[pb]:
1058
+ pa, pb = pb, pa
1059
+ parent[pb] = pa
1060
+ if rank[pa] == rank[pb]:
1061
+ rank[pa] += 1
1062
+
1063
+ checked = set()
1064
+ for (rb, mb), items in bins.items():
1065
+ for dr in (-1,0,1):
1066
+ for dm in (-1,0,1):
1067
+ neigh_key = (rb+dr, mb+dm)
1068
+ if neigh_key not in bins:
1069
+ continue
1070
+ for (gi, rt_i, mz_i) in items:
1071
+ for (gj, rt_j, mz_j) in bins[neigh_key]:
1072
+ if gi >= gj:
1073
+ continue
1074
+ pair = (gi, gj)
1075
+ if pair in checked:
1076
+ continue
1077
+ checked.add(pair)
1078
+ if abs(rt_i-rt_j) <= rt_tol and abs(mz_i-mz_j) <= mz_tol:
1079
+ union(gi, gj)
1080
+
1081
+ merged = defaultdict(list)
1082
+ for gi, g in enumerate(groups):
1083
+ merged[find(gi)].extend(g)
1084
+ return list(merged.values())
1085
+
1086
+ refined_groups = _refine_groups(initial_groups, params.rt_tol, params.mz_tol)
1087
+
1088
+ # --- Stage 3: build final consensus feature metadata and mapping ---
1089
+ consensus_metadata = []
1090
+ consensus_mapping_list = []
1091
+ consensus_uid_counter = 0
1092
+
1093
+ for group in refined_groups:
1094
+ if not group:
1095
+ continue
1096
+
1097
+ # Aggregate underlying feature data (deduplicated by feature_uid)
1098
+ feature_data_acc = {}
1099
+ sample_uids_acc = set()
1100
+ rt_values_chunk = [] # use chunk-level centroids for statistic helper
1101
+ mz_values_chunk = []
1102
+ intensity_values_chunk = []
1103
+ quality_values_chunk = []
1104
+
1105
+ for cf in group:
1106
+ rt_values_chunk.append(cf['rt'])
1107
+ mz_values_chunk.append(cf['mz'])
1108
+ intensity_values_chunk.append(cf.get('intensity', 0.0) or 0.0)
1109
+ quality_values_chunk.append(cf.get('quality', 1.0) or 1.0)
1110
+
1111
+ for fd, samp_uid in zip(cf['feature_data_list'], cf['sample_uids']):
1112
+ fid = fd.get('feature_uid') or fd.get('uid') or fd.get('feature_id')
1113
+ # feature_uid expected in fd under 'feature_uid'; fallback attempts just in case
1114
+ if fid is None:
1115
+ continue
1116
+ if fid not in feature_data_acc:
1117
+ feature_data_acc[fid] = fd
1118
+ sample_uids_acc.add(samp_uid)
1119
+
1120
+ if not feature_data_acc:
1121
+ continue
1122
+
1123
+ number_samples = len(sample_uids_acc)
1124
+
1125
+ # NOTE: Don't filter by min_samples here - let _finalize_merge handle it
1126
+ # This allows proper cross-chunk consensus building before final filtering
1127
+
1128
+ metadata = _calculate_consensus_statistics(
1129
+ self,
1130
+ consensus_uid_counter,
1131
+ list(feature_data_acc.values()),
1132
+ rt_values_chunk,
1133
+ mz_values_chunk,
1134
+ intensity_values_chunk,
1135
+ quality_values_chunk,
1136
+ number_features=len(feature_data_acc),
1137
+ number_samples=number_samples,
1138
+ cached_adducts_df=cached_adducts_df,
1139
+ cached_valid_adducts=cached_valid_adducts,
1140
+ )
1141
+
1142
+ # Validate RT spread doesn't exceed tolerance (with some flexibility for chunked merge)
1143
+ rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
1144
+ max_allowed_spread = params.rt_tol * 2 # Allow 2x tolerance for chunked method
1145
+
1146
+ if rt_spread > max_allowed_spread:
1147
+ # Skip consensus features with excessive RT spread
1148
+ self.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
1149
+ consensus_uid_counter += 1
1150
+ continue
1151
+
1152
+ consensus_metadata.append(metadata)
1153
+
1154
+ # Build mapping rows (deduplicated)
1155
+ for fid, fd in feature_data_acc.items():
1156
+ samp_uid = fd.get('sample_uid') or fd.get('sample_id') or fd.get('sample')
1157
+ # If absent we attempt to derive from original group sample_uids pairing
1158
+ # but most feature_data rows should include sample_uid already.
1159
+ if samp_uid is None:
1160
+ # fallback: search for cf containing this fid
1161
+ for cf in group:
1162
+ for fd2, samp2 in zip(cf['feature_data_list'], cf['sample_uids']):
1163
+ f2id = fd2.get('feature_uid') or fd2.get('uid') or fd2.get('feature_id')
1164
+ if f2id == fid:
1165
+ samp_uid = samp2
1166
+ break
1167
+ if samp_uid is not None:
1168
+ break
1169
+ if samp_uid is None:
1170
+ continue
1171
+ consensus_mapping_list.append({
1172
+ 'consensus_uid': consensus_uid_counter,
1173
+ 'sample_uid': samp_uid,
1174
+ 'feature_uid': fid,
1175
+ })
1176
+
1177
+ consensus_uid_counter += 1
1178
+
1179
+ # Assign DataFrames
1180
+ self.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
1181
+ self.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
1182
+
1183
+ # Ensure mapping only contains features from retained consensus_df
1184
+ if len(self.consensus_df) > 0:
1185
+ valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
1186
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
1187
+ pl.col('consensus_uid').is_in(list(valid_consensus_ids))
1188
+ )
1189
+ else:
1190
+ self.consensus_mapping_df = pl.DataFrame()
1191
+
1192
+ # Attach empty consensus_map placeholder for downstream compatibility
1193
+ self.consensus_map = oms.ConsensusMap()
1194
+ return
1195
+
1196
+
1197
+ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
1198
+ rt_values: list, mz_values: list,
1199
+ intensity_values: list, quality_values: list,
1200
+ number_features: int = None, number_samples: int = None,
1201
+ cached_adducts_df=None, cached_valid_adducts=None) -> dict:
1202
+ """
1203
+ Calculate comprehensive statistics for a consensus feature from aggregated feature data.
1204
+
1205
+ Args:
1206
+ consensus_uid: Unique ID for this consensus feature
1207
+ feature_data_list: List of individual feature dictionaries
1208
+ rt_values: RT values from chunk consensus features
1209
+ mz_values: m/z values from chunk consensus features
1210
+ intensity_values: Intensity values from chunk consensus features
1211
+ quality_values: Quality values from chunk consensus features
1212
+
1213
+ Returns:
1214
+ Dictionary with consensus feature metadata
1215
+ """
1216
+ if not feature_data_list:
1217
+ return {}
1218
+
1219
+ # Convert feature data to numpy arrays for vectorized computation
1220
+ rt_feat_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
1221
+ mz_feat_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
1222
+ rt_start_values = np.array([fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None])
1223
+ rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
1224
+ rt_delta_values = np.array([fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None])
1225
+ mz_start_values = np.array([fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None])
1226
+ mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
1227
+ inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
1228
+ coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
1229
+ prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
1230
+ prominence_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
1231
+ height_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
1232
+ iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
1233
+ charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
1234
+
1235
+ # Process adducts with cached validation
1236
+ all_adducts = []
1237
+ valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
1238
+ valid_adducts.add("?") # Always allow '?' adducts
1239
+
1240
+ for fd in feature_data_list:
1241
+ adduct = fd.get("adduct")
1242
+ if adduct is not None:
1243
+ # Only include adducts that are valid (from cached study adducts or contain '?')
1244
+ if adduct in valid_adducts or "?" in adduct:
1245
+ all_adducts.append(adduct)
1246
+
1247
+ # Calculate adduct consensus
1248
+ adduct_values = []
1249
+ adduct_top = None
1250
+ adduct_charge_top = None
1251
+ adduct_mass_neutral_top = None
1252
+ adduct_mass_shift_top = None
1253
+
1254
+ if all_adducts:
1255
+ adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
1256
+ total_count = sum(adduct_counts.values())
1257
+ for adduct, count in adduct_counts.items():
1258
+ percentage = (count / total_count) * 100 if total_count > 0 else 0
1259
+ adduct_values.append([str(adduct), int(count), float(round(percentage, 2))])
1260
+
1261
+ adduct_values.sort(key=lambda x: x[1], reverse=True)
1262
+
1263
+ if adduct_values:
1264
+ adduct_top = adduct_values[0][0]
1265
+ # Try to get charge and mass shift from cached study adducts
1266
+ adduct_found = False
1267
+ if cached_adducts_df is not None and not cached_adducts_df.is_empty():
1268
+ matching_adduct = cached_adducts_df.filter(
1269
+ pl.col("name") == adduct_top,
1270
+ )
1271
+ if not matching_adduct.is_empty():
1272
+ adduct_row = matching_adduct.row(0, named=True)
1273
+ adduct_charge_top = adduct_row["charge"]
1274
+ adduct_mass_shift_top = adduct_row["mass_shift"]
1275
+ adduct_found = True
1276
+
1277
+ if not adduct_found:
1278
+ # Set default charge and mass shift for top adduct
1279
+ adduct_charge_top = 1
1280
+ adduct_mass_shift_top = 1.007825
1281
+ else:
1282
+ # Default adduct based on study polarity
1283
+ study_polarity = getattr(study_obj, "polarity", "positive")
1284
+ if study_polarity in ["negative", "neg"]:
1285
+ adduct_top = "[M-?]1-"
1286
+ adduct_charge_top = -1
1287
+ adduct_mass_shift_top = -1.007825
1288
+ else:
1289
+ adduct_top = "[M+?]1+"
1290
+ adduct_charge_top = 1
1291
+ adduct_mass_shift_top = 1.007825
1292
+
1293
+ adduct_values = [[adduct_top, 1, 100.0]]
1294
+
1295
+ # Calculate neutral mass
1296
+ consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
1297
+ if adduct_charge_top and adduct_mass_shift_top is not None:
1298
+ adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
1299
+
1300
+ # Calculate MS2 count
1301
+ ms2_count = 0
1302
+ for fd in feature_data_list:
1303
+ ms2_scans = fd.get("ms2_scans")
1304
+ if ms2_scans is not None:
1305
+ ms2_count += len(ms2_scans)
1306
+
1307
+ # Build consensus metadata
1308
+ return {
1309
+ "consensus_uid": int(consensus_uid),
1310
+ "consensus_id": str(consensus_uid), # Use simple string ID
1311
+ "quality": round(float(np.mean(quality_values)), 3) if len(quality_values) > 0 else 1.0,
1312
+ "number_samples": number_samples if number_samples is not None else len(feature_data_list),
1313
+ "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
1314
+ "mz": consensus_mz,
1315
+ "rt_min": round(float(np.min(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
1316
+ "rt_max": round(float(np.max(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
1317
+ "rt_mean": round(float(np.mean(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
1318
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
1319
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
1320
+ "rt_delta_mean": round(float(np.mean(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
1321
+ "mz_min": round(float(np.min(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
1322
+ "mz_max": round(float(np.max(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
1323
+ "mz_mean": round(float(np.mean(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
1324
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
1325
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
1326
+ "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
1327
+ "bl": -1.0,
1328
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
1329
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
1330
+ "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
1331
+ "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
1332
+ "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
1333
+ "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
1334
+ "number_ms2": int(ms2_count),
1335
+ "adducts": adduct_values,
1336
+ "adduct_top": adduct_top,
1337
+ "adduct_charge_top": adduct_charge_top,
1338
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
1339
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
1340
+ "id_top_name": None,
1341
+ "id_top_class": None,
1342
+ "id_top_adduct": None,
1343
+ "id_top_score": None,
1344
+ }
1345
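+ # Worked example of the neutral-mass formula above (illustrative values only):
+ # mass_neutral = consensus_mz * |charge| - mass_shift, so a hypothetical
+ # [M+?]1+ feature at m/z 181.0707 gives 181.0707 - 1.007825 = 180.0629, and a
+ # hypothetical [M-?]1- feature at m/z 179.0550 gives 179.0550 + 1.007825 = 180.0628,
+ # i.e. both ionization forms resolve to (nearly) the same neutral mass.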
+
1346
+
1347
+ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) -> list:
1348
+ """
1349
+ Cluster consensus features from different chunks based on RT and m/z similarity.
1350
+
1351
+ Args:
1352
+ features: List of feature dictionaries with 'mz', 'rt', and 'chunk_idx' keys
1353
+ rt_tol: RT tolerance in seconds
1354
+ mz_tol: m/z tolerance in Da
1355
+
1356
+ Returns:
1357
+ List of groups, where each group is a list of feature dictionaries
1358
+ """
1359
+ if not features:
1360
+ return []
1361
+
1362
+ # Use Union-Find for efficient clustering
1363
+ class UnionFind:
1364
+ def __init__(self, n):
1365
+ self.parent = list(range(n))
1366
+ self.rank = [0] * n
1367
+
1368
+ def find(self, x):
1369
+ if self.parent[x] != x:
1370
+ self.parent[x] = self.find(self.parent[x])
1371
+ return self.parent[x]
1372
+
1373
+ def union(self, x, y):
1374
+ px, py = self.find(x), self.find(y)
1375
+ if px == py:
1376
+ return
1377
+ if self.rank[px] < self.rank[py]:
1378
+ px, py = py, px
1379
+ self.parent[py] = px
1380
+ if self.rank[px] == self.rank[py]:
1381
+ self.rank[px] += 1
1382
+
1383
+ n_features = len(features)
1384
+ uf = UnionFind(n_features)
1385
+
1386
+ # Pairwise comparison of cross-chunk features; union any pair within both tolerances
1387
+ for i in range(n_features):
1388
+ for j in range(i + 1, n_features):
1389
+ feat_i = features[i]
1390
+ feat_j = features[j]
1391
+
1392
+ # Skip if features are from the same chunk (they're already processed)
1393
+ if feat_i['chunk_idx'] == feat_j['chunk_idx']:
1394
+ continue
1395
+
1396
+ mz_diff = abs(feat_i['mz'] - feat_j['mz'])
1397
+ rt_diff = abs(feat_i['rt'] - feat_j['rt'])
1398
+
1399
+ # Cluster if within tolerance
1400
+ if mz_diff <= mz_tol and rt_diff <= rt_tol:
1401
+ uf.union(i, j)
1402
+
1403
+ # Extract groups
1404
+ groups_by_root = {}
1405
+ for i in range(n_features):
1406
+ root = uf.find(i)
1407
+ if root not in groups_by_root:
1408
+ groups_by_root[root] = []
1409
+ groups_by_root[root].append(features[i])
1410
+
1411
+ return list(groups_by_root.values())
1412
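+ # Example usage (illustrative toy values; the 'name' key is only a label for readability):
+ # >>> toy = [
+ # ...     {"name": "a", "chunk_idx": 0, "mz": 301.1410, "rt": 65.2},
+ # ...     {"name": "b", "chunk_idx": 1, "mz": 301.1415, "rt": 66.0},  # within 0.01 Da / 2 s of "a"
+ # ...     {"name": "c", "chunk_idx": 1, "mz": 420.2000, "rt": 300.0},
+ # ... ]
+ # >>> groups = _cluster_consensus_features(toy, rt_tol=2.0, mz_tol=0.01)
+ # Two groups come back: the "a"/"b" dicts are merged across chunks, "c" stays on its own.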
+
1413
+
1414
+ # Note: Restored proper chunked implementation with cross-chunk consensus clustering
1415
+
1416
+
1417
+ def _reset_consensus_data(self):
1418
+ """Reset consensus-related DataFrames at the start of merge."""
1419
+ self.consensus_df = pl.DataFrame()
1420
+ self.consensus_ms2 = pl.DataFrame()
1421
+ self.consensus_mapping_df = pl.DataFrame()
1422
+
1423
+
1424
+ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
1425
+ """Extract consensus features and build metadata."""
1426
+ # create a dict to map uid to feature_uid using self.features_df
1427
+ feature_uid_map = {
1428
+ row["feature_id"]: row["feature_uid"]
1429
+ for row in self.features_df.iter_rows(named=True)
1430
+ }
1431
+ imax = consensus_map.size()
1432
+
1433
+ self.logger.debug(f"Found {imax} feature groups by clustering.")
1434
+
1435
+ # Pre-build fast lookup tables for features_df data using optimized approach
1436
+ features_lookup = _optimized_feature_lookup(self, self.features_df)
1437
+
1438
+ # create a list to store the consensus mapping
1439
+ consensus_mapping = []
1440
+ metadata_list = []
1441
+
1442
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
1443
+
1444
+ for i, feature in enumerate(
1445
+ tqdm(
1446
+ consensus_map,
1447
+ total=imax,
1448
+ disable=tqdm_disable,
1449
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
1450
+ ),
1451
+ ):
1452
+ # get all features in the feature map with the same unique id as the consensus feature
1453
+ features_list = feature.getFeatureList()
1454
+ uids = []
1455
+ feature_data_list = []
1456
+
1457
+ for _j, f in enumerate(features_list):
1458
+ fuid = str(f.getUniqueId())
1459
+ if fuid not in feature_uid_map:
1460
+ # this is a feature that was removed but is still in the feature maps
1461
+ continue
1462
+ fuid = feature_uid_map[fuid]
1463
+ consensus_mapping.append(
1464
+ {
1465
+ "consensus_uid": i,
1466
+ "sample_uid": f.getMapIndex() + 1,
1467
+ "feature_uid": fuid,
1468
+ },
1469
+ )
1470
+ uids.append(fuid)
1471
+
1472
+ # Get feature data from lookup instead of DataFrame filtering
1473
+ feature_data = features_lookup.get(fuid)
1474
+ if feature_data:
1475
+ feature_data_list.append(feature_data)
1476
+
1477
+ if not feature_data_list:
1478
+ # Skip this consensus feature if no valid features found
1479
+ continue
1480
+
1481
+ # Compute statistics using vectorized operations on collected data
1482
+ # Convert to numpy arrays for faster computation
1483
+ rt_values = np.array(
1484
+ [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
1485
+ )
1486
+ mz_values = np.array(
1487
+ [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
1488
+ )
1489
+ rt_start_values = np.array(
1490
+ [
1491
+ fd.get("rt_start", 0)
1492
+ for fd in feature_data_list
1493
+ if fd.get("rt_start") is not None
1494
+ ],
1495
+ )
1496
+ rt_end_values = np.array(
1497
+ [
1498
+ fd.get("rt_end", 0)
1499
+ for fd in feature_data_list
1500
+ if fd.get("rt_end") is not None
1501
+ ],
1502
+ )
1503
+ rt_delta_values = np.array(
1504
+ [
1505
+ fd.get("rt_delta", 0)
1506
+ for fd in feature_data_list
1507
+ if fd.get("rt_delta") is not None
1508
+ ],
1509
+ )
1510
+ mz_start_values = np.array(
1511
+ [
1512
+ fd.get("mz_start", 0)
1513
+ for fd in feature_data_list
1514
+ if fd.get("mz_start") is not None
1515
+ ],
1516
+ )
1517
+ mz_end_values = np.array(
1518
+ [
1519
+ fd.get("mz_end", 0)
1520
+ for fd in feature_data_list
1521
+ if fd.get("mz_end") is not None
1522
+ ],
1523
+ )
1524
+ inty_values = np.array(
1525
+ [
1526
+ fd.get("inty", 0)
1527
+ for fd in feature_data_list
1528
+ if fd.get("inty") is not None
1529
+ ],
1530
+ )
1531
+ coherence_values = np.array(
1532
+ [
1533
+ fd.get("chrom_coherence", 0)
1534
+ for fd in feature_data_list
1535
+ if fd.get("chrom_coherence") is not None
1536
+ ],
1537
+ )
1538
+ prominence_values = np.array(
1539
+ [
1540
+ fd.get("chrom_prominence", 0)
1541
+ for fd in feature_data_list
1542
+ if fd.get("chrom_prominence") is not None
1543
+ ],
1544
+ )
1545
+ prominence_scaled_values = np.array(
1546
+ [
1547
+ fd.get("chrom_height_scaled", 0)
1548
+ for fd in feature_data_list
1549
+ if fd.get("chrom_height_scaled") is not None
1550
+ ],
1551
+ )
1552
+ height_scaled_values = np.array(
1553
+ [
1554
+ fd.get("chrom_prominence_scaled", 0)
1555
+ for fd in feature_data_list
1556
+ if fd.get("chrom_prominence_scaled") is not None
1557
+ ],
1558
+ )
1559
+ iso_values = np.array(
1560
+ [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
1561
+ )
1562
+ charge_values = np.array(
1563
+ [
1564
+ fd.get("charge", 0)
1565
+ for fd in feature_data_list
1566
+ if fd.get("charge") is not None
1567
+ ],
1568
+ )
1569
+
1570
+ # adduct_values
1571
+ # Collect all adducts from feature_data_list to create consensus adduct information
1572
+ # Only consider adducts that are in study._get_adducts() plus items with '?'
1573
+ all_adducts = []
1574
+ adduct_masses = {}
1575
+
1576
+ # Get valid adducts from cached result (avoid repeated _get_adducts() calls)
1577
+ valid_adducts = set(cached_valid_adducts) if cached_valid_adducts is not None else set() # copy so the cached set is not mutated below
1578
+ valid_adducts.add("?") # Always allow '?' adducts
1579
+
1580
+ for fd in feature_data_list:
1581
+ # Get individual adduct and mass from each feature data (fd)
1582
+ adduct = fd.get("adduct")
1583
+ adduct_mass = fd.get("adduct_mass")
1584
+
1585
+ if adduct is not None:
1586
+ # Only include adducts that are valid (from study._get_adducts() or contain '?')
1587
+ if adduct in valid_adducts or "?" in adduct:
1588
+ all_adducts.append(adduct)
1589
+ if adduct_mass is not None:
1590
+ adduct_masses[adduct] = adduct_mass
1591
+
1592
+ # Calculate adduct_values for the consensus feature
1593
+ adduct_values = []
1594
+ if all_adducts:
1595
+ adduct_counts = {
1596
+ adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
1597
+ }
1598
+ total_count = sum(adduct_counts.values())
1599
+ for adduct, count in adduct_counts.items():
1600
+ percentage = (count / total_count) * 100 if total_count > 0 else 0
1601
+ # Store as list with [name, num, %] format for the adducts column
1602
+ adduct_values.append(
1603
+ [
1604
+ str(adduct),
1605
+ int(count),
1606
+ float(round(percentage, 2)),
1607
+ ],
1608
+ )
1609
+
1610
+ # Sort adduct_values by count in descending order
1611
+ adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
1612
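+ # e.g. adducts ['[M+H]+', '[M+H]+', '[M+Na]+'] ->
+ # [['[M+H]+', 2, 66.67], ['[M+Na]+', 1, 33.33]]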
+ # Store adduct_values for use in metadata
1613
+ consensus_adduct_values = adduct_values
1614
+
1615
+ # Extract top adduct information for new columns
1616
+ adduct_top = None
1617
+ adduct_charge_top = None
1618
+ adduct_mass_neutral_top = None
1619
+ adduct_mass_shift_top = None
1620
+
1621
+ if consensus_adduct_values:
1622
+ top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
1623
+ adduct_top = top_adduct_name
1624
+
1625
+ # Parse adduct information to extract charge and mass shift
1626
+ # Handle "?" as "H" and parse common adduct formats
1627
+ if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
1628
+ adduct_charge_top = 1
1629
+ adduct_mass_shift_top = 1.007825 # H mass
1630
+ elif top_adduct_name == "[M+?]-":
1631
+ adduct_charge_top = -1
1632
+ adduct_mass_shift_top = -1.007825 # -H mass
1633
+ else:
1634
+ # Try to get charge and mass shift from cached study adducts
1635
+ adduct_found = False
1636
+ if cached_adducts_df is not None and not cached_adducts_df.is_empty():
1637
+ # Look for exact match in study adducts
1638
+ matching_adduct = cached_adducts_df.filter(
1639
+ pl.col("name") == top_adduct_name,
1640
+ )
1641
+ if not matching_adduct.is_empty():
1642
+ adduct_row = matching_adduct.row(0, named=True)
1643
+ adduct_charge_top = adduct_row["charge"]
1644
+ adduct_mass_shift_top = adduct_row["mass_shift"]
1645
+ adduct_found = True
1646
+
1647
+ if not adduct_found:
1648
+ # Fallback to regex parsing
1649
+ import re
1650
+
1651
+ # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
1652
+ pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
1653
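+ # e.g. "[M+Na]+" -> sign '+', element 'Na', no multiplier, charge +1;
+ # "[M-H]-" -> sign '-', element 'H', charge -1; "[M+H]2+" -> charge +2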
+ match = re.match(pattern, top_adduct_name)
1654
+
1655
+ if match:
1656
+ sign = match.group(1)
1657
+ element = match.group(2)
1658
+ multiplier_str = match.group(3)
1659
+ charge_sign = match.group(4)
1660
+
1661
+ multiplier = int(multiplier_str) if multiplier_str else 1
1662
+ charge = multiplier if charge_sign == "+" else -multiplier
1663
+ adduct_charge_top = charge
1664
+
1665
+ # Calculate mass shift based on element
1666
+ element_masses = {
1667
+ "H": 1.007825,
1668
+ "Na": 22.989769,
1669
+ "K": 38.963708,
1670
+ "NH4": 18.033823,
1671
+ "Li": 7.016930,
1672
+ "Cl": 34.969401,
1673
+ "Br": 78.918885,
1674
+ "HCOO": 44.998201,
1675
+ "CH3COO": 59.013851,
1676
+ "H2O": 18.010565,
1677
+ }
1678
+
1679
+ base_mass = element_masses.get(
1680
+ element,
1681
+ 1.007825,
1682
+ ) # Default to H if unknown
1683
+ mass_shift = (
1684
+ base_mass * multiplier
1685
+ if sign == "+"
1686
+ else -base_mass * multiplier
1687
+ )
1688
+ adduct_mass_shift_top = mass_shift
1689
+ else:
1690
+ # Default fallback
1691
+ adduct_charge_top = 1
1692
+ adduct_mass_shift_top = 1.007825
1693
+ else:
1694
+ # No valid adducts found - assign default based on study polarity
1695
+ study_polarity = getattr(self, "polarity", "positive")
1696
+ if study_polarity in ["negative", "neg"]:
1697
+ # Negative mode default
1698
+ adduct_top = "[M-?]1-"
1699
+ adduct_charge_top = -1
1700
+ adduct_mass_shift_top = -1.007825 # -H mass (loss of proton)
1701
+ else:
1702
+ # Positive mode default (includes 'positive', 'pos', or any other value)
1703
+ adduct_top = "[M+?]1+"
1704
+ adduct_charge_top = 1
1705
+ adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
1706
+
1707
+ # Create a single default adduct entry in the adducts list for consistency
1708
+ consensus_adduct_values = [[adduct_top, 1, 100.0]]
1709
+
1710
+ # Calculate neutral mass from consensus mz (for both cases)
1711
+ consensus_mz = (
1712
+ round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
1713
+ )
1714
+ if adduct_charge_top and adduct_mass_shift_top is not None:
1715
+ adduct_mass_neutral_top = (
1716
+ consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
1717
+ )
1718
+
1719
+ # Calculate number of MS2 spectra
1720
+ ms2_count = 0
1721
+ for fd in feature_data_list:
1722
+ ms2_scans = fd.get("ms2_scans")
1723
+ if ms2_scans is not None:
1724
+ ms2_count += len(ms2_scans)
1725
+
1726
+ metadata_list.append(
1727
+ {
1728
+ "consensus_uid": int(i), # "consensus_id": i,
1729
+ "consensus_id": str(feature.getUniqueId()),
1730
+ "quality": round(float(feature.getQuality()), 3),
1731
+ "number_samples": len(feature_data_list),
1732
+ # "number_ext": int(len(features_list)),
1733
+ "rt": round(float(np.mean(rt_values)), 4)
1734
+ if len(rt_values) > 0
1735
+ else 0.0,
1736
+ "mz": round(float(np.mean(mz_values)), 4)
1737
+ if len(mz_values) > 0
1738
+ else 0.0,
1739
+ "rt_min": round(float(np.min(rt_values)), 3)
1740
+ if len(rt_values) > 0
1741
+ else 0.0,
1742
+ "rt_max": round(float(np.max(rt_values)), 3)
1743
+ if len(rt_values) > 0
1744
+ else 0.0,
1745
+ "rt_mean": round(float(np.mean(rt_values)), 3)
1746
+ if len(rt_values) > 0
1747
+ else 0.0,
1748
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
1749
+ if len(rt_start_values) > 0
1750
+ else 0.0,
1751
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
1752
+ if len(rt_end_values) > 0
1753
+ else 0.0,
1754
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
1755
+ if len(rt_delta_values) > 0
1756
+ else 0.0,
1757
+ "mz_min": round(float(np.min(mz_values)), 4)
1758
+ if len(mz_values) > 0
1759
+ else 0.0,
1760
+ "mz_max": round(float(np.max(mz_values)), 4)
1761
+ if len(mz_values) > 0
1762
+ else 0.0,
1763
+ "mz_mean": round(float(np.mean(mz_values)), 4)
1764
+ if len(mz_values) > 0
1765
+ else 0.0,
1766
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
1767
+ if len(mz_start_values) > 0
1768
+ else 0.0,
1769
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
1770
+ if len(mz_end_values) > 0
1771
+ else 0.0,
1772
+ "inty_mean": round(float(np.mean(inty_values)), 0)
1773
+ if len(inty_values) > 0
1774
+ else 0.0,
1775
+ "bl": -1.0,
1776
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
1777
+ if len(coherence_values) > 0
1778
+ else 0.0,
1779
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
1780
+ if len(prominence_values) > 0
1781
+ else 0.0,
1782
+ "chrom_prominence_scaled_mean": round(
1783
+ float(np.mean(prominence_scaled_values)),
1784
+ 3,
1785
+ )
1786
+ if len(prominence_scaled_values) > 0
1787
+ else 0.0,
1788
+ "chrom_height_scaled_mean": round(
1789
+ float(np.mean(height_scaled_values)),
1790
+ 3,
1791
+ )
1792
+ if len(height_scaled_values) > 0
1793
+ else 0.0,
1794
+ "iso_mean": round(float(np.mean(iso_values)), 2)
1795
+ if len(iso_values) > 0
1796
+ else 0.0,
1797
+ "charge_mean": round(float(np.mean(charge_values)), 2)
1798
+ if len(charge_values) > 0
1799
+ else 0.0,
1800
+ "number_ms2": int(ms2_count),
1801
+ "adducts": consensus_adduct_values
1802
+ if consensus_adduct_values
1803
+ else [], # Ensure it's always a list
1804
+ # New columns for top-ranked adduct information
1805
+ "adduct_top": adduct_top,
1806
+ "adduct_charge_top": adduct_charge_top,
1807
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
1808
+ if adduct_mass_neutral_top is not None
1809
+ else None,
1810
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
1811
+ if adduct_mass_shift_top is not None
1812
+ else None,
1813
+ # New columns for top-scoring identification results
1814
+ "id_top_name": None,
1815
+ "id_top_class": None,
1816
+ "id_top_adduct": None,
1817
+ "id_top_score": None,
1818
+ },
1819
+ )
1820
+
1821
+ consensus_mapping_df = pl.DataFrame(consensus_mapping)
1822
+ # remove rows in consensus_mapping_df whose feature_uid is not in self.features_df['feature_uid']
1823
+ l1 = len(consensus_mapping_df)
1824
+ consensus_mapping_df = consensus_mapping_df.filter(
1825
+ pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
1826
+ )
1827
+ self.logger.debug(
1828
+ f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
1829
+ )
1830
+ self.consensus_mapping_df = consensus_mapping_df
1831
+ self.consensus_df = pl.DataFrame(metadata_list, strict=False)
1832
+
1833
+ if min_samples is None:
1834
+ min_samples = 1
1835
+ if min_samples < 1:
1836
+ min_samples = int(min_samples * len(self.samples_df))
1837
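+ # e.g. a fractional min_samples of 0.25 with 40 samples becomes int(0.25 * 40) = 10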
+
1838
+ # Validate that min_samples doesn't exceed the number of samples
1839
+ if min_samples > len(self.samples_df):
1840
+ self.logger.warning(
1841
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
1842
+ f"Setting min_samples to {len(self.samples_df)}.",
1843
+ )
1844
+ min_samples = len(self.samples_df)
1845
+
1846
+ # filter out consensus features with less than min_samples features
1847
+ l1 = len(self.consensus_df)
1848
+ self.consensus_df = self.consensus_df.filter(
1849
+ pl.col("number_samples") >= min_samples,
1850
+ )
1851
+ self.logger.debug(
1852
+ f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
1853
+ )
1854
+ # filter out consensus mapping with less than min_samples features
1855
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
1856
+ pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
1857
+ )
1858
+
1859
+ self.consensus_map = consensus_map
1860
+
1861
+
1862
+ def _perform_adduct_grouping(self, rt_tol, mz_tol):
1863
+ """Perform adduct grouping on consensus features."""
1864
+ import polars as pl
1865
+
1866
+ # Add adduct grouping and adduct_of assignment
1867
+ if len(self.consensus_df) > 0:
1868
+ # Get relevant columns for grouping
1869
+ consensus_data = []
1870
+ for row in self.consensus_df.iter_rows(named=True):
1871
+ consensus_data.append(
1872
+ {
1873
+ "consensus_uid": row["consensus_uid"],
1874
+ "rt": row["rt"],
1875
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
1876
+ "adduct_top": row.get("adduct_top"),
1877
+ "inty_mean": row.get("inty_mean", 0),
1878
+ },
1879
+ )
1880
+
1881
+ # Use optimized adduct grouping
1882
+ adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
1883
+ self, consensus_data, rt_tol, mz_tol
1884
+ )
1885
+
1886
+ # Add the new columns to consensus_df
1887
+ self.consensus_df = self.consensus_df.with_columns(
1888
+ [
1889
+ pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
1890
+ pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
1891
+ ],
1892
+ )
1893
+
1894
+
1895
+ def _finalize_merge(self, link_ms2, min_samples):
1896
+ """Complete the merge process with final calculations and cleanup."""
1897
+ import polars as pl
1898
+
1899
+ # Check if consensus_df is empty or missing required columns
1900
+ if len(self.consensus_df) == 0 or "number_samples" not in self.consensus_df.columns:
1901
+ self.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
1902
+ return
1903
+
1904
+ # Validate min_samples parameter
1905
+ if min_samples is None:
1906
+ min_samples = 1
1907
+ if min_samples < 1:
1908
+ min_samples = int(min_samples * len(self.samples_df))
1909
+
1910
+ # Validate that min_samples doesn't exceed the number of samples
1911
+ if min_samples > len(self.samples_df):
1912
+ self.logger.warning(
1913
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
1914
+ f"Setting min_samples to {len(self.samples_df)}.",
1915
+ )
1916
+ min_samples = len(self.samples_df)
1917
+
1918
+ # Filter out consensus features with less than min_samples features
1919
+ l1 = len(self.consensus_df)
1920
+ self.consensus_df = self.consensus_df.filter(
1921
+ pl.col("number_samples") >= min_samples,
1922
+ )
1923
+ self.logger.debug(
1924
+ f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
1925
+ )
1926
+
1927
+ # Filter out consensus mapping with less than min_samples features
1928
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
1929
+ pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
1930
+ )
1931
+
1932
+ # Calculate the completeness of the consensus map
1933
+ if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
1934
+ c = (
1935
+ len(self.consensus_mapping_df)
1936
+ / len(self.consensus_df)
1937
+ / len(self.samples_df)
1938
+ )
1939
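+ # completeness = observed (consensus feature, sample) pairs / all possible pairs,
+ # e.g. 1200 mapping rows over 100 consensus features and 20 samples -> 0.60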
+ self.logger.info(
1940
+ f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
1941
+ )
1942
+ else:
1943
+ self.logger.warning(
1944
+ f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
1945
+ f"This may be due to min_samples ({min_samples}) being too high for the available data.",
1946
+ )
1947
+
1948
+ if link_ms2:
1949
+ self.find_ms2()
1950
+
1951
+
1952
+ def _optimized_feature_lookup(study_obj, features_df):
1953
+ """
1954
+ Optimized feature lookup creation using Polars operations.
1955
+ """
1956
+ study_obj.logger.debug("Creating optimized feature lookup...")
1957
+ start_time = time.time()
1958
+
1959
+ # Use Polars select for faster conversion
1960
+ feature_columns = [
1961
+ "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
1962
+ "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
1963
+ "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
1964
+ "ms2_scans", "adduct", "adduct_mass"
1965
+ ]
1966
+
1967
+ # Filter to only existing columns
1968
+ existing_columns = [col for col in feature_columns if col in features_df.columns]
1969
+
1970
+ # Convert to dictionary more efficiently
1971
+ selected_df = features_df.select(existing_columns)
1972
+
1973
+ features_lookup = {}
1974
+ for row in selected_df.iter_rows(named=True):
1975
+ feature_uid = row["feature_uid"]
1976
+ # Keep feature_uid in the dictionary for chunked merge compatibility
1977
+ features_lookup[feature_uid] = {k: v for k, v in row.items()}
1978
+
1979
+ lookup_time = time.time() - start_time
1980
+ if len(features_lookup) > 50000:
1981
+ study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
1982
+ return features_lookup
1983
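+ # Example usage (illustrative; only a .logger attribute is needed from the study object,
+ # so a stand-in namespace is used here):
+ # >>> import logging
+ # >>> import polars as pl
+ # >>> from types import SimpleNamespace
+ # >>> toy = SimpleNamespace(logger=logging.getLogger("demo"))
+ # >>> df = pl.DataFrame({"feature_uid": [11, 12], "rt": [65.2, 120.4], "mz": [301.1410, 180.0634]})
+ # >>> _optimized_feature_lookup(toy, df)[11]
+ # {'feature_uid': 11, 'rt': 65.2, 'mz': 301.141}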
+
1984
+
1985
+ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
1986
+ """
1987
+ Optimized O(n log n) adduct grouping using spatial indexing.
1988
+
1989
+ Args:
1990
+ study_obj: Study object with logger
1991
+ consensus_data: List of consensus feature dictionaries
1992
+ rt_tol: RT tolerance in minutes
1993
+ mz_tol: m/z tolerance in Da
1994
+
1995
+ Returns:
1996
+ Tuple of (adduct_group_list, adduct_of_list)
1997
+ """
1998
+ if not consensus_data:
1999
+ return [], []
2000
+
2001
+ n_features = len(consensus_data)
2002
+ if n_features > 10000:
2003
+ study_obj.logger.info(f"Adduct grouping for {n_features} consensus features...")
2004
+ else:
2005
+ study_obj.logger.debug(f"Adduct grouping for {n_features} consensus features...")
2006
+
2007
+ # Build spatial index using RT and neutral mass as coordinates
2008
+ features_by_mass = defaultdict(list)
2009
+ mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
2010
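+ # e.g. with mz_tol=0.01 the bins are 0.02 Da wide, so a neutral mass of 180.063
+ # lands in bin int(180.063 / 0.02) = 9003 and only that bin plus its two
+ # neighbours are scanned for grouping partners below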
+
2011
+ valid_features = []
2012
+ for feature in consensus_data:
2013
+ consensus_uid = feature["consensus_uid"]
2014
+ rt = feature["rt"]
2015
+ neutral_mass = feature.get("adduct_mass_neutral_top")
2016
+ intensity = feature.get("inty_mean", 0)
2017
+ adduct = feature.get("adduct_top", "")
2018
+
2019
+ if neutral_mass is not None:
2020
+ mass_bin = int(neutral_mass / mass_bin_size)
2021
+ features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
2022
+ valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
2023
+
2024
+ # Union-Find for efficient grouping
2025
+ class UnionFind:
2026
+ def __init__(self, n):
2027
+ self.parent = list(range(n))
2028
+ self.rank = [0] * n
2029
+
2030
+ def find(self, x):
2031
+ if self.parent[x] != x:
2032
+ self.parent[x] = self.find(self.parent[x])
2033
+ return self.parent[x]
2034
+
2035
+ def union(self, x, y):
2036
+ px, py = self.find(x), self.find(y)
2037
+ if px == py:
2038
+ return
2039
+ if self.rank[px] < self.rank[py]:
2040
+ px, py = py, px
2041
+ self.parent[py] = px
2042
+ if self.rank[px] == self.rank[py]:
2043
+ self.rank[px] += 1
2044
+
2045
+ uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
2046
+ uf = UnionFind(len(valid_features))
2047
+
2048
+ # Find groups using spatial index
2049
+ checked_pairs = set()
2050
+ for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
2051
+ for bin_offset in [-1, 0, 1]:
2052
+ check_bin = bin1 + bin_offset
2053
+ if check_bin not in features_by_mass:
2054
+ continue
2055
+
2056
+ for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
2057
+ if uid1 >= uid2:
2058
+ continue
2059
+
2060
+ pair = (min(uid1, uid2), max(uid1, uid2))
2061
+ if pair in checked_pairs:
2062
+ continue
2063
+ checked_pairs.add(pair)
2064
+
2065
+ mass_diff = abs(mass1 - mass2)
2066
+ rt_diff = abs(rt1 - rt2) / 60.0 # RT is in seconds; convert the difference to minutes to match rt_tol (minutes for this helper)
2067
+
2068
+ if mass_diff <= mz_tol and rt_diff <= rt_tol:
2069
+ j = uid_to_idx[uid2]
2070
+ uf.union(i, j)
2071
+
2072
+ # Extract groups
2073
+ groups_by_root = defaultdict(list)
2074
+ for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
2075
+ root = uf.find(i)
2076
+ groups_by_root[root].append((uid, rt, mass, inty, adduct))
2077
+
2078
+ groups = {}
2079
+ group_id = 1
2080
+ assigned_groups = {}
2081
+
2082
+ for group_members in groups_by_root.values():
2083
+ member_uids = [uid for uid, _, _, _, _ in group_members]
2084
+
2085
+ for uid in member_uids:
2086
+ assigned_groups[uid] = group_id
2087
+ groups[group_id] = member_uids
2088
+ group_id += 1
2089
+
2090
+ # Handle features without neutral mass
2091
+ for feature in consensus_data:
2092
+ uid = feature["consensus_uid"]
2093
+ if uid not in assigned_groups:
2094
+ assigned_groups[uid] = group_id
2095
+ groups[group_id] = [uid]
2096
+ group_id += 1
2097
+
2098
+ # Determine adduct_of for each group
2099
+ group_adduct_of = {}
2100
+ for grp_id, member_uids in groups.items():
2101
+ best_uid = None
2102
+ best_priority = -1
2103
+ best_intensity = 0
2104
+
2105
+ for uid in member_uids:
2106
+ feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
2107
+ if not feature_data:
2108
+ continue
2109
+
2110
+ adduct = feature_data.get("adduct_top", "")
2111
+ intensity = feature_data.get("inty_mean", 0)
2112
+
2113
+ priority = 0
2114
+ if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
2115
+ priority = 3
2116
+ elif adduct and "[M-H]" in adduct:
2117
+ priority = 2
2118
+ elif adduct and "M" in adduct:
2119
+ priority = 1
2120
+
2121
+ if priority > best_priority or (priority == best_priority and intensity > best_intensity):
2122
+ best_uid = uid
2123
+ best_priority = priority
2124
+ best_intensity = intensity
2125
+
2126
+ group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
2127
+
2128
+ # Build final lists in same order as consensus_data
2129
+ adduct_group_list = []
2130
+ adduct_of_list = []
2131
+
2132
+ for feature in consensus_data:
2133
+ uid = feature["consensus_uid"]
2134
+ group = assigned_groups.get(uid, 0)
2135
+ adduct_of = group_adduct_of.get(group, uid)
2136
+
2137
+ adduct_group_list.append(group)
2138
+ adduct_of_list.append(adduct_of)
2139
+
2140
+ if n_features > 10000:
2141
+ study_obj.logger.info("Adduct grouping completed.")
2142
+ else:
2143
+ study_obj.logger.debug("Adduct grouping completed.")
2144
+
2145
+ return adduct_group_list, adduct_of_list
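+ # Example usage (illustrative; a stand-in object with a .logger attribute is assumed,
+ # since this helper only needs logging from the study object):
+ # >>> import logging
+ # >>> from types import SimpleNamespace
+ # >>> toy_study = SimpleNamespace(logger=logging.getLogger("merge-demo"))
+ # >>> toy_consensus = [
+ # ...     {"consensus_uid": 1, "rt": 120.0, "adduct_mass_neutral_top": 180.0634,
+ # ...      "inty_mean": 5.0e6, "adduct_top": "[M+H]+"},
+ # ...     {"consensus_uid": 2, "rt": 121.0, "adduct_mass_neutral_top": 180.0630,
+ # ...      "inty_mean": 1.2e6, "adduct_top": "[M+Na]+"},
+ # ...     {"consensus_uid": 3, "rt": 300.0, "adduct_mass_neutral_top": None,
+ # ...      "inty_mean": 8.0e5, "adduct_top": None},
+ # ... ]
+ # >>> _optimized_adduct_grouping(toy_study, toy_consensus, rt_tol=0.5, mz_tol=0.01)
+ # ([1, 1, 2], [1, 1, 3])
+ # Features 1 and 2 share a neutral mass and RT, so they form adduct group 1 with the
+ # [M+H]+ member as 'adduct_of'; feature 3 has no neutral mass and gets its own group.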