masster 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/merge.py ADDED
@@ -0,0 +1,1607 @@
1
+ """
2
+ Unified merge module for the Study class.
3
+ Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'chunked'
4
+ """
5
+
6
+ import time
7
+ import numpy as np
8
+ from collections import defaultdict
9
+ from datetime import datetime
10
+ from tqdm import tqdm
11
+ import pyopenms as oms
12
+ import polars as pl
13
+ from masster.study.defaults import merge_defaults
14
+
15
+
16
+ def merge(self, **kwargs) -> None:
17
+ """
18
+ Group features across samples into consensus features using various algorithms.
19
+
20
+ This function provides a unified interface to multiple feature grouping algorithms,
21
+ each optimized for different dataset sizes and analysis requirements.
22
+
23
+ Parameters
24
+ ----------
25
+ **kwargs : dict
26
+ Parameters from merge_defaults class:
27
+ - method : str, default 'kd'
28
+ Merge algorithm: 'kd', 'qt', 'kd-nowarp', 'chunked'
29
+ - min_samples : int, default 10
30
+ Minimum number of samples for consensus feature
31
+ - rt_tol : float, default 2.0
32
+ RT tolerance in seconds
33
+ - mz_tol : float, default 0.01
34
+ m/z tolerance in Da (Daltons) for all methods
35
+ - chunk_size : int, default 500
36
+ Chunk size for 'chunked' method
37
+ - nr_partitions : int, default 500
38
+ Number of partitions in m/z dimension for KD algorithms
39
+ - min_rel_cc_size : float, default 0.3
40
+ Minimum relative connected component size for conflict resolution
41
+ - max_pairwise_log_fc : float, default 0.5
42
+ Maximum pairwise log fold change for conflict resolution
43
+ - max_nr_conflicts : int, default 0
44
+ Maximum number of conflicts allowed in consensus feature
45
+ - link_ms2 : bool, default True
46
+ Whether to link MS2 spectra to consensus features
47
+
48
+ Algorithm Guidelines
49
+ -------------------
50
+ - KD: Best general purpose, O(n log n), recommended default
51
+ - QT: Thorough but slow O(n²), good for <1000 samples
52
+ - KD-NoWarp: Memory efficient KD without RT warping for large datasets
53
+ - Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
54
+ Uses optimized partitioning for better memory management while maintaining
55
+ full cross-sample consensus feature detection.
56
+ """
57
+ start_time = time.time()
58
+
59
+ # Initialize with defaults and override with kwargs
60
+ params = merge_defaults()
61
+
62
+ # Filter and apply only valid parameters
63
+ valid_params = set(params.list_parameters())
64
+ for key, value in kwargs.items():
65
+ if key in valid_params:
66
+ setattr(params, key, value)
67
+ else:
68
+ self.logger.warning(f"Unknown parameter '{key}' ignored")
69
+
70
+ # Validate method
71
+ if params.method not in ['kd', 'qt', 'kd-nowarp', 'chunked']:
72
+ raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['kd', 'qt', 'kd-nowarp', 'chunked']")
73
+
74
+ # Persist last used params for diagnostics
75
+ try:
76
+ self._merge_params_last = params.to_dict()
77
+ except Exception:
78
+ self._merge_params_last = {}
79
+
80
+ # Ensure feature maps are available for merging (regenerate if needed)
81
+ if len(self.features_maps) < len(self.samples_df):
82
+ self.features_maps = []
83
+ self.load_features()
84
+
85
+ self.logger.info(
86
+ f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da, min_rel_cc_size={params.min_rel_cc_size}, max_pairwise_log_fc={params.max_pairwise_log_fc}, max_nr_conflicts={params.max_nr_conflicts}"
87
+ )
88
+
89
+ # Initialize
90
+ self._reset_consensus_data()
91
+
92
+ # Cache adducts for performance (avoid repeated _get_adducts() calls)
93
+ cached_adducts_df = None
94
+ cached_valid_adducts = None
95
+ try:
96
+ cached_adducts_df = self._get_adducts()
97
+ if not cached_adducts_df.is_empty():
98
+ cached_valid_adducts = set(cached_adducts_df["name"].to_list())
99
+ else:
100
+ cached_valid_adducts = set()
101
+ except Exception as e:
102
+ self.logger.warning(f"Could not retrieve study adducts: {e}")
103
+ cached_valid_adducts = set()
104
+
105
+ # Always allow '?' adducts
106
+ cached_valid_adducts.add("?")
107
+
108
+ # Route to algorithm implementation
109
+ if params.method == 'kd':
110
+ consensus_map = _merge_kd(self, params)
111
+ # Extract consensus features
112
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
113
+ elif params.method == 'qt':
114
+ consensus_map = _merge_qt(self, params)
115
+ # Extract consensus features
116
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
117
+ elif params.method == 'kd-nowarp':
118
+ consensus_map = _merge_kd_nowarp(self, params)
119
+ # Extract consensus features
120
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
121
+ elif params.method == 'chunked':
122
+ consensus_map = _merge_chunked(self, params, cached_adducts_df, cached_valid_adducts)
123
+ # Note: _merge_chunked populates consensus_df directly, no need to extract
124
+
125
+ # Perform adduct grouping
126
+ self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
127
+
128
+ # Link MS2 if requested
129
+ if params.link_ms2:
130
+ self._finalize_merge(params.link_ms2, params.min_samples)
131
+
132
+ # Log completion without the misleading feature count
133
+ elapsed = time.time() - start_time
134
+ self.logger.debug(f"Merge process completed in {elapsed:.1f}s")
135
+
136
+
137
+ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
138
+ """KD-tree based merge (fast, recommended)"""
139
+
140
+ consensus_map = oms.ConsensusMap()
141
+ file_descriptions = consensus_map.getColumnHeaders()
142
+
143
+ for i, feature_map in enumerate(self.features_maps):
144
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
145
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
146
+ file_description.size = feature_map.size()
147
+ file_description.unique_id = feature_map.getUniqueId()
148
+ file_descriptions[i] = file_description
149
+
150
+ consensus_map.setColumnHeaders(file_descriptions)
151
+
152
+ # Configure KD algorithm
153
+ grouper = oms.FeatureGroupingAlgorithmKD()
154
+ params_oms = grouper.getParameters()
155
+
156
+ params_oms.setValue("mz_unit", "Da")
157
+ params_oms.setValue("nr_partitions", params.nr_partitions)
158
+ params_oms.setValue("warp:enabled", "true")
159
+ params_oms.setValue("warp:rt_tol", params.rt_tol)
160
+ params_oms.setValue("warp:mz_tol", params.mz_tol)
161
+ params_oms.setValue("link:rt_tol", params.rt_tol)
162
+ params_oms.setValue("link:mz_tol", params.mz_tol)
163
+ params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
164
+ params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
165
+ params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
166
+ # params_oms.setValue("link:charge_merging", "With_charge_zero")  # NOTE: enabling this leads to a crash
167
+
168
+ grouper.setParameters(params_oms)
169
+ grouper.group(self.features_maps, consensus_map)
170
+
171
+ return consensus_map
172
+
173
+
174
+ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
175
+ """QT (Quality Threshold) based merge"""
176
+
177
+ n_samples = len(self.features_maps)
178
+ if n_samples > 1000:
179
+ self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
180
+
181
+ consensus_map = oms.ConsensusMap()
182
+ file_descriptions = consensus_map.getColumnHeaders()
183
+
184
+ for i, feature_map in enumerate(self.features_maps):
185
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
186
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
187
+ file_description.size = feature_map.size()
188
+ file_description.unique_id = feature_map.getUniqueId()
189
+ file_descriptions[i] = file_description
190
+
191
+ consensus_map.setColumnHeaders(file_descriptions)
192
+
193
+ # Configure QT algorithm
194
+ grouper = oms.FeatureGroupingAlgorithmQT()
195
+ params_oms = grouper.getParameters()
196
+
197
+ params_oms.setValue("distance_RT:max_difference", params.rt_tol)
198
+ params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
199
+ params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
200
+ params_oms.setValue("ignore_charge", "true")
201
+ params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
202
+ params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
203
+ params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
204
+ params_oms.setValue("nr_partitions", params.nr_partitions)
205
+
206
+ grouper.setParameters(params_oms)
207
+ grouper.group(self.features_maps, consensus_map)
208
+
209
+ return consensus_map
210
+
211
+
212
+ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
213
+ """KD-tree based merge without RT warping"""
214
+
215
+ consensus_map = oms.ConsensusMap()
216
+ file_descriptions = consensus_map.getColumnHeaders()
217
+
218
+ for i, feature_map in enumerate(self.features_maps):
219
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
220
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
221
+ file_description.size = feature_map.size()
222
+ file_description.unique_id = feature_map.getUniqueId()
223
+ file_descriptions[i] = file_description
224
+
225
+ consensus_map.setColumnHeaders(file_descriptions)
226
+
227
+ # Configure KD algorithm with warping disabled for memory efficiency
228
+ grouper = oms.FeatureGroupingAlgorithmKD()
229
+ params_oms = grouper.getParameters()
230
+
231
+ params_oms.setValue("mz_unit", "Da")
232
+ params_oms.setValue("nr_partitions", params.nr_partitions)
233
+ params_oms.setValue("warp:enabled", "false") # Disabled for memory efficiency
234
+ params_oms.setValue("link:rt_tol", params.rt_tol)
235
+ params_oms.setValue("link:mz_tol", params.mz_tol)
236
+ params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
237
+ params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
238
+ params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
239
+ #params_oms.setValue("link:charge_merging", "Any")
240
+
241
+ grouper.setParameters(params_oms)
242
+ grouper.group(self.features_maps, consensus_map)
243
+
244
+ return consensus_map
245
+
246
+
247
+ def _merge_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
248
+ """Chunked merge with proper cross-chunk consensus building"""
249
+
250
+ n_samples = len(self.features_maps)
251
+ if n_samples <= params.chunk_size:
252
+ self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
253
+ consensus_map = _merge_kd(self, params)
254
+ # Extract consensus features to populate consensus_df for chunked method consistency
255
+ self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
256
+ return consensus_map
257
+
258
+ # Process in chunks
259
+ chunks = []
260
+ for i in range(0, n_samples, params.chunk_size):
261
+ chunk_end = min(i + params.chunk_size, n_samples)
262
+ chunks.append((i, self.features_maps[i:chunk_end]))
263
+
264
+ self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
265
+
266
+ # Process each chunk to create chunk consensus maps
267
+ chunk_consensus_maps = []
268
+
269
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
270
+ chunk_consensus_map = oms.ConsensusMap()
271
+
272
+ # Set up file descriptions for chunk
273
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
274
+ for j, feature_map in enumerate(chunk_maps):
275
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
276
+ file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
277
+ file_description.size = feature_map.size()
278
+ file_description.unique_id = feature_map.getUniqueId()
279
+ file_descriptions[j] = file_description
280
+
281
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
282
+
283
+ # Use KD algorithm for chunk
284
+ grouper = oms.FeatureGroupingAlgorithmKD()
285
+ chunk_params = grouper.getParameters()
286
+ chunk_params.setValue("mz_unit", "Da")
287
+ chunk_params.setValue("nr_partitions", params.nr_partitions)
288
+ chunk_params.setValue("warp:enabled", "true")
289
+ chunk_params.setValue("warp:rt_tol", params.rt_tol)
290
+ chunk_params.setValue("warp:mz_tol", params.mz_tol)
291
+ chunk_params.setValue("link:rt_tol", params.rt_tol)
292
+ chunk_params.setValue("link:mz_tol", params.mz_tol)
293
+ chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
294
+ chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
295
+ chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
296
+
297
+ grouper.setParameters(chunk_params)
298
+ grouper.group(chunk_maps, chunk_consensus_map)
299
+
300
+ chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
301
+
302
+ # Merge chunk results with proper cross-chunk consensus building
303
+ _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
304
+
305
+ # Create a dummy consensus map for compatibility (since other functions expect it)
306
+ consensus_map = oms.ConsensusMap()
307
+ return consensus_map
308
+
309
+
310
+ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
311
+ """
312
+ Scalable aggregation of chunk consensus maps into final consensus_df.
313
+
314
+ This function implements cross-chunk consensus building by:
315
+ 1. Extracting feature_uids from each chunk consensus map
316
+ 2. Aggregating features close in RT/m/z across chunks
317
+ 3. Building consensus_df and consensus_mapping_df directly
318
+ """
319
+
320
+ if len(chunk_consensus_maps) == 1:
321
+ # Single chunk case - just extract using the true global min_samples.
322
+ # No need for permissive threshold because we are not discarding singletons pre-aggregation.
323
+ self._extract_consensus_features(
324
+ chunk_consensus_maps[0][1],
325
+ params.min_samples,
326
+ cached_adducts_df,
327
+ cached_valid_adducts,
328
+ )
329
+ return
330
+
331
+ # Build feature_id -> feature_uid lookup for fast access
332
+ feature_uid_map = {
333
+ row["feature_id"]: row["feature_uid"]
334
+ for row in self.features_df.iter_rows(named=True)
335
+ }
336
+
337
+ features_lookup = _optimized_feature_lookup(self, self.features_df)
338
+
339
+ # Extract all consensus features from chunks with their feature_uids
340
+ all_chunk_consensus = []
341
+ consensus_id_counter = 0
342
+
343
+ for chunk_idx, (chunk_start_idx, chunk_consensus_map) in enumerate(chunk_consensus_maps):
344
+ for consensus_feature in chunk_consensus_map:
345
+ # ACCEPT ALL consensus features (size >=1) here.
346
+ # Reason: A feature that is globally present in many samples can still
347
+ # appear only once inside a given sample chunk. Early filtering at
348
+ # size>=2 causes irreversible loss and underestimates the final
349
+ # consensus count (observed ~296 vs 950 for KD). We defer filtering
350
+ # strictly to the final global min_samples.
351
+
352
+ # Extract feature_uids from this consensus feature
353
+ feature_uids = []
354
+ feature_data_list = []
355
+ sample_uids = []
356
+
357
+ for feature_handle in consensus_feature.getFeatureList():
358
+ fuid = str(feature_handle.getUniqueId())
359
+ if fuid not in feature_uid_map:
360
+ continue
361
+
362
+ feature_uid = feature_uid_map[fuid]
363
+ feature_data = features_lookup.get(feature_uid)
364
+ if feature_data:
365
+ feature_uids.append(feature_uid)
366
+ feature_data_list.append(feature_data)
367
+ sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
368
+
369
+ if not feature_data_list:
370
+ # No retrievable feature metadata (possible stale map reference) -> skip
371
+ continue
+ # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
372
+ rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
373
+ mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
374
+ if rt_vals_local:
375
+ rt_min_local = min(rt_vals_local)
376
+ rt_max_local = max(rt_vals_local)
377
+ else:
378
+ rt_min_local = rt_max_local = consensus_feature.getRT()
379
+ if mz_vals_local:
380
+ mz_min_local = min(mz_vals_local)
381
+ mz_max_local = max(mz_vals_local)
382
+ else:
383
+ mz_min_local = mz_max_local = consensus_feature.getMZ()
384
+
385
+ # Store chunk consensus with feature tracking
386
+ chunk_consensus_data = {
387
+ 'consensus_id': consensus_id_counter,
388
+ 'chunk_idx': chunk_idx,
389
+ 'chunk_start_idx': chunk_start_idx,
390
+ 'mz': consensus_feature.getMZ(),
391
+ 'rt': consensus_feature.getRT(),
392
+ 'mz_min': mz_min_local,
393
+ 'mz_max': mz_max_local,
394
+ 'rt_min': rt_min_local,
395
+ 'rt_max': rt_max_local,
396
+ 'intensity': consensus_feature.getIntensity(),
397
+ 'quality': consensus_feature.getQuality(),
398
+ 'feature_uids': feature_uids,
399
+ 'feature_data_list': feature_data_list,
400
+ 'sample_uids': sample_uids,
401
+ 'sample_count': len(feature_data_list)
402
+ }
403
+
404
+ all_chunk_consensus.append(chunk_consensus_data)
405
+ consensus_id_counter += 1
406
+
407
+ if not all_chunk_consensus:
408
+ # No valid consensus features found
409
+ self.consensus_df = pl.DataFrame()
410
+ self.consensus_mapping_df = pl.DataFrame()
411
+ return
412
+
413
+ # Perform cross-chunk clustering using optimized spatial indexing
414
+ def _cluster_chunk_consensus(chunk_consensus_list: list, rt_tol: float, mz_tol: float) -> list:
415
+ """Cluster chunk consensus features using interval overlap (no over-relaxation).
416
+
417
+ A union is formed if either centroids are within tolerance OR their RT / m/z
418
+ intervals (expanded by tolerance) overlap, and they originate from different chunks.
419
+ """
420
+ if not chunk_consensus_list:
421
+ return []
422
+
423
+ n_features = len(chunk_consensus_list)
424
+
425
+ # Spatial bins using strict tolerances (improves candidate reduction without recall loss)
426
+ rt_bin_size = rt_tol if rt_tol > 0 else 1.0
427
+ mz_bin_size = mz_tol if mz_tol > 0 else 0.01
428
+ features_by_bin = defaultdict(list)
429
+
430
+ for i, cf in enumerate(chunk_consensus_list):
431
+ rt_bin = int(cf['rt'] / rt_bin_size)
432
+ mz_bin = int(cf['mz'] / mz_bin_size)
433
+ features_by_bin[(rt_bin, mz_bin)].append(i)
434
+
435
+ class UF:
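+ # Minimal disjoint-set (union-find) helper: find() uses path compression, union() uses union by rank.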
436
+ def __init__(self, n):
437
+ self.p = list(range(n))
438
+ self.r = [0]*n
439
+ def find(self, x):
440
+ if self.p[x] != x:
441
+ self.p[x] = self.find(self.p[x])
442
+ return self.p[x]
443
+ def union(self, a,b):
444
+ pa, pb = self.find(a), self.find(b)
445
+ if pa == pb:
446
+ return
447
+ if self.r[pa] < self.r[pb]:
448
+ pa, pb = pb, pa
449
+ self.p[pb] = pa
450
+ if self.r[pa] == self.r[pb]:
451
+ self.r[pa] += 1
452
+
453
+ uf = UF(n_features)
454
+ checked = set()
455
+ for (rtb, mzb), idxs in features_by_bin.items():
456
+ for dr in (-1,0,1):
457
+ for dm in (-1,0,1):
458
+ neigh = (rtb+dr, mzb+dm)
459
+ if neigh not in features_by_bin:
460
+ continue
461
+ for i in idxs:
462
+ for j in features_by_bin[neigh]:
463
+ if i >= j:
464
+ continue
465
+ pair = (i,j)
466
+ if pair in checked:
467
+ continue
468
+ checked.add(pair)
469
+ a = chunk_consensus_list[i]
470
+ b = chunk_consensus_list[j]
471
+ if a['chunk_idx'] == b['chunk_idx']:
472
+ continue
473
+ # Centroid checks
474
+ centroid_close = (abs(a['rt']-b['rt']) <= rt_tol and abs(a['mz']-b['mz']) <= mz_tol)
475
+ # Interval overlap checks (expanded by tolerance)
476
+ rt_overlap = (a['rt_min'] - rt_tol) <= (b['rt_max'] + rt_tol) and (b['rt_min'] - rt_tol) <= (a['rt_max'] + rt_tol)
477
+ mz_overlap = (a['mz_min'] - mz_tol) <= (b['mz_max'] + mz_tol) and (b['mz_min'] - mz_tol) <= (a['mz_max'] + mz_tol)
478
+ if centroid_close or (rt_overlap and mz_overlap):
479
+ uf.union(i,j)
480
+
481
+ groups_by_root = defaultdict(list)
482
+ for i in range(n_features):
483
+ groups_by_root[uf.find(i)].append(chunk_consensus_list[i])
484
+ return list(groups_by_root.values())
485
+ # (Obsolete relaxed + centroid stitching code removed.)
486
+
487
+ # --- Stage 1: initial cross-chunk clustering of chunk consensus features ---
488
+ initial_groups = _cluster_chunk_consensus(all_chunk_consensus, params.rt_tol, params.mz_tol)
489
+
490
+ # --- Stage 2: centroid refinement (lightweight second pass) ---
491
+ def _refine_groups(groups: list, rt_tol: float, mz_tol: float) -> list:
492
+ """Refine groups by clustering group centroids (single-link) under same tolerances.
493
+
494
+ This reconciles borderline splits left after interval-overlap clustering without
495
+ re-introducing broad over-merging. Works on group centroids only (low cost).
496
+ """
497
+ if len(groups) <= 1:
498
+ return groups
499
+ # Build centroid list
500
+ centroids = [] # (idx, rt, mz)
501
+ for gi, g in enumerate(groups):
502
+ if not g:
503
+ continue
504
+ rt_vals = [cf['rt'] for cf in g]
505
+ mz_vals = [cf['mz'] for cf in g]
506
+ if not rt_vals or not mz_vals:
507
+ continue
508
+ centroids.append((gi, float(np.mean(rt_vals)), float(np.mean(mz_vals))))
509
+ if len(centroids) <= 1:
510
+ return groups
511
+
512
+ # Spatial binning for centroid clustering
513
+ rt_bin = rt_tol if rt_tol > 0 else 1.0
514
+ mz_bin = mz_tol if mz_tol > 0 else 0.01
515
+ bins = defaultdict(list)
516
+ for idx, rt_c, mz_c in centroids:
517
+ bins[(int(rt_c/rt_bin), int(mz_c/mz_bin))].append((idx, rt_c, mz_c))
518
+
519
+ # Union-Find over group indices
520
+ parent = list(range(len(groups)))
521
+ rank = [0]*len(groups)
522
+ def find(x):
523
+ if parent[x] != x:
524
+ parent[x] = find(parent[x])
525
+ return parent[x]
526
+ def union(a,b):
527
+ pa, pb = find(a), find(b)
528
+ if pa == pb:
529
+ return
530
+ if rank[pa] < rank[pb]:
531
+ pa, pb = pb, pa
532
+ parent[pb] = pa
533
+ if rank[pa] == rank[pb]:
534
+ rank[pa] += 1
535
+
536
+ checked = set()
537
+ for (rb, mb), items in bins.items():
538
+ for dr in (-1,0,1):
539
+ for dm in (-1,0,1):
540
+ neigh_key = (rb+dr, mb+dm)
541
+ if neigh_key not in bins:
542
+ continue
543
+ for (gi, rt_i, mz_i) in items:
544
+ for (gj, rt_j, mz_j) in bins[neigh_key]:
545
+ if gi >= gj:
546
+ continue
547
+ pair = (gi, gj)
548
+ if pair in checked:
549
+ continue
550
+ checked.add(pair)
551
+ if abs(rt_i-rt_j) <= rt_tol and abs(mz_i-mz_j) <= mz_tol:
552
+ union(gi, gj)
553
+
554
+ merged = defaultdict(list)
555
+ for gi, g in enumerate(groups):
556
+ merged[find(gi)].extend(g)
557
+ return list(merged.values())
558
+
559
+ refined_groups = _refine_groups(initial_groups, params.rt_tol, params.mz_tol)
560
+
561
+ # --- Stage 3: build final consensus feature metadata and mapping ---
562
+ consensus_metadata = []
563
+ consensus_mapping_list = []
564
+ consensus_uid_counter = 0
565
+
566
+ for group in refined_groups:
567
+ if not group:
568
+ continue
569
+
570
+ # Aggregate underlying feature data (deduplicated by feature_uid)
571
+ feature_data_acc = {}
572
+ sample_uids_acc = set()
573
+ rt_values_chunk = [] # use chunk-level centroids for statistic helper
574
+ mz_values_chunk = []
575
+ intensity_values_chunk = []
576
+ quality_values_chunk = []
577
+
578
+ for cf in group:
579
+ rt_values_chunk.append(cf['rt'])
580
+ mz_values_chunk.append(cf['mz'])
581
+ intensity_values_chunk.append(cf.get('intensity', 0.0) or 0.0)
582
+ quality_values_chunk.append(cf.get('quality', 1.0) or 1.0)
583
+
584
+ for fd, samp_uid in zip(cf['feature_data_list'], cf['sample_uids']):
585
+ fid = fd.get('feature_uid') or fd.get('uid') or fd.get('feature_id')
586
+ # feature_uid expected in fd under 'feature_uid'; fallback attempts just in case
587
+ if fid is None:
588
+ continue
589
+ if fid not in feature_data_acc:
590
+ feature_data_acc[fid] = fd
591
+ sample_uids_acc.add(samp_uid)
592
+
593
+ if not feature_data_acc:
594
+ continue
595
+
596
+ number_samples = len(sample_uids_acc)
597
+
598
+ # NOTE: Don't filter by min_samples here - let _finalize_merge handle it
599
+ # This allows proper cross-chunk consensus building before final filtering
600
+
601
+ metadata = _calculate_consensus_statistics(
602
+ self,
603
+ consensus_uid_counter,
604
+ list(feature_data_acc.values()),
605
+ rt_values_chunk,
606
+ mz_values_chunk,
607
+ intensity_values_chunk,
608
+ quality_values_chunk,
609
+ number_features=len(feature_data_acc),
610
+ number_samples=number_samples,
611
+ cached_adducts_df=cached_adducts_df,
612
+ cached_valid_adducts=cached_valid_adducts,
613
+ )
614
+ consensus_metadata.append(metadata)
615
+
616
+ # Build mapping rows (deduplicated)
617
+ for fid, fd in feature_data_acc.items():
618
+ samp_uid = fd.get('sample_uid') or fd.get('sample_id') or fd.get('sample')
619
+ # If absent we attempt to derive from original group sample_uids pairing
620
+ # but most feature_data rows should include sample_uid already.
621
+ if samp_uid is None:
622
+ # fallback: search for cf containing this fid
623
+ for cf in group:
624
+ for fd2, samp2 in zip(cf['feature_data_list'], cf['sample_uids']):
625
+ f2id = fd2.get('feature_uid') or fd2.get('uid') or fd2.get('feature_id')
626
+ if f2id == fid:
627
+ samp_uid = samp2
628
+ break
629
+ if samp_uid is not None:
630
+ break
631
+ if samp_uid is None:
632
+ continue
633
+ consensus_mapping_list.append({
634
+ 'consensus_uid': consensus_uid_counter,
635
+ 'sample_uid': samp_uid,
636
+ 'feature_uid': fid,
637
+ })
638
+
639
+ consensus_uid_counter += 1
640
+
641
+ # Assign DataFrames
642
+ self.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
643
+ self.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
644
+
645
+ # Ensure mapping only contains features from retained consensus_df
646
+ if len(self.consensus_df) > 0:
647
+ valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
648
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
649
+ pl.col('consensus_uid').is_in(list(valid_consensus_ids))
650
+ )
651
+ else:
652
+ self.consensus_mapping_df = pl.DataFrame()
653
+
654
+ # Attach empty consensus_map placeholder for downstream compatibility
655
+ self.consensus_map = oms.ConsensusMap()
656
+ return
657
+
658
+
659
+ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
660
+ rt_values: list, mz_values: list,
661
+ intensity_values: list, quality_values: list,
662
+ number_features: int = None, number_samples: int = None,
663
+ cached_adducts_df=None, cached_valid_adducts=None) -> dict:
664
+ """
665
+ Calculate comprehensive statistics for a consensus feature from aggregated feature data.
666
+
667
+ Args:
668
+ consensus_uid: Unique ID for this consensus feature
669
+ feature_data_list: List of individual feature dictionaries
670
+ rt_values: RT values from chunk consensus features
671
+ mz_values: m/z values from chunk consensus features
672
+ intensity_values: Intensity values from chunk consensus features
673
+ quality_values: Quality values from chunk consensus features
674
+
675
+ Returns:
676
+ Dictionary with consensus feature metadata
677
+ """
678
+ if not feature_data_list:
679
+ return {}
680
+
681
+ # Convert feature data to numpy arrays for vectorized computation
682
+ rt_feat_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
683
+ mz_feat_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
684
+ rt_start_values = np.array([fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None])
685
+ rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
686
+ rt_delta_values = np.array([fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None])
687
+ mz_start_values = np.array([fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None])
688
+ mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
689
+ inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
690
+ coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
691
+ prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
692
+ prominence_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
693
+ height_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
694
+ iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
695
+ charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
696
+
697
+ # Process adducts with cached validation
698
+ all_adducts = []
699
+ valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
700
+ valid_adducts.add("?") # Always allow '?' adducts
701
+
702
+ for fd in feature_data_list:
703
+ adduct = fd.get("adduct")
704
+ if adduct is not None:
705
+ # Only include adducts that are valid (from cached study adducts or contain '?')
706
+ if adduct in valid_adducts or "?" in adduct:
707
+ all_adducts.append(adduct)
708
+
709
+ # Calculate adduct consensus
710
+ adduct_values = []
711
+ adduct_top = None
712
+ adduct_charge_top = None
713
+ adduct_mass_neutral_top = None
714
+ adduct_mass_shift_top = None
715
+
716
+ if all_adducts:
717
+ adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
718
+ total_count = sum(adduct_counts.values())
719
+ for adduct, count in adduct_counts.items():
720
+ percentage = (count / total_count) * 100 if total_count > 0 else 0
721
+ adduct_values.append([str(adduct), int(count), float(round(percentage, 2))])
722
+
723
+ adduct_values.sort(key=lambda x: x[1], reverse=True)
724
+
725
+ if adduct_values:
726
+ adduct_top = adduct_values[0][0]
727
+ # Try to get charge and mass shift from cached study adducts
728
+ adduct_found = False
729
+ if cached_adducts_df is not None and not cached_adducts_df.is_empty():
730
+ matching_adduct = cached_adducts_df.filter(
731
+ pl.col("name") == adduct_top,
732
+ )
733
+ if not matching_adduct.is_empty():
734
+ adduct_row = matching_adduct.row(0, named=True)
735
+ adduct_charge_top = adduct_row["charge"]
736
+ adduct_mass_shift_top = adduct_row["mass_shift"]
737
+ adduct_found = True
738
+
739
+ if not adduct_found:
740
+ # Set default charge and mass shift for top adduct
741
+ adduct_charge_top = 1
742
+ adduct_mass_shift_top = 1.007825
743
+ else:
744
+ # Default adduct based on study polarity
745
+ study_polarity = getattr(study_obj, "polarity", "positive")
746
+ if study_polarity in ["negative", "neg"]:
747
+ adduct_top = "[M-?]1-"
748
+ adduct_charge_top = -1
749
+ adduct_mass_shift_top = -1.007825
750
+ else:
751
+ adduct_top = "[M+?]1+"
752
+ adduct_charge_top = 1
753
+ adduct_mass_shift_top = 1.007825
754
+
755
+ adduct_values = [[adduct_top, 1, 100.0]]
756
+
757
+ # Calculate neutral mass
758
+ consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
759
+ if adduct_charge_top and adduct_mass_shift_top is not None:
760
+ adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
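+ # e.g. (sketch) a singly charged [M+H]+ top adduct (mass shift 1.007825) at consensus m/z 200.1000 gives a neutral mass of 200.1000 * 1 - 1.007825 = 199.092175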
761
+
762
+ # Calculate MS2 count
763
+ ms2_count = 0
764
+ for fd in feature_data_list:
765
+ ms2_scans = fd.get("ms2_scans")
766
+ if ms2_scans is not None:
767
+ ms2_count += len(ms2_scans)
768
+
769
+ # Build consensus metadata
770
+ return {
771
+ "consensus_uid": int(consensus_uid),
772
+ "consensus_id": str(consensus_uid), # Use simple string ID
773
+ "quality": round(float(np.mean(quality_values)), 3) if len(quality_values) > 0 else 1.0,
774
+ "number_samples": number_samples if number_samples is not None else len(feature_data_list),
775
+ "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
776
+ "mz": consensus_mz,
777
+ "rt_min": round(float(np.min(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
778
+ "rt_max": round(float(np.max(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
779
+ "rt_mean": round(float(np.mean(rt_feat_values)), 3) if len(rt_feat_values) > 0 else 0.0,
780
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
781
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
782
+ "rt_delta_mean": round(float(np.mean(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
783
+ "mz_min": round(float(np.min(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
784
+ "mz_max": round(float(np.max(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
785
+ "mz_mean": round(float(np.mean(mz_feat_values)), 4) if len(mz_feat_values) > 0 else 0.0,
786
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
787
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
788
+ "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
789
+ "bl": -1.0,
790
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
791
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
792
+ "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
793
+ "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
794
+ "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
795
+ "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
796
+ "number_ms2": int(ms2_count),
797
+ "adducts": adduct_values,
798
+ "adduct_top": adduct_top,
799
+ "adduct_charge_top": adduct_charge_top,
800
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6) if adduct_mass_neutral_top is not None else None,
801
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
802
+ "id_top_name": None,
803
+ "id_top_class": None,
804
+ "id_top_adduct": None,
805
+ "id_top_score": None,
806
+ }
807
+
808
+
809
+ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) -> list:
810
+ """
811
+ Cluster consensus features from different chunks based on RT and m/z similarity.
812
+
813
+ Args:
814
+ features: List of feature dictionaries with 'mz', 'rt', 'id' keys
815
+ rt_tol: RT tolerance in seconds
816
+ mz_tol: m/z tolerance in Da
817
+
818
+ Returns:
819
+ List of groups, where each group is a list of feature dictionaries
820
+ """
821
+ if not features:
822
+ return []
823
+
824
+ # Use Union-Find for efficient clustering
825
+ class UnionFind:
826
+ def __init__(self, n):
827
+ self.parent = list(range(n))
828
+ self.rank = [0] * n
829
+
830
+ def find(self, x):
831
+ if self.parent[x] != x:
832
+ self.parent[x] = self.find(self.parent[x])
833
+ return self.parent[x]
834
+
835
+ def union(self, x, y):
836
+ px, py = self.find(x), self.find(y)
837
+ if px == py:
838
+ return
839
+ if self.rank[px] < self.rank[py]:
840
+ px, py = py, px
841
+ self.parent[py] = px
842
+ if self.rank[px] == self.rank[py]:
843
+ self.rank[px] += 1
844
+
845
+ n_features = len(features)
846
+ uf = UnionFind(n_features)
847
+
848
+ # Build distance matrix and cluster features within tolerance
849
+ for i in range(n_features):
850
+ for j in range(i + 1, n_features):
851
+ feat_i = features[i]
852
+ feat_j = features[j]
853
+
854
+ # Skip if features are from the same chunk (they're already processed)
855
+ if feat_i['chunk_idx'] == feat_j['chunk_idx']:
856
+ continue
857
+
858
+ mz_diff = abs(feat_i['mz'] - feat_j['mz'])
859
+ rt_diff = abs(feat_i['rt'] - feat_j['rt'])
860
+
861
+ # Cluster if within tolerance
862
+ if mz_diff <= mz_tol and rt_diff <= rt_tol:
863
+ uf.union(i, j)
864
+
865
+ # Extract groups
866
+ groups_by_root = {}
867
+ for i in range(n_features):
868
+ root = uf.find(i)
869
+ if root not in groups_by_root:
870
+ groups_by_root[root] = []
871
+ groups_by_root[root].append(features[i])
872
+
873
+ return list(groups_by_root.values())
874
+
875
+
876
+ # Note: Restored proper chunked implementation with cross-chunk consensus clustering
877
+
878
+
879
+ def _reset_consensus_data(self):
880
+ """Reset consensus-related DataFrames at the start of merge."""
881
+ self.consensus_df = pl.DataFrame()
882
+ self.consensus_ms2 = pl.DataFrame()
883
+ self.consensus_mapping_df = pl.DataFrame()
884
+
885
+
886
+ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
887
+ """Extract consensus features and build metadata."""
888
+ # create a dict to map feature_id (OpenMS unique id) to feature_uid using self.features_df
889
+ feature_uid_map = {
890
+ row["feature_id"]: row["feature_uid"]
891
+ for row in self.features_df.iter_rows(named=True)
892
+ }
893
+ imax = consensus_map.size()
894
+
895
+ self.logger.debug(f"Found {imax} feature groups by clustering.")
896
+
897
+ # Pre-build fast lookup tables for features_df data using optimized approach
898
+ features_lookup = _optimized_feature_lookup(self, self.features_df)
899
+
900
+ # create a list to store the consensus mapping
901
+ consensus_mapping = []
902
+ metadata_list = []
903
+
904
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
905
+
906
+ for i, feature in enumerate(
907
+ tqdm(
908
+ consensus_map,
909
+ total=imax,
910
+ disable=tqdm_disable,
911
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
912
+ ),
913
+ ):
914
+ # get all features in the feature map with the same unique id as the consensus feature
915
+ features_list = feature.getFeatureList()
916
+ uids = []
917
+ feature_data_list = []
918
+
919
+ for _j, f in enumerate(features_list):
920
+ fuid = str(f.getUniqueId())
921
+ if fuid not in feature_uid_map:
922
+ # this is a feature that was removed but is still in the feature maps
923
+ continue
924
+ fuid = feature_uid_map[fuid]
925
+ consensus_mapping.append(
926
+ {
927
+ "consensus_uid": i,
928
+ "sample_uid": f.getMapIndex() + 1,
929
+ "feature_uid": fuid,
930
+ },
931
+ )
932
+ uids.append(fuid)
933
+
934
+ # Get feature data from lookup instead of DataFrame filtering
935
+ feature_data = features_lookup.get(fuid)
936
+ if feature_data:
937
+ feature_data_list.append(feature_data)
938
+
939
+ if not feature_data_list:
940
+ # Skip this consensus feature if no valid features found
941
+ continue
942
+
943
+ # Compute statistics using vectorized operations on collected data
944
+ # Convert to numpy arrays for faster computation
945
+ rt_values = np.array(
946
+ [fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None],
947
+ )
948
+ mz_values = np.array(
949
+ [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
950
+ )
951
+ rt_start_values = np.array(
952
+ [
953
+ fd.get("rt_start", 0)
954
+ for fd in feature_data_list
955
+ if fd.get("rt_start") is not None
956
+ ],
957
+ )
958
+ rt_end_values = np.array(
959
+ [
960
+ fd.get("rt_end", 0)
961
+ for fd in feature_data_list
962
+ if fd.get("rt_end") is not None
963
+ ],
964
+ )
965
+ rt_delta_values = np.array(
966
+ [
967
+ fd.get("rt_delta", 0)
968
+ for fd in feature_data_list
969
+ if fd.get("rt_delta") is not None
970
+ ],
971
+ )
972
+ mz_start_values = np.array(
973
+ [
974
+ fd.get("mz_start", 0)
975
+ for fd in feature_data_list
976
+ if fd.get("mz_start") is not None
977
+ ],
978
+ )
979
+ mz_end_values = np.array(
980
+ [
981
+ fd.get("mz_end", 0)
982
+ for fd in feature_data_list
983
+ if fd.get("mz_end") is not None
984
+ ],
985
+ )
986
+ inty_values = np.array(
987
+ [
988
+ fd.get("inty", 0)
989
+ for fd in feature_data_list
990
+ if fd.get("inty") is not None
991
+ ],
992
+ )
993
+ coherence_values = np.array(
994
+ [
995
+ fd.get("chrom_coherence", 0)
996
+ for fd in feature_data_list
997
+ if fd.get("chrom_coherence") is not None
998
+ ],
999
+ )
1000
+ prominence_values = np.array(
1001
+ [
1002
+ fd.get("chrom_prominence", 0)
1003
+ for fd in feature_data_list
1004
+ if fd.get("chrom_prominence") is not None
1005
+ ],
1006
+ )
1007
+ prominence_scaled_values = np.array(
1008
+ [
1009
+ fd.get("chrom_prominence_scaled", 0)
1010
+ for fd in feature_data_list
1011
+ if fd.get("chrom_prominence_scaled") is not None
1012
+ ],
1013
+ )
1014
+ height_scaled_values = np.array(
1015
+ [
1016
+ fd.get("chrom_height_scaled", 0)
1017
+ for fd in feature_data_list
1018
+ if fd.get("chrom_height_scaled") is not None
1019
+ ],
1020
+ )
1021
+ iso_values = np.array(
1022
+ [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
1023
+ )
1024
+ charge_values = np.array(
1025
+ [
1026
+ fd.get("charge", 0)
1027
+ for fd in feature_data_list
1028
+ if fd.get("charge") is not None
1029
+ ],
1030
+ )
1031
+
1032
+ # adduct_values
1033
+ # Collect all adducts from feature_data_list to create consensus adduct information
1034
+ # Only consider adducts that are in study._get_adducts() plus items with '?'
1035
+ all_adducts = []
1036
+ adduct_masses = {}
1037
+
1038
+ # Get valid adducts from cached result (avoid repeated _get_adducts() calls)
1039
+ valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
1040
+ valid_adducts.add("?") # Always allow '?' adducts
1041
+
1042
+ for fd in feature_data_list:
1043
+ # Get individual adduct and mass from each feature data (fd)
1044
+ adduct = fd.get("adduct")
1045
+ adduct_mass = fd.get("adduct_mass")
1046
+
1047
+ if adduct is not None:
1048
+ # Only include adducts that are valid (from study._get_adducts() or contain '?')
1049
+ if adduct in valid_adducts or "?" in adduct:
1050
+ all_adducts.append(adduct)
1051
+ if adduct_mass is not None:
1052
+ adduct_masses[adduct] = adduct_mass
1053
+
1054
+ # Calculate adduct_values for the consensus feature
1055
+ adduct_values = []
1056
+ if all_adducts:
1057
+ adduct_counts = {
1058
+ adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
1059
+ }
1060
+ total_count = sum(adduct_counts.values())
1061
+ for adduct, count in adduct_counts.items():
1062
+ percentage = (count / total_count) * 100 if total_count > 0 else 0
1063
+ # Store as list with [name, num, %] format for the adducts column
1064
+ adduct_values.append(
1065
+ [
1066
+ str(adduct),
1067
+ int(count),
1068
+ float(round(percentage, 2)),
1069
+ ],
1070
+ )
1071
+
1072
+ # Sort adduct_values by count in descending order
1073
+ adduct_values.sort(key=lambda x: x[1], reverse=True) # Sort by count (index 1)
1074
+ # Store adduct_values for use in metadata
1075
+ consensus_adduct_values = adduct_values
1076
+
1077
+ # Extract top adduct information for new columns
1078
+ adduct_top = None
1079
+ adduct_charge_top = None
1080
+ adduct_mass_neutral_top = None
1081
+ adduct_mass_shift_top = None
1082
+
1083
+ if consensus_adduct_values:
1084
+ top_adduct_name = consensus_adduct_values[0][0] # Get top adduct name
1085
+ adduct_top = top_adduct_name
1086
+
1087
+ # Parse adduct information to extract charge and mass shift
1088
+ # Handle "?" as "H" and parse common adduct formats
1089
+ if top_adduct_name == "?" or top_adduct_name == "[M+?]+":
1090
+ adduct_charge_top = 1
1091
+ adduct_mass_shift_top = 1.007825 # H mass
1092
+ elif top_adduct_name == "[M+?]-":
1093
+ adduct_charge_top = -1
1094
+ adduct_mass_shift_top = -1.007825 # -H mass
1095
+ else:
1096
+ # Try to get charge and mass shift from cached study adducts
1097
+ adduct_found = False
1098
+ if cached_adducts_df is not None and not cached_adducts_df.is_empty():
1099
+ # Look for exact match in study adducts
1100
+ matching_adduct = cached_adducts_df.filter(
1101
+ pl.col("name") == top_adduct_name,
1102
+ )
1103
+ if not matching_adduct.is_empty():
1104
+ adduct_row = matching_adduct.row(0, named=True)
1105
+ adduct_charge_top = adduct_row["charge"]
1106
+ adduct_mass_shift_top = adduct_row["mass_shift"]
1107
+ adduct_found = True
1108
+
1109
+ if not adduct_found:
1110
+ # Fallback to regex parsing
1111
+ import re
1112
+
1113
+ # Pattern for adducts like [M+H]+, [M-H]-, [M+Na]+, etc.
1114
+ pattern = r"\[M([+\-])([A-Za-z0-9]+)\]([0-9]*)([+\-])"
1115
+ match = re.match(pattern, top_adduct_name)
1116
+
1117
+ if match:
1118
+ sign = match.group(1)
1119
+ element = match.group(2)
1120
+ multiplier_str = match.group(3)
1121
+ charge_sign = match.group(4)
1122
+
1123
+ multiplier = int(multiplier_str) if multiplier_str else 1
1124
+ charge = multiplier if charge_sign == "+" else -multiplier
1125
+ adduct_charge_top = charge
1126
+
1127
+ # Calculate mass shift based on element
1128
+ element_masses = {
1129
+ "H": 1.007825,
1130
+ "Na": 22.989769,
1131
+ "K": 38.963708,
1132
+ "NH4": 18.033823,
1133
+ "Li": 7.016930,
1134
+ "Cl": 34.969401,
1135
+ "Br": 78.918885,
1136
+ "HCOO": 44.998201,
1137
+ "CH3COO": 59.013851,
1138
+ "H2O": 18.010565,
1139
+ }
1140
+
1141
+ base_mass = element_masses.get(
1142
+ element,
1143
+ 1.007825,
1144
+ ) # Default to H if unknown
1145
+ mass_shift = (
1146
+ base_mass * multiplier
1147
+ if sign == "+"
1148
+ else -base_mass * multiplier
1149
+ )
1150
+ adduct_mass_shift_top = mass_shift
1151
+ else:
1152
+ # Default fallback
1153
+ adduct_charge_top = 1
1154
+ adduct_mass_shift_top = 1.007825
1155
+ else:
1156
+ # No valid adducts found - assign default based on study polarity
1157
+ study_polarity = getattr(self, "polarity", "positive")
1158
+ if study_polarity in ["negative", "neg"]:
1159
+ # Negative mode default
1160
+ adduct_top = "[M-?]1-"
1161
+ adduct_charge_top = -1
1162
+ adduct_mass_shift_top = -1.007825 # -H mass (loss of proton)
1163
+ else:
1164
+ # Positive mode default (includes 'positive', 'pos', or any other value)
1165
+ adduct_top = "[M+?]1+"
1166
+ adduct_charge_top = 1
1167
+ adduct_mass_shift_top = 1.007825 # H mass (gain of proton)
1168
+
1169
+ # Create a single default adduct entry in the adducts list for consistency
1170
+ consensus_adduct_values = [[adduct_top, 1, 100.0]]
1171
+
1172
+ # Calculate neutral mass from consensus mz (for both cases)
1173
+ consensus_mz = (
1174
+ round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
1175
+ )
1176
+ if adduct_charge_top and adduct_mass_shift_top is not None:
1177
+ adduct_mass_neutral_top = (
1178
+ consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
1179
+ )
1180
+
1181
+ # Calculate number of MS2 spectra
1182
+ ms2_count = 0
1183
+ for fd in feature_data_list:
1184
+ ms2_scans = fd.get("ms2_scans")
1185
+ if ms2_scans is not None:
1186
+ ms2_count += len(ms2_scans)
1187
+
1188
+ metadata_list.append(
1189
+ {
1190
+ "consensus_uid": int(i), # "consensus_id": i,
1191
+ "consensus_id": str(feature.getUniqueId()),
1192
+ "quality": round(float(feature.getQuality()), 3),
1193
+ "number_samples": len(feature_data_list),
1194
+ # "number_ext": int(len(features_list)),
1195
+ "rt": round(float(np.mean(rt_values)), 4)
1196
+ if len(rt_values) > 0
1197
+ else 0.0,
1198
+ "mz": round(float(np.mean(mz_values)), 4)
1199
+ if len(mz_values) > 0
1200
+ else 0.0,
1201
+ "rt_min": round(float(np.min(rt_values)), 3)
1202
+ if len(rt_values) > 0
1203
+ else 0.0,
1204
+ "rt_max": round(float(np.max(rt_values)), 3)
1205
+ if len(rt_values) > 0
1206
+ else 0.0,
1207
+ "rt_mean": round(float(np.mean(rt_values)), 3)
1208
+ if len(rt_values) > 0
1209
+ else 0.0,
1210
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
1211
+ if len(rt_start_values) > 0
1212
+ else 0.0,
1213
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
1214
+ if len(rt_end_values) > 0
1215
+ else 0.0,
1216
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
1217
+ if len(rt_delta_values) > 0
1218
+ else 0.0,
1219
+ "mz_min": round(float(np.min(mz_values)), 4)
1220
+ if len(mz_values) > 0
1221
+ else 0.0,
1222
+ "mz_max": round(float(np.max(mz_values)), 4)
1223
+ if len(mz_values) > 0
1224
+ else 0.0,
1225
+ "mz_mean": round(float(np.mean(mz_values)), 4)
1226
+ if len(mz_values) > 0
1227
+ else 0.0,
1228
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
1229
+ if len(mz_start_values) > 0
1230
+ else 0.0,
1231
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
1232
+ if len(mz_end_values) > 0
1233
+ else 0.0,
1234
+ "inty_mean": round(float(np.mean(inty_values)), 0)
1235
+ if len(inty_values) > 0
1236
+ else 0.0,
1237
+ "bl": -1.0,
1238
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
1239
+ if len(coherence_values) > 0
1240
+ else 0.0,
1241
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
1242
+ if len(prominence_values) > 0
1243
+ else 0.0,
1244
+ "chrom_prominence_scaled_mean": round(
1245
+ float(np.mean(prominence_scaled_values)),
1246
+ 3,
1247
+ )
1248
+ if len(prominence_scaled_values) > 0
1249
+ else 0.0,
1250
+ "chrom_height_scaled_mean": round(
1251
+ float(np.mean(height_scaled_values)),
1252
+ 3,
1253
+ )
1254
+ if len(height_scaled_values) > 0
1255
+ else 0.0,
1256
+ "iso_mean": round(float(np.mean(iso_values)), 2)
1257
+ if len(iso_values) > 0
1258
+ else 0.0,
1259
+ "charge_mean": round(float(np.mean(charge_values)), 2)
1260
+ if len(charge_values) > 0
1261
+ else 0.0,
1262
+ "number_ms2": int(ms2_count),
1263
+ "adducts": consensus_adduct_values
1264
+ if consensus_adduct_values
1265
+ else [], # Ensure it's always a list
1266
+ # New columns for top-ranked adduct information
1267
+ "adduct_top": adduct_top,
1268
+ "adduct_charge_top": adduct_charge_top,
1269
+ "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
1270
+ if adduct_mass_neutral_top is not None
1271
+ else None,
1272
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
1273
+ if adduct_mass_shift_top is not None
1274
+ else None,
1275
+ # New columns for top-scoring identification results
1276
+ "id_top_name": None,
1277
+ "id_top_class": None,
1278
+ "id_top_adduct": None,
1279
+ "id_top_score": None,
1280
+ },
1281
+ )
1282
+
1283
+ consensus_mapping_df = pl.DataFrame(consensus_mapping)
1284
+ # remove all rows in consensus_mapping_df whose feature_uid is not in self.features_df['feature_uid']
1285
+ l1 = len(consensus_mapping_df)
1286
+ consensus_mapping_df = consensus_mapping_df.filter(
1287
+ pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
1288
+ )
1289
+ self.logger.debug(
1290
+ f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
1291
+ )
1292
+ self.consensus_mapping_df = consensus_mapping_df
1293
+ self.consensus_df = pl.DataFrame(metadata_list, strict=False)
1294
+
1295
+ if min_samples is None:
1296
+ min_samples = 1
1297
+ if min_samples < 1:
1298
+ min_samples = int(min_samples * len(self.samples_df))
1299
+
1300
+ # Validate that min_samples doesn't exceed the number of samples
1301
+ if min_samples > len(self.samples_df):
1302
+ self.logger.warning(
1303
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
1304
+ f"Setting min_samples to {len(self.samples_df)}.",
1305
+ )
1306
+ min_samples = len(self.samples_df)
1307
+
1308
+ # filter out consensus features with less than min_samples features
1309
+ l1 = len(self.consensus_df)
1310
+ self.consensus_df = self.consensus_df.filter(
1311
+ pl.col("number_samples") >= min_samples,
1312
+ )
1313
+ self.logger.debug(
1314
+ f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
1315
+ )
1316
+ # filter out consensus mapping with less than min_samples features
1317
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
1318
+ pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
1319
+ )
1320
+
1321
+ self.consensus_map = consensus_map
1322
+
1323
+
1324
+ def _perform_adduct_grouping(self, rt_tol, mz_tol):
1325
+ """Perform adduct grouping on consensus features."""
1326
+ import polars as pl
1327
+
1328
+ # Add adduct grouping and adduct_of assignment
1329
+ if len(self.consensus_df) > 0:
1330
+ # Get relevant columns for grouping
1331
+ consensus_data = []
1332
+ for row in self.consensus_df.iter_rows(named=True):
1333
+ consensus_data.append(
1334
+ {
1335
+ "consensus_uid": row["consensus_uid"],
1336
+ "rt": row["rt"],
1337
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
1338
+ "adduct_top": row.get("adduct_top"),
1339
+ "inty_mean": row.get("inty_mean", 0),
1340
+ },
1341
+ )
1342
+
1343
+ # Use optimized adduct grouping
1344
+ adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
1345
+ self, consensus_data, rt_tol, mz_tol
1346
+ )
1347
+
1348
+ # Add the new columns to consensus_df
1349
+ self.consensus_df = self.consensus_df.with_columns(
1350
+ [
1351
+ pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
1352
+ pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
1353
+ ],
1354
+ )
1355
+
1356
+
1357
+ def _finalize_merge(self, link_ms2, min_samples):
1358
+ """Complete the merge process with final calculations and cleanup."""
1359
+ import polars as pl
1360
+
1361
+ # Check if consensus_df is empty or missing required columns
1362
+ if len(self.consensus_df) == 0 or "number_samples" not in self.consensus_df.columns:
1363
+ self.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
1364
+ return
1365
+
1366
+ # Validate min_samples parameter
1367
+ if min_samples is None:
1368
+ min_samples = 1
1369
+ if min_samples < 1:
1370
+ min_samples = int(min_samples * len(self.samples_df))
1371
+
1372
+ # Validate that min_samples doesn't exceed the number of samples
1373
+ if min_samples > len(self.samples_df):
1374
+ self.logger.warning(
1375
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
1376
+ f"Setting min_samples to {len(self.samples_df)}.",
1377
+ )
1378
+ min_samples = len(self.samples_df)
1379
+
1380
+ # Filter out consensus features with less than min_samples features
1381
+ l1 = len(self.consensus_df)
1382
+ self.consensus_df = self.consensus_df.filter(
1383
+ pl.col("number_samples") >= min_samples,
1384
+ )
1385
+ self.logger.debug(
1386
+ f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
1387
+ )
1388
+
1389
+ # Filter out consensus mapping with less than min_samples features
1390
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
1391
+ pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
1392
+ )
1393
+
1394
+ # Calculate the completeness of the consensus map
1395
+ if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
1396
+ c = (
1397
+ len(self.consensus_mapping_df)
1398
+ / len(self.consensus_df)
1399
+ / len(self.samples_df)
1400
+ )
1401
+ self.logger.info(
1402
+ f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
1403
+ )
1404
+ else:
1405
+ self.logger.warning(
1406
+ f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
1407
+ f"This may be due to min_samples ({min_samples}) being too high for the available data.",
1408
+ )
1409
+
1410
+ if link_ms2:
1411
+ self.find_ms2()
1412
+
1413
+
1414
+ def _optimized_feature_lookup(study_obj, features_df):
1415
+ """
1416
+ Optimized feature lookup creation using Polars operations.
1417
+ """
1418
+ study_obj.logger.debug("Creating optimized feature lookup...")
1419
+ start_time = time.time()
1420
+
1421
+ # Use Polars select for faster conversion
1422
+ feature_columns = [
1423
+ "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
1424
+ "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
1425
+ "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
1426
+ "ms2_scans", "adduct", "adduct_mass"
1427
+ ]
1428
+
1429
+ # Filter to only existing columns
1430
+ existing_columns = [col for col in feature_columns if col in features_df.columns]
1431
+
1432
+ # Convert to dictionary more efficiently
1433
+ selected_df = features_df.select(existing_columns)
1434
+
1435
+ features_lookup = {}
1436
+ for row in selected_df.iter_rows(named=True):
1437
+ feature_uid = row["feature_uid"]
1438
+ # Keep feature_uid in the dictionary for chunked merge compatibility
1439
+ features_lookup[feature_uid] = {k: v for k, v in row.items()}
1440
+
1441
+ lookup_time = time.time() - start_time
1442
+ if len(features_lookup) > 50000:
1443
+ study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
1444
+ return features_lookup
1445
+
1446
+
1447
+ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
1448
+ """
1449
+ Optimized O(n log n) adduct grouping using spatial indexing.
1450
+
1451
+ Args:
1452
+ study_obj: Study object with logger
1453
+ consensus_data: List of consensus feature dictionaries
1454
+ rt_tol: RT tolerance in minutes
1455
+ mz_tol: m/z tolerance in Da
1456
+
1457
+ Returns:
1458
+ Tuple of (adduct_group_list, adduct_of_list)
1459
+ """
1460
+ if not consensus_data:
1461
+ return [], []
1462
+
1463
+ n_features = len(consensus_data)
1464
+ if n_features > 10000:
1465
+ study_obj.logger.info(f"Adduct grouping for {n_features} consensus features...")
1466
+ else:
1467
+ study_obj.logger.debug(f"Adduct grouping for {n_features} consensus features...")
1468
+
1469
+ # Build spatial index using RT and neutral mass as coordinates
1470
+ features_by_mass = defaultdict(list)
1471
+ mass_bin_size = mz_tol * 2 # 2x tolerance for conservative binning
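+ # e.g. with mz_tol = 0.01 Da the bin width is 0.02 Da; neighbouring bins (-1, 0, +1) are scanned below, so no pair within tolerance is split across a bin boundary.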
1472
+
1473
+ valid_features = []
1474
+ for feature in consensus_data:
1475
+ consensus_uid = feature["consensus_uid"]
1476
+ rt = feature["rt"]
1477
+ neutral_mass = feature.get("adduct_mass_neutral_top")
1478
+ intensity = feature.get("inty_mean", 0)
1479
+ adduct = feature.get("adduct_top", "")
1480
+
1481
+ if neutral_mass is not None:
1482
+ mass_bin = int(neutral_mass / mass_bin_size)
1483
+ features_by_mass[mass_bin].append((consensus_uid, rt, neutral_mass, intensity, adduct))
1484
+ valid_features.append((consensus_uid, rt, neutral_mass, intensity, adduct, mass_bin))
1485
+
1486
+ # Union-Find for efficient grouping
1487
+ class UnionFind:
1488
+ def __init__(self, n):
1489
+ self.parent = list(range(n))
1490
+ self.rank = [0] * n
1491
+
1492
+ def find(self, x):
1493
+ if self.parent[x] != x:
1494
+ self.parent[x] = self.find(self.parent[x])
1495
+ return self.parent[x]
1496
+
1497
+ def union(self, x, y):
1498
+ px, py = self.find(x), self.find(y)
1499
+ if px == py:
1500
+ return
1501
+ if self.rank[px] < self.rank[py]:
1502
+ px, py = py, px
1503
+ self.parent[py] = px
1504
+ if self.rank[px] == self.rank[py]:
1505
+ self.rank[px] += 1
1506
+
1507
+ uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
1508
+ uf = UnionFind(len(valid_features))
1509
+
1510
+ # Find groups using spatial index
1511
+ checked_pairs = set()
1512
+ for i, (uid1, rt1, mass1, inty1, adduct1, bin1) in enumerate(valid_features):
1513
+ for bin_offset in [-1, 0, 1]:
1514
+ check_bin = bin1 + bin_offset
1515
+ if check_bin not in features_by_mass:
1516
+ continue
1517
+
1518
+ for uid2, rt2, mass2, inty2, adduct2 in features_by_mass[check_bin]:
1519
+ if uid1 >= uid2:
1520
+ continue
1521
+
1522
+ pair = (min(uid1, uid2), max(uid1, uid2))
1523
+ if pair in checked_pairs:
1524
+ continue
1525
+ checked_pairs.add(pair)
1526
+
1527
+ mass_diff = abs(mass1 - mass2)
1528
+ rt_diff = abs(rt1 - rt2) / 60.0 # Convert to minutes
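+ # Note: rt_diff is compared in minutes while merge() passes rt_tol in seconds (see merge_defaults), so the effective RT window here is rt_tol * 60 seconds.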
1529
+
1530
+ if mass_diff <= mz_tol and rt_diff <= rt_tol:
1531
+ j = uid_to_idx[uid2]
1532
+ uf.union(i, j)
1533
+
1534
+ # Extract groups
1535
+ groups_by_root = defaultdict(list)
1536
+ for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
1537
+ root = uf.find(i)
1538
+ groups_by_root[root].append((uid, rt, mass, inty, adduct))
1539
+
1540
+ groups = {}
1541
+ group_id = 1
1542
+ assigned_groups = {}
1543
+
1544
+ for group_members in groups_by_root.values():
1545
+ member_uids = [uid for uid, _, _, _, _ in group_members]
1546
+
1547
+ for uid in member_uids:
1548
+ assigned_groups[uid] = group_id
1549
+ groups[group_id] = member_uids
1550
+ group_id += 1
1551
+
1552
+ # Handle features without neutral mass
1553
+ for feature in consensus_data:
1554
+ uid = feature["consensus_uid"]
1555
+ if uid not in assigned_groups:
1556
+ assigned_groups[uid] = group_id
1557
+ groups[group_id] = [uid]
1558
+ group_id += 1
1559
+
1560
+ # Determine adduct_of for each group
1561
+ group_adduct_of = {}
1562
+ for grp_id, member_uids in groups.items():
1563
+ best_uid = None
1564
+ best_priority = -1
1565
+ best_intensity = 0
1566
+
1567
+ for uid in member_uids:
1568
+ feature_data = next((f for f in consensus_data if f["consensus_uid"] == uid), None)
1569
+ if not feature_data:
1570
+ continue
1571
+
1572
+ adduct = feature_data.get("adduct_top", "")
1573
+ intensity = feature_data.get("inty_mean", 0)
1574
+
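+ # Rank candidate group representatives: protonated/unknown adducts ([M+H], "H", "?") first, then deprotonated ([M-H]), then any other M-containing adduct; ties broken by higher mean intensity.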
1575
+ priority = 0
1576
+ if adduct and ("[M+H]" in adduct or adduct == "H" or adduct == "?"):
1577
+ priority = 3
1578
+ elif adduct and "[M-H]" in adduct:
1579
+ priority = 2
1580
+ elif adduct and "M" in adduct:
1581
+ priority = 1
1582
+
1583
+ if priority > best_priority or (priority == best_priority and intensity > best_intensity):
1584
+ best_uid = uid
1585
+ best_priority = priority
1586
+ best_intensity = intensity
1587
+
1588
+ group_adduct_of[grp_id] = best_uid if best_uid else member_uids[0]
1589
+
1590
+ # Build final lists in same order as consensus_data
1591
+ adduct_group_list = []
1592
+ adduct_of_list = []
1593
+
1594
+ for feature in consensus_data:
1595
+ uid = feature["consensus_uid"]
1596
+ group = assigned_groups.get(uid, 0)
1597
+ adduct_of = group_adduct_of.get(group, uid)
1598
+
1599
+ adduct_group_list.append(group)
1600
+ adduct_of_list.append(adduct_of)
1601
+
1602
+ if n_features > 10000:
1603
+ study_obj.logger.info("Adduct grouping completed.")
1604
+ else:
1605
+ study_obj.logger.debug("Adduct grouping completed.")
1606
+
1607
+ return adduct_group_list, adduct_of_list