masster 0.5.1 → 0.5.4 (py3-none-any.whl)

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of masster might be problematic.

masster/study/merge.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """
2
2
  Unified merge module for the Study class.
3
- Supports multiple merge methods: 'kd', 'qt', 'kd-nowarp', 'kd_chunked', 'qt_chunked'
3
+ Supports multiple merge methods: 'kd', 'qt', 'kd_chunked', 'qt_chunked'
4
4
  """
5
5
 
6
6
  import time
@@ -12,6 +12,7 @@ import pyopenms as oms
12
12
  import polars as pl
13
13
  from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
14
14
  from concurrent.futures.process import BrokenProcessPool
15
+ from scipy.spatial import cKDTree
15
16
  from masster.study.defaults import merge_defaults
16
17
 
17
18
 
@@ -115,47 +116,6 @@ def _process_kd_chunk_parallel(chunk_data):
115
116
  return chunk_start_idx, consensus_features
116
117
 
117
118
 
118
- def _deserialize_consensus_features(consensus_features):
119
- """
120
- Deserialize consensus features back into an OpenMS ConsensusMap.
121
-
122
- Args:
123
- consensus_features: List of serialized consensus feature dictionaries
124
-
125
- Returns:
126
- OpenMS ConsensusMap object
127
- """
128
- import pyopenms as oms
129
-
130
- consensus_map = oms.ConsensusMap()
131
-
132
- for feature_data in consensus_features:
133
- consensus_feature = oms.ConsensusFeature()
134
- consensus_feature.setRT(float(feature_data['rt']))
135
- consensus_feature.setMZ(float(feature_data['mz']))
136
- consensus_feature.setIntensity(float(feature_data['intensity']))
137
- consensus_feature.setQuality(float(feature_data['quality']))
138
- consensus_feature.setUniqueId(int(feature_data['unique_id']))
139
-
140
- # Reconstruct feature handles (simplified approach)
141
- feature_handles = []
142
- for handle_data in feature_data['features']:
143
- feature_handle = oms.FeatureHandle()
144
- feature_handle.setUniqueId(int(handle_data['unique_id']))
145
- feature_handle.setMapIndex(int(handle_data['map_index']))
146
- feature_handles.append(feature_handle)
147
-
148
- # Set the feature list - properly add feature handles back to consensus feature
149
- if feature_handles:
150
- # Add each feature handle to the consensus feature using the correct OpenMS API
151
- for feature_handle in feature_handles:
152
- consensus_feature.getFeatureList().append(feature_handle)
153
-
154
- consensus_map.push_back(consensus_feature)
155
-
156
- return consensus_map
157
-
158
-
159
119
  def _process_qt_chunk_parallel(chunk_data):
160
120
  """
161
121
  Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
@@ -222,7 +182,8 @@ def _process_qt_chunk_parallel(chunk_data):
222
182
  chunk_params.setValue("distance_MZ:unit", "Da")
223
183
  chunk_params.setValue("ignore_charge", "true")
224
184
  chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
225
-
185
+
186
+
226
187
  grouper.setParameters(chunk_params)
227
188
  grouper.group(chunk_maps, chunk_consensus_map)
228
189
 
@@ -251,29 +212,6 @@ def _process_qt_chunk_parallel(chunk_data):
251
212
  return chunk_start_idx, consensus_features
252
213
 
253
214
 
254
- def _serialize_feature_map(feature_map):
255
- """
256
- Serialize a FeatureMap to a list of dictionaries for multiprocessing.
257
-
258
- Args:
259
- feature_map: OpenMS FeatureMap object
260
-
261
- Returns:
262
- List of feature dictionaries
263
- """
264
- features_data = []
265
- for feature in feature_map:
266
- feature_data = {
267
- 'rt': feature.getRT(),
268
- 'mz': feature.getMZ(),
269
- 'intensity': feature.getIntensity(),
270
- 'charge': feature.getCharge(),
271
- 'unique_id': feature.getUniqueId()
272
- }
273
- features_data.append(feature_data)
274
- return features_data
275
-
276
-
277
215
  def merge(study, **kwargs) -> None:
278
216
  """
279
217
  Group features across samples into consensus features using various algorithms.
@@ -285,74 +223,155 @@ def merge(study, **kwargs) -> None:
285
223
  ----------
286
224
  **kwargs : dict
287
225
  Parameters from merge_defaults class:
288
- - method : str, default 'quality'
289
- Merge algorithm: 'sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality'
290
- - min_samples : int, default 10
226
+ - method : str, default 'kd'
227
+ Merge algorithm: 'kd', 'qt', 'kd_chunked', 'qt_chunked'
228
+ - min_samples : int, default 2
291
229
  Minimum number of samples for consensus feature
292
- - rt_tol : float, default 2.0
230
+ - rt_tol : float, default 5.0
293
231
  RT tolerance in seconds
294
232
  - mz_tol : float, default 0.01
295
233
  m/z tolerance in Da (Daltons) for all methods
296
234
  - chunk_size : int, default 500
297
- Chunk size for 'chunked' method
298
- - threads : int, default 1
299
- Number of parallel processes for chunked methods (kd_chunked, qt_chunked)
300
- - nr_partitions : int, default 500
235
+ Chunk size for chunked methods
236
+ - dechunking : str, default 'hierarchical'
237
+ Cross-chunk merging algorithm: 'hierarchical', 'kdtree', 'qt', 'none'
238
+ - threads : int, default None
239
+ Number of parallel processes for chunked methods (None=sequential)
240
+ - nr_partitions : int, default 1000
301
241
  Number of partitions in m/z dimension for KD algorithms
302
- - min_rel_cc_size : float, default 0.3
303
- Minimum relative connected component size for conflict resolution
304
- - max_pairwise_log_fc : float, default 0.5
305
- Maximum pairwise log fold change for conflict resolution
242
+ - min_rel_cc_size : float, default 0.1
243
+ Minimum relative connected component size for conflict resolution (chunked only)
244
+ - max_pairwise_log_fc : float, default -1.0
245
+ Maximum pairwise log fold change for conflict resolution (chunked only)
306
246
  - max_nr_conflicts : int, default 0
307
- Maximum number of conflicts allowed in consensus feature
247
+ Maximum number of conflicts allowed in consensus feature (chunked only)
308
248
  - link_ms2 : bool, default True
309
249
  Whether to link MS2 spectra to consensus features
310
-
311
- Algorithm Guidelines
312
- -------------------
313
- - Quality: KD with post-processing quality control to reduce oversegmentation (RECOMMENDED DEFAULT)
314
- Includes RT tolerance optimization, secondary clustering, and quality filtering
315
- - Sensitivity: Best raw sensitivity, O(n log n), maximum feature detection
316
- - QT: Thorough but slow O(n²), good for <1000 samples
317
- - NoWarp: Memory efficient KD without RT warping for large datasets
318
- - KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
319
- Uses optimized partitioning for better memory management while maintaining
320
- full cross-sample consensus feature detection. Supports parallel processing.
321
- - QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
322
- Uses QT clustering in first stage with optimized cross-chunk consensus building.
323
- Supports parallel processing.
250
+ - extract_ms1 : bool, default True
251
+ Whether to extract MS1 spectra for consensus features
252
+
253
+ Algorithm Selection Guide
254
+ ------------------------
255
+ Choose your merge method based on dataset size and performance requirements:
256
+
257
+ **KD (K-D Tree)** - *Recommended Default*
258
+ - Fast O(n log n) algorithm with RT warping
259
+ - Best balance of speed, accuracy, and memory usage
260
+ - Suitable for most dataset sizes (50 - 5,000 samples)
261
+ - Uses spatial partitioning for efficient feature matching
262
+
263
+ **QT (Quality Threshold)**
264
+ - Thorough O(n²) clustering algorithm
265
+ - Most accurate but slowest method
266
+ - Recommended for small datasets (<1,000 samples)
267
+ - Guarantees quality threshold constraints
268
+
269
+ **KD-Chunked** - *For Large Datasets*
270
+ - Memory-optimized KD algorithm for very large datasets (>5,000 samples)
271
+ - Processes data in chunks with cross-chunk consensus building
272
+ - Supports parallel processing with threads parameter
273
+ - Maintains high feature recovery through hierarchical dechunking
274
+
275
+ **QT-Chunked** - *For Large Datasets with Maximum Accuracy*
276
+ - Memory-optimized QT algorithm for very large datasets (>5,000 samples)
277
+ - Uses QT clustering within chunks, then cross-chunk consensus
278
+ - Slowest but most thorough for large datasets
279
+ - Best when accuracy is more important than speed
280
+
281
+ Cross-Chunk Merging (Dechunking) Methods
282
+ ----------------------------------------
283
+ For chunked methods, choose dechunking algorithm based on your priorities:
284
+
285
+ **Hierarchical** - *Recommended Default*
286
+ - Priority-based merging starting from high sample count features
287
+ - Achieves ~97% feature recovery vs original ~10% recovery
288
+ - Best overall balance of recovery and accuracy
289
+
290
+ **KDTree** - *High Sample Feature Preservation*
291
+ - Spatial indexing approach optimized for frequent features
292
+ - ~95% high sample count feature recovery
293
+ - Best for preserving features present in many samples
294
+
295
+ Performance Guidelines
296
+ ---------------------
297
+ - **Small datasets (≤1,000 samples)**: Use 'qt' for maximum accuracy
298
+ - **Medium datasets (1,000-5,000 samples)**: Use 'kd' (default)
299
+ - **Large datasets (>5,000 samples)**: Use 'kd_chunked' or 'qt_chunked'
300
+ - **Memory constrained**: Use chunked methods with smaller chunk_size
301
+ - **Time constrained**: Use 'kd' or 'kd_chunked' with hierarchical dechunking
324
302
 
325
303
  Parallel Processing
326
304
  ------------------
327
- For kd_chunked and qt_chunked methods, use threads > 1 to enable parallel processing
328
- of chunk alignments. This can significantly reduce processing time for large datasets
329
- by processing multiple chunks simultaneously in separate processes.
330
-
331
- Example:
332
- study.merge(method='kd_chunked', threads=4, chunk_size=200)
305
+ Chunked methods support parallel processing:
306
+ - Set threads=N (where N is number of CPU cores to use)
307
+ - Recommended: threads=4 to 8 for most systems
308
+ - Each chunk is processed independently in parallel
309
+ - Significantly reduces processing time for large datasets
310
+
311
+ Tolerance Settings
312
+ -----------------
313
+ - **rt_tol**: RT tolerance in seconds (typical range: 1-10s)
314
+ - Smaller values: more specific, may fragment features
315
+ - Larger values: more permissive, may merge distinct features
316
+ - **mz_tol**: m/z tolerance in Daltons (typical range: 0.005-0.05 Da)
317
+ - High-resolution MS: 0.005-0.01 Da
318
+ - Lower resolution MS: 0.01-0.05 Da
319
+
320
+ Examples
321
+ --------
322
+ Basic usage with default KD algorithm:
323
+ study.merge()
324
+
325
+ High-accuracy small dataset:
326
+ study.merge(method='qt', rt_tol=2.0, mz_tol=0.005, min_samples=5)
327
+
328
+ Large dataset with parallel processing:
329
+ study.merge(method='kd_chunked', threads=8, chunk_size=500,
330
+ dechunking='hierarchical')
331
+
332
+ Custom tolerances for specific instrument:
333
+ study.merge(method='kd', rt_tol=1.5, mz_tol=0.01, min_samples=10)
334
+
335
+ Notes
336
+ -----
337
+ - Features must be loaded before merging (study.load_features())
338
+ - Results are stored in study.consensus_df and study.consensus_mapping_df
339
+ - Merge parameters are saved to study history for reproducibility
340
+ - MS2 spectra are automatically linked when link_ms2=True
341
+ - Adduct relationships are identified and stored after merging
333
342
  """
334
343
  start_time = time.time()
335
344
 
336
345
  # Initialize with defaults and override with kwargs
337
- params = merge_defaults()
338
-
339
- # Filter and apply only valid parameters
340
- valid_params = set(params.list_parameters())
346
+ params = merge_defaults()
347
+
348
+ # Handle 'params' keyword argument specifically (like merge does)
349
+ if 'params' in kwargs:
350
+ provided_params = kwargs.pop('params')
351
+ if isinstance(provided_params, merge_defaults):
352
+ params = provided_params
353
+ study.logger.debug("Using provided merge_defaults parameters from 'params' argument")
354
+ else:
355
+ study.logger.warning("'params' argument is not a merge_defaults instance, ignoring")
356
+
357
+ # Process remaining kwargs
341
358
  for key, value in kwargs.items():
342
- if key in valid_params:
343
- setattr(params, key, value)
359
+ if isinstance(value, merge_defaults):
360
+ params = value
361
+ study.logger.debug("Using provided merge_defaults parameters")
344
362
  else:
345
- study.logger.warning(f"Unknown parameter '{key}' ignored")
346
-
363
+ if hasattr(params, key):
364
+ if params.set(key, value, validate=True):
365
+ study.logger.debug(f"Updated parameter {key} = {value}")
366
+ else:
367
+ study.logger.warning(
368
+ f"Failed to set parameter {key} = {value} (validation failed)",
369
+ )
370
+ else:
371
+ study.logger.warning(f"Unknown parameter '{key}' ignored")
372
+
347
373
  # Backward compatibility: Map old method names to new names
348
374
  method_mapping = {
349
- 'kd': 'sensitivity',
350
- 'kd-nowarp': 'nowarp',
351
- 'kd_nowarp': 'nowarp',
352
- 'kd-strict': 'quality',
353
- 'kd_strict': 'quality',
354
- 'kdstrict': 'quality',
355
- 'chunked': 'kd_chunked', # Map old 'chunked' to 'kd_chunked'
356
375
  'qtchunked': 'qt_chunked', # QT chunked variants
357
376
  'qt-chunked': 'qt_chunked',
358
377
  'kdchunked': 'kd_chunked', # KD chunked variants
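A minimal usage sketch of the parameter handling introduced above (assumes an existing Study object named study; not part of the package source): parameters can be passed as plain keyword arguments, validated through params.set(), or as a pre-built merge_defaults instance via the 'params' keyword.

    from masster.study.defaults import merge_defaults

    # 1) Plain keyword arguments, validated against merge_defaults
    study.merge(method='kd', rt_tol=5.0, mz_tol=0.01, min_samples=2)

    # 2) A pre-built merge_defaults instance passed through the 'params' keyword
    p = merge_defaults()
    p.set('method', 'kd_chunked', validate=True)
    p.set('chunk_size', 500, validate=True)
    p.set('threads', 8, validate=True)
    study.merge(params=p)
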
@@ -365,18 +384,28 @@ def merge(study, **kwargs) -> None:
365
384
  study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
366
385
 
367
386
  # Validate method
368
- if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
369
- raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
370
-
387
+ if params.method not in ['kd', 'qt', 'kd_chunked', 'qt_chunked']:
388
+ raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['kd', 'qt', 'kd_chunked', 'qt_chunked']")
389
+
371
390
  # Check if chunked method is advisable for large datasets
372
391
  num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
392
+ if num_samples == 0:
393
+ raise ValueError("No samples loaded in study. Load features before merging.")
394
+ if params.method == 'kd' and num_samples > params.chunk_size:
395
+ params.method = 'kd_chunked'
396
+ study.logger.info(
397
+ f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
398
+ )
399
+ if params.method == 'qt' and num_samples > params.chunk_size:
400
+ params.method = 'qt_chunked'
401
+ study.logger.info(
402
+ f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
403
+ )
404
+
373
405
  if num_samples > 500:
374
- chunked_methods = {'kd_chunked', 'qt_chunked'}
375
- if params.method not in chunked_methods:
406
+ if params.method not in {'kd_chunked', 'qt_chunked'}:
376
407
  study.logger.warning(
377
- f"Large dataset detected ({num_samples} samples > 500). "
378
- f"For better performance and memory efficiency, consider using a chunked method: "
379
- f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
408
+ f"Large dataset detected ({num_samples} samples > 500). Consider dropping chunk_size to 500 to use chunked methods."
380
409
  )
381
410
 
382
411
  # Persist last used params for diagnostics
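The hunk above raises on an empty study and silently promotes 'kd'/'qt' to their chunked variants once the sample count exceeds chunk_size. A standalone sketch of that selection rule (function name is hypothetical, for illustration only):

    def select_merge_method(method: str, num_samples: int, chunk_size: int) -> str:
        # Promote 'kd'/'qt' to their chunked variants for large datasets,
        # mirroring the auto-switch in merge()
        if num_samples > chunk_size and method in ('kd', 'qt'):
            return f"{method}_chunked"
        return method

    assert select_merge_method('kd', 12000, 500) == 'kd_chunked'
    assert select_merge_method('qt', 300, 500) == 'qt'
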
@@ -403,8 +432,10 @@ def merge(study, **kwargs) -> None:
403
432
  f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
404
433
  )
405
434
 
406
- # Initialize
407
- _reset_consensus_data(study)
435
+ # Initialize
436
+ study.consensus_df = pl.DataFrame()
437
+ study.consensus_ms2 = pl.DataFrame()
438
+ study.consensus_mapping_df = pl.DataFrame()
408
439
 
409
440
  # Cache adducts for performance (avoid repeated _get_adducts() calls)
410
441
  cached_adducts_df = None
@@ -424,7 +455,7 @@ def merge(study, **kwargs) -> None:
424
455
  cached_valid_adducts.add("?")
425
456
 
426
457
  # Route to algorithm implementation
427
- if params.method == 'sensitivity':
458
+ if params.method == 'kd':
428
459
  consensus_map = _merge_kd(study, params)
429
460
  # Extract consensus features
430
461
  _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
@@ -432,13 +463,6 @@ def merge(study, **kwargs) -> None:
432
463
  consensus_map = _merge_qt(study, params)
433
464
  # Extract consensus features
434
465
  _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
435
- elif params.method == 'nowarp':
436
- consensus_map = _merge_kd_nowarp(study, params)
437
- # Extract consensus features
438
- _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
439
- elif params.method == 'quality':
440
- consensus_map = _merge_kd_strict(study, params)
441
- # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
442
466
  elif params.method == 'kd_chunked':
443
467
  consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
444
468
  # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
@@ -446,19 +470,23 @@ def merge(study, **kwargs) -> None:
446
470
  consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
447
471
  # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
448
472
 
449
- # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
450
- if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
451
- _consensus_cleanup(study, params.rt_tol, params.mz_tol)
473
+ # Enhanced post-clustering to merge over-segmented features (for non-chunked methods)
474
+ # Chunked methods already perform their own cross-chunk consensus building
475
+ if params.method in ['qt', 'kd']:
476
+ __consensus_cleanup(study, params.rt_tol, params.mz_tol)
452
477
 
453
478
  # Perform adduct grouping
454
479
  _perform_adduct_grouping(study, params.rt_tol, params.mz_tol)
455
480
 
456
481
  # Identify coeluting consensus features by mass shifts and update adduct information
457
- _identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
482
+ __identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
483
+
484
+ # Post-processing for chunked methods: merge partial consensus features
485
+ if params.method in ['qt_chunked', 'kd_chunked']:
486
+ _merge_partial_consensus_features(study, params.rt_tol, params.mz_tol)
458
487
 
459
- # Link MS2 if requested
460
- if params.link_ms2:
461
- _finalize_merge(study, params.link_ms2, params.min_samples)
488
+ # Finalize merge: filter by min_samples and add isotope/MS2 data
489
+ __finalize_merge(study, params.link_ms2, params.extract_ms1, params.min_samples)
462
490
 
463
491
  # Log completion without the misleading feature count
464
492
  elapsed = time.time() - start_time
@@ -494,10 +522,6 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
494
522
  params_oms.setValue("warp:mz_tol", params.mz_tol)
495
523
  params_oms.setValue("link:rt_tol", params.rt_tol)
496
524
  params_oms.setValue("link:mz_tol", params.mz_tol)
497
- #params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
498
- #params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
499
- #params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
500
- #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
501
525
 
502
526
  grouper.setParameters(params_oms)
503
527
  grouper.group(temp_feature_maps, consensus_map)
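For reference, a sketch of configuring the same KD grouper directly in pyopenms, using only parameter keys that appear in this file and the documented defaults (rt_tol=5.0 s, mz_tol=0.01 Da, nr_partitions=1000); the final group() call is commented out because it needs one FeatureMap per sample:

    import pyopenms as oms

    grouper = oms.FeatureGroupingAlgorithmKD()
    p = grouper.getParameters()
    p.setValue("mz_unit", "Da")
    p.setValue("nr_partitions", 1000)
    p.setValue("warp:enabled", "true")   # the 'kd' method warps RT; "false" disables warping
    p.setValue("warp:rt_tol", 5.0)
    p.setValue("warp:mz_tol", 0.01)
    p.setValue("link:rt_tol", 5.0)
    p.setValue("link:mz_tol", 0.01)
    grouper.setParameters(p)

    consensus_map = oms.ConsensusMap()
    # feature_maps = <list of oms.FeatureMap, one per sample>
    # grouper.group(feature_maps, consensus_map)
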
@@ -505,92 +529,6 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
505
529
  return consensus_map
506
530
 
507
531
 
508
- def _generate_feature_maps_from_samples(study):
509
- """
510
- Generate feature maps using Study-level features_df instead of Sample-level loading.
511
- This uses the study's existing features_df which is already loaded.
512
-
513
- Args:
514
- study: Study object containing features_df
515
-
516
- Returns:
517
- list: List of temporary FeatureMap objects built from Study-level data
518
- """
519
- import pyopenms as oms
520
-
521
- temp_feature_maps = []
522
-
523
- study.logger.info(f"Building feature maps using Study-level features_df from {len(study.samples_df)} samples")
524
-
525
- # Use the features_df from the study that's already loaded
526
- if not hasattr(study, 'features_df') or study.features_df is None or study.features_df.is_empty():
527
- study.logger.warning("No features_df available - features must be loaded first")
528
- return temp_feature_maps
529
-
530
- # Group features by sample
531
- study.logger.info(f"Processing {len(study.features_df)} features grouped by sample")
532
-
533
- # Get unique sample names/indices
534
- if 'sample_uid' in study.features_df.columns:
535
- sample_groups = study.features_df.group_by('sample_uid')
536
- study.logger.debug("Grouping features by 'sample_uid' column")
537
- elif 'sample_id' in study.features_df.columns:
538
- sample_groups = study.features_df.group_by('sample_id')
539
- study.logger.debug("Grouping features by 'sample_id' column")
540
- elif 'sample' in study.features_df.columns:
541
- sample_groups = study.features_df.group_by('sample')
542
- study.logger.debug("Grouping features by 'sample' column")
543
- else:
544
- study.logger.warning("No sample grouping column found in features_df")
545
- study.logger.info(f"Available columns: {study.features_df.columns}")
546
- return temp_feature_maps
547
-
548
- # Process each sample group
549
- processed_samples = 0
550
- for sample_key, sample_features in sample_groups:
551
- try:
552
- feature_map = oms.FeatureMap()
553
- feature_count = 0
554
-
555
- # Build features from this sample's features
556
- for row in sample_features.iter_rows(named=True):
557
- try:
558
- feature = oms.Feature()
559
-
560
- # Set feature properties
561
- if row.get("feature_id") is not None:
562
- feature.setUniqueId(int(row["feature_id"]))
563
- if row.get("mz") is not None:
564
- feature.setMZ(float(row["mz"]))
565
- if row.get("rt") is not None:
566
- feature.setRT(float(row["rt"]))
567
- if row.get("inty") is not None:
568
- feature.setIntensity(float(row["inty"]))
569
- if row.get("quality") is not None:
570
- feature.setOverallQuality(float(row["quality"]))
571
- if row.get("charge") is not None:
572
- feature.setCharge(int(row["charge"]))
573
-
574
- feature_map.push_back(feature)
575
- feature_count += 1
576
-
577
- except (ValueError, TypeError) as e:
578
- study.logger.warning(f"Skipping feature in sample {sample_key} due to conversion error: {e}")
579
- continue
580
-
581
- temp_feature_maps.append(feature_map)
582
- processed_samples += 1
583
- study.logger.debug(f"Built feature map for sample {sample_key} with {feature_count} features")
584
-
585
- except Exception as e:
586
- study.logger.warning(f"Failed to process sample group {sample_key}: {e}")
587
- # Add empty feature map for failed samples to maintain sample order
588
- temp_feature_maps.append(oms.FeatureMap())
589
-
590
- study.logger.info(f"Generated {len(temp_feature_maps)} feature maps from {processed_samples} samples using Study-level features_df")
591
- return temp_feature_maps
592
-
593
-
594
532
  def _generate_feature_maps_on_demand(study):
595
533
  """
596
534
  Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
@@ -610,9 +548,9 @@ def _generate_feature_maps_on_demand(study):
610
548
  use_sample_loading = True # Default to Sample-level loading as requested
611
549
 
612
550
  # Use Sample-level loading if requested and samples_df is available
613
- if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
614
- study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
615
- return _generate_feature_maps_from_samples(study)
551
+ #if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
552
+ # study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
553
+ # return _generate_feature_maps_from_samples(study)
616
554
 
617
555
  # Fallback to original features_df approach
618
556
  if study.features_df is None or len(study.features_df) == 0:
@@ -750,9 +688,6 @@ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
750
688
  params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
751
689
  params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
752
690
  params_oms.setValue("ignore_charge", "true")
753
- #params_oms.setValue("min_rel_cc_size", params.min_rel_cc_size)
754
- #params_oms.setValue("max_pairwise_log_fc", params.max_pairwise_log_fc)
755
- #params_oms.setValue("max_nr_conflicts", params.max_nr_conflicts)
756
691
  params_oms.setValue("nr_partitions", params.nr_partitions)
757
692
 
758
693
  grouper.setParameters(params_oms)
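Both the KD and QT paths consume one FeatureMap per sample built from study.features_df. A condensed sketch of that construction, based on the removed _generate_feature_maps_from_samples helper above (column names follow that helper; not part of the package source):

    import polars as pl
    import pyopenms as oms

    def feature_maps_from_df(features_df: pl.DataFrame) -> list:
        maps = []
        for _key, sample_features in features_df.group_by('sample_uid'):
            fmap = oms.FeatureMap()
            for row in sample_features.iter_rows(named=True):
                f = oms.Feature()
                f.setUniqueId(int(row['feature_id']))
                f.setMZ(float(row['mz']))
                f.setRT(float(row['rt']))
                f.setIntensity(float(row['inty']))
                fmap.push_back(f)
            maps.append(fmap)
        return maps
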
@@ -761,534 +696,6 @@ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
761
696
  return consensus_map
762
697
 
763
698
 
764
- def _merge_kd_strict(study, params: merge_defaults) -> oms.ConsensusMap:
765
- """
766
- Quality merge: Standard KD algorithm with post-processing quality control.
767
-
768
- This method combines the sensitivity of KD clustering with post-processing steps
769
- to reduce oversegmentation while maintaining high-quality consensus features.
770
- This is the recommended default method.
771
-
772
- Post-processing features:
773
- 1. RT tolerance optimization (optional)
774
- 2. Secondary clustering for close features
775
- 3. Sample overlap validation
776
- 4. RT spread quality filtering
777
- 5. Chromatographic coherence validation
778
-
779
- Additional parameters supported in params:
780
- - optimize_rt_tol: bool - Enable RT tolerance optimization
781
- - rt_tol_range: tuple - RT tolerance range for optimization (min, max)
782
- - secondary_merge_rt_tol: float - Secondary merge RT tolerance (default: 0.5s)
783
- - secondary_merge_mz_tol: float - Secondary merge m/z tolerance (default: 0.005)
784
- - min_sample_overlap: float - Minimum sample overlap for merging (0.0-1.0, default: 0.8)
785
- - max_rt_spread: float - Maximum RT spread allowed (default: 2x rt_tol)
786
- - min_coherence: float - Minimum chromatographic coherence (default: 0.0, disabled)
787
- """
788
-
789
- # Check for RT tolerance optimization
790
- optimize_rt_tol = getattr(params, 'optimize_rt_tol', False)
791
-
792
- if optimize_rt_tol:
793
- # Optimize RT tolerance first
794
- optimal_rt_tol = _optimize_rt_tolerance(study, params)
795
- study.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
796
- # Create modified params with optimal RT tolerance
797
- import copy
798
- optimized_params = copy.deepcopy(params)
799
- optimized_params.rt_tol = optimal_rt_tol
800
- else:
801
- optimized_params = params
802
-
803
- # Phase 1: Standard KD clustering
804
- study.logger.debug("Initial KD clustering")
805
- consensus_map = _merge_kd(study, optimized_params)
806
-
807
- # Phase 2: Post-processing quality control
808
- study.logger.debug("Post-processing quality control")
809
- consensus_map = _apply_kd_strict_postprocessing(study, consensus_map, optimized_params)
810
-
811
- return consensus_map
812
-
813
-
814
- def _optimize_rt_tolerance(study, params: merge_defaults) -> float:
815
- """
816
- Optimize RT tolerance by testing different values and measuring oversegmentation.
817
-
818
- Args:
819
- study: Study object
820
- params: Merge parameters
821
-
822
- Returns:
823
- Optimal RT tolerance value
824
- """
825
- rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
826
- rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
827
-
828
- study.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
829
-
830
- # Generate test values
831
- test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
832
- for i in range(rt_tol_steps)]
833
-
834
- best_rt_tol = params.rt_tol
835
- best_score = float('inf')
836
-
837
- # Store original features for restoration
838
- original_consensus_df = getattr(study, 'consensus_df', pl.DataFrame())
839
- original_consensus_mapping_df = getattr(study, 'consensus_mapping_df', pl.DataFrame())
840
-
841
- for test_rt_tol in test_rt_tols:
842
- try:
843
- # Create test parameters
844
- import copy
845
- test_params = copy.deepcopy(params)
846
- test_params.rt_tol = test_rt_tol
847
-
848
- # Run KD merge with test parameters
849
- test_consensus_map = _merge_kd(study, test_params)
850
-
851
- # Extract consensus features temporarily for analysis
852
- _extract_consensus_features(study, test_consensus_map, test_params.min_samples)
853
-
854
- if len(study.consensus_df) == 0:
855
- continue
856
-
857
- # Calculate oversegmentation metrics
858
- oversegmentation_score = _calculate_oversegmentation_score(study, test_rt_tol)
859
-
860
- study.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(study.consensus_df)} features, score: {oversegmentation_score:.3f}")
861
-
862
- # Lower score is better (less oversegmentation)
863
- if oversegmentation_score < best_score:
864
- best_score = oversegmentation_score
865
- best_rt_tol = test_rt_tol
866
-
867
- except Exception as e:
868
- study.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
869
- continue
870
-
871
- # Restore original consensus data
872
- study.consensus_df = original_consensus_df
873
- study.consensus_mapping_df = original_consensus_mapping_df
874
-
875
- study.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
876
- return best_rt_tol
877
-
878
-
879
- def _calculate_oversegmentation_score(study, rt_tol: float) -> float:
880
- """
881
- Calculate oversegmentation score based on feature density and RT spread metrics.
882
- Lower scores indicate less oversegmentation.
883
-
884
- Args:
885
- study: Study object
886
- rt_tol: RT tolerance used
887
-
888
- Returns:
889
- Oversegmentation score (lower = better)
890
- """
891
- if len(study.consensus_df) == 0:
892
- return float('inf')
893
-
894
- # Metric 1: Feature density (features per RT second)
895
- rt_range = study.consensus_df['rt'].max() - study.consensus_df['rt'].min()
896
- if rt_range <= 0:
897
- return float('inf')
898
-
899
- feature_density = len(study.consensus_df) / rt_range
900
-
901
- # Metric 2: Average RT spread relative to tolerance
902
- rt_spreads = (study.consensus_df['rt_max'] - study.consensus_df['rt_min'])
903
- avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
904
-
905
- # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
906
- low_sample_features = len(study.consensus_df.filter(pl.col('number_samples') <= 5))
907
- low_sample_ratio = low_sample_features / len(study.consensus_df)
908
-
909
- # Metric 4: Number of features with excessive RT spread
910
- excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
911
- excessive_spread_ratio = excessive_spread_features / len(study.consensus_df)
912
-
913
- # Combined score (weighted combination)
914
- oversegmentation_score = (
915
- 0.4 * (feature_density / 10.0) + # Normalize to reasonable scale
916
- 0.3 * avg_rt_spread_ratio +
917
- 0.2 * low_sample_ratio +
918
- 0.1 * excessive_spread_ratio
919
- )
920
-
921
- return oversegmentation_score
922
-
923
-
924
- def _apply_kd_strict_postprocessing(study, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
925
- """
926
- Apply post-processing quality control to KD consensus map.
927
-
928
- Args:
929
- consensus_map: Initial consensus map from KD
930
- params: Merge parameters with kd-strict options
931
-
932
- Returns:
933
- Processed consensus map with reduced oversegmentation
934
- """
935
- if consensus_map.size() == 0:
936
- study.logger.warning("Empty consensus map provided to post-processing")
937
- return consensus_map
938
-
939
- study.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
940
-
941
- # Step 1: Extract initial consensus features
942
- original_min_samples = params.min_samples
943
- params.min_samples = 1 # Extract all features initially
944
-
945
- _extract_consensus_features(study, consensus_map, params.min_samples)
946
- initial_feature_count = len(study.consensus_df)
947
-
948
- if initial_feature_count == 0:
949
- study.logger.warning("No consensus features extracted for post-processing")
950
- params.min_samples = original_min_samples
951
- return consensus_map
952
-
953
- # Step 2: Secondary clustering for close features
954
- secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
955
- secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
956
-
957
- study.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
958
- merged_features = _perform_secondary_clustering(study, secondary_merge_rt_tol, secondary_merge_mz_tol)
959
-
960
- # Step 3: Sample overlap validation
961
- min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
962
- if min_sample_overlap > 0:
963
- study.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
964
- merged_features = _validate_sample_overlap(study, merged_features, min_sample_overlap)
965
-
966
- # Step 4: RT spread quality filtering
967
- if params.rt_tol is not None:
968
- max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
969
- if max_rt_spread is not None:
970
- study.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
971
- merged_features = _filter_rt_spread(study, merged_features, max_rt_spread)
972
- else:
973
- study.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
974
- else:
975
- study.logger.debug("Skipping RT spread filtering - rt_tol is None")
976
-
977
- # Step 5: Chromatographic coherence filtering (optional)
978
- min_coherence = getattr(params, 'min_coherence', 0.0)
979
- if min_coherence > 0:
980
- study.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
981
- merged_features = _filter_coherence(study, merged_features, min_coherence)
982
-
983
- # Step 6: Rebuild consensus_df with filtered features and preserve mapping
984
- original_mapping_df = study.consensus_mapping_df.clone() # Save original mapping
985
- study.consensus_df = pl.DataFrame(merged_features, strict=False)
986
-
987
- # Step 7: Apply original min_samples filter
988
- params.min_samples = original_min_samples
989
- if params.min_samples > 1:
990
- l1 = len(study.consensus_df)
991
- study.consensus_df = study.consensus_df.filter(
992
- pl.col("number_samples") >= params.min_samples
993
- )
994
- filtered_count = l1 - len(study.consensus_df)
995
- if filtered_count > 0:
996
- study.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
997
-
998
- # Step 8: Update consensus_mapping_df to match final consensus_df
999
- if len(study.consensus_df) > 0 and len(original_mapping_df) > 0:
1000
- valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
1001
- study.consensus_mapping_df = original_mapping_df.filter(
1002
- pl.col('consensus_uid').is_in(list(valid_consensus_ids))
1003
- )
1004
- else:
1005
- study.consensus_mapping_df = pl.DataFrame()
1006
-
1007
- final_feature_count = len(study.consensus_df)
1008
- reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
1009
-
1010
- study.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
1011
-
1012
- # Create a new consensus map for compatibility (the processed data is in consensus_df)
1013
- processed_consensus_map = oms.ConsensusMap()
1014
- return processed_consensus_map
1015
-
1016
-
1017
- def _perform_secondary_clustering(study, rt_tol: float, mz_tol: float) -> list:
1018
- """
1019
- Perform secondary clustering to merge very close features.
1020
-
1021
- Args:
1022
- rt_tol: RT tolerance for secondary clustering
1023
- mz_tol: m/z tolerance for secondary clustering
1024
-
1025
- Returns:
1026
- List of merged consensus feature dictionaries
1027
- """
1028
- if len(study.consensus_df) == 0:
1029
- return []
1030
-
1031
- # Convert consensus_df to list of dictionaries for clustering
1032
- consensus_features = []
1033
- for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
1034
- consensus_features.append(dict(row))
1035
-
1036
- # Use Union-Find for efficient clustering
1037
- class UnionFind:
1038
- def __init__(study, n):
1039
- study.parent = list(range(n))
1040
- study.rank = [0] * n
1041
-
1042
- def find(study, x):
1043
- if study.parent[x] != x:
1044
- study.parent[x] = study.find(study.parent[x])
1045
- return study.parent[x]
1046
-
1047
- def union(study, x, y):
1048
- px, py = study.find(x), study.find(y)
1049
- if px == py:
1050
- return
1051
- if study.rank[px] < study.rank[py]:
1052
- px, py = py, px
1053
- study.parent[py] = px
1054
- if study.rank[px] == study.rank[py]:
1055
- study.rank[px] += 1
1056
-
1057
- n_features = len(consensus_features)
1058
- uf = UnionFind(n_features)
1059
-
1060
- # Find features to merge based on proximity
1061
- merge_count = 0
1062
- for i in range(n_features):
1063
- for j in range(i + 1, n_features):
1064
- feat_i = consensus_features[i]
1065
- feat_j = consensus_features[j]
1066
-
1067
- rt_diff = abs(feat_i['rt'] - feat_j['rt'])
1068
- mz_diff = abs(feat_i['mz'] - feat_j['mz'])
1069
-
1070
- if rt_diff <= rt_tol and mz_diff <= mz_tol:
1071
- uf.union(i, j)
1072
- merge_count += 1
1073
-
1074
- # Group features by their root
1075
- groups_by_root = defaultdict(list)
1076
- for i in range(n_features):
1077
- root = uf.find(i)
1078
- groups_by_root[root].append(consensus_features[i])
1079
-
1080
- # Merge features within each group
1081
- merged_features = []
1082
- for group in groups_by_root.values():
1083
- if len(group) == 1:
1084
- # Single feature - keep as is
1085
- merged_features.append(group[0])
1086
- else:
1087
- # Multiple features - merge them
1088
- merged_feature = _merge_feature_group(group)
1089
- merged_features.append(merged_feature)
1090
-
1091
- study.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
1092
- return merged_features
1093
-
1094
-
1095
- def _merge_feature_group(feature_group: list) -> dict:
1096
- """
1097
- Merge a group of similar consensus features into one.
1098
-
1099
- Args:
1100
- feature_group: List of consensus feature dictionaries to merge
1101
-
1102
- Returns:
1103
- Merged consensus feature dictionary
1104
- """
1105
- if not feature_group:
1106
- return {}
1107
-
1108
- if len(feature_group) == 1:
1109
- return feature_group[0]
1110
-
1111
- # Use the feature with highest sample count as base
1112
- base_feature = max(feature_group, key=lambda f: f.get('number_samples', 0))
1113
- merged = base_feature.copy()
1114
-
1115
- # Aggregate numeric statistics
1116
- rt_values = [f['rt'] for f in feature_group if f.get('rt') is not None]
1117
- mz_values = [f['mz'] for f in feature_group if f.get('mz') is not None]
1118
- sample_counts = [f.get('number_samples', 0) for f in feature_group]
1119
- intensities = [f.get('inty_mean', 0) for f in feature_group if f.get('inty_mean') is not None]
1120
-
1121
- # Update merged feature statistics
1122
- if rt_values:
1123
- merged['rt'] = float(np.mean(rt_values))
1124
- merged['rt_min'] = min([f.get('rt_min', f['rt']) for f in feature_group])
1125
- merged['rt_max'] = max([f.get('rt_max', f['rt']) for f in feature_group])
1126
- merged['rt_mean'] = float(np.mean(rt_values))
1127
-
1128
- if mz_values:
1129
- merged['mz'] = float(np.mean(mz_values))
1130
- merged['mz_min'] = min([f.get('mz_min', f['mz']) for f in feature_group])
1131
- merged['mz_max'] = max([f.get('mz_max', f['mz']) for f in feature_group])
1132
- merged['mz_mean'] = float(np.mean(mz_values))
1133
-
1134
- # Use maximum sample count (features might be detected in overlapping but different samples)
1135
- merged['number_samples'] = max(sample_counts)
1136
-
1137
- # Use weighted average intensity (by sample count)
1138
- if intensities and sample_counts:
1139
- total_weight = sum(sample_counts)
1140
- if total_weight > 0:
1141
- weighted_intensity = sum(inty * count for inty, count in zip(intensities, sample_counts)) / total_weight
1142
- merged['inty_mean'] = float(weighted_intensity)
1143
-
1144
- # Aggregate chromatographic quality metrics if available
1145
- coherence_values = [f.get('chrom_coherence_mean', 0) for f in feature_group if f.get('chrom_coherence_mean') is not None]
1146
- prominence_values = [f.get('chrom_prominence_mean', 0) for f in feature_group if f.get('chrom_prominence_mean') is not None]
1147
-
1148
- if coherence_values:
1149
- merged['chrom_coherence_mean'] = float(np.mean(coherence_values))
1150
- if prominence_values:
1151
- merged['chrom_prominence_mean'] = float(np.mean(prominence_values))
1152
-
1153
- # Merge MS2 counts
1154
- ms2_counts = [f.get('number_ms2', 0) for f in feature_group]
1155
- merged['number_ms2'] = sum(ms2_counts)
1156
-
1157
- # Keep the best quality score
1158
- quality_scores = [f.get('quality', 1.0) for f in feature_group if f.get('quality') is not None]
1159
- if quality_scores:
1160
- merged['quality'] = max(quality_scores)
1161
-
1162
- return merged
1163
-
1164
-
1165
- def _validate_sample_overlap(study, features: list, min_overlap: float) -> list:
1166
- """
1167
- Validate that merged features have sufficient sample overlap.
1168
-
1169
- Args:
1170
- features: List of consensus feature dictionaries
1171
- min_overlap: Minimum sample overlap ratio (0.0-1.0)
1172
-
1173
- Returns:
1174
- List of validated features
1175
- """
1176
- # This is a placeholder for sample overlap validation
1177
- # Implementation would require access to which samples each feature appears in
1178
- # For now, we'll use a simple heuristic based on feature statistics
1179
-
1180
- validated_features = []
1181
- for feature in features:
1182
- # Simple validation based on RT spread and sample count ratio
1183
- rt_spread = feature.get('rt_max', feature['rt']) - feature.get('rt_min', feature['rt'])
1184
- sample_count = feature.get('number_samples', 1)
1185
-
1186
- # Features with very tight RT spread and high sample counts are more reliable
1187
- if rt_spread <= 2.0 or sample_count >= 10: # More permissive validation
1188
- validated_features.append(feature)
1189
- else:
1190
- # Could implement more sophisticated sample overlap checking here
1191
- validated_features.append(feature) # Keep for now
1192
-
1193
- return validated_features
1194
-
1195
-
1196
- def _filter_rt_spread(study, features: list, max_rt_spread: float) -> list:
1197
- """
1198
- Filter out features with excessive RT spread.
1199
-
1200
- Args:
1201
- features: List of consensus feature dictionaries
1202
- max_rt_spread: Maximum allowed RT spread in seconds
1203
-
1204
- Returns:
1205
- List of filtered features
1206
- """
1207
- filtered_features = []
1208
- filtered_count = 0
1209
-
1210
- for feature in features:
1211
- rt_min = feature.get('rt_min', feature['rt'])
1212
- rt_max = feature.get('rt_max', feature['rt'])
1213
- rt_spread = rt_max - rt_min
1214
-
1215
- if rt_spread <= max_rt_spread:
1216
- filtered_features.append(feature)
1217
- else:
1218
- filtered_count += 1
1219
-
1220
- if filtered_count > 0:
1221
- study.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
1222
-
1223
- return filtered_features
1224
-
1225
-
1226
- def _filter_coherence(study, features: list, min_coherence: float) -> list:
1227
- """
1228
- Filter out features with low chromatographic coherence.
1229
-
1230
- Args:
1231
- features: List of consensus feature dictionaries
1232
- min_coherence: Minimum chromatographic coherence score
1233
-
1234
- Returns:
1235
- List of filtered features
1236
- """
1237
- filtered_features = []
1238
- filtered_count = 0
1239
-
1240
- for feature in features:
1241
- coherence = feature.get('chrom_coherence_mean', 1.0) # Default to high coherence if missing
1242
-
1243
- if coherence >= min_coherence:
1244
- filtered_features.append(feature)
1245
- else:
1246
- filtered_count += 1
1247
-
1248
- if filtered_count > 0:
1249
- study.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
1250
-
1251
- return filtered_features
1252
-
1253
-
1254
- def _merge_kd_nowarp(study, params: merge_defaults) -> oms.ConsensusMap:
1255
- """KD-tree based merge without RT warping"""
1256
-
1257
- # Generate temporary feature maps on-demand from features_df
1258
- temp_feature_maps = _generate_feature_maps_on_demand(study)
1259
-
1260
- consensus_map = oms.ConsensusMap()
1261
- file_descriptions = consensus_map.getColumnHeaders()
1262
-
1263
- for i, feature_map in enumerate(temp_feature_maps):
1264
- file_description = file_descriptions.get(i, oms.ColumnHeader())
1265
- file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
1266
- file_description.size = feature_map.size()
1267
- file_description.unique_id = feature_map.getUniqueId()
1268
- file_descriptions[i] = file_description
1269
-
1270
- consensus_map.setColumnHeaders(file_descriptions)
1271
-
1272
- # Configure KD algorithm with warping disabled for memory efficiency
1273
- grouper = oms.FeatureGroupingAlgorithmKD()
1274
- params_oms = grouper.getParameters()
1275
-
1276
- params_oms.setValue("mz_unit", "Da")
1277
- params_oms.setValue("nr_partitions", params.nr_partitions)
1278
- params_oms.setValue("warp:enabled", "false") # Disabled for memory efficiency
1279
- params_oms.setValue("link:rt_tol", params.rt_tol)
1280
- params_oms.setValue("link:mz_tol", params.mz_tol)
1281
- params_oms.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
1282
- params_oms.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
1283
- params_oms.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
1284
- #params_oms.setValue("link:charge_merging", "Any")
1285
-
1286
- grouper.setParameters(params_oms)
1287
- grouper.group(temp_feature_maps, consensus_map)
1288
-
1289
- return consensus_map
1290
-
1291
-
1292
699
  def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1293
700
  """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1294
701
 
@@ -1462,7 +869,7 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
1462
869
 
1463
870
  # Merge chunk results with proper cross-chunk consensus building
1464
871
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1465
- _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
872
+ _dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1466
873
 
1467
874
  # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1468
875
  consensus_map = oms.ConsensusMap()
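At a high level, the chunked methods split the samples into chunk_size blocks, group each block independently (optionally in parallel worker processes), and hand the per-chunk results to _dechunk_results() for cross-chunk merging. A rough outline under those assumptions; process_chunk is a hypothetical stand-in for the real per-chunk worker:

    from concurrent.futures import ProcessPoolExecutor

    def process_chunk(chunk):
        # placeholder: the real worker rebuilds FeatureMaps for these samples,
        # runs KD/QT grouping, and returns serialized consensus features
        return chunk

    def chunked_merge_outline(sample_uids, chunk_size=500, threads=None):
        chunks = [sample_uids[i:i + chunk_size]
                  for i in range(0, len(sample_uids), chunk_size)]
        if threads:
            # run under `if __name__ == "__main__":` when using worker processes
            with ProcessPoolExecutor(max_workers=threads) as pool:
                return list(pool.map(process_chunk, chunks))
        return [process_chunk(c) for c in chunks]
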
@@ -1637,14 +1044,14 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
1637
1044
 
1638
1045
  # Merge chunk results with proper cross-chunk consensus building
1639
1046
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1640
- _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1047
+ _dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1641
1048
 
1642
1049
  # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1643
1050
  consensus_map = oms.ConsensusMap()
1644
1051
  return consensus_map
1645
1052
 
1646
1053
 
1647
- def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1054
+ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1648
1055
  """
1649
1056
  Scalable aggregation of chunk consensus maps into final consensus_df.
1650
1057
 
@@ -1672,7 +1079,7 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
1672
1079
  for row in study.features_df.iter_rows(named=True)
1673
1080
  }
1674
1081
 
1675
- features_lookup = _optimized_feature_lookup(study, study.features_df)
1082
+ features_lookup = __merge_feature_lookup(study, study.features_df)
1676
1083
 
1677
1084
  # Extract all consensus features from chunks with their feature_uids
1678
1085
  all_chunk_consensus = []
@@ -1705,7 +1112,10 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
1705
1112
  if feature_data:
1706
1113
  feature_uids.append(feature_uid)
1707
1114
  feature_data_list.append(feature_data)
1708
- sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
1115
+
1116
+ # Use feature_uid to lookup actual sample_uid instead of chunk position
1117
+ actual_sample_uid = feature_data['sample_uid']
1118
+ sample_uids.append(actual_sample_uid)
1709
1119
 
1710
1120
  if not feature_data_list:
1711
1121
  # No retrievable feature metadata (possible stale map reference) -> skip
@@ -1725,13 +1135,6 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
1725
1135
 
1726
1136
  # Process the consensus features (now all in serialized format)
1727
1137
  for consensus_feature_data in consensus_features_data:
1728
- # ACCEPT ALL consensus features (size >=1) here.
1729
- # Reason: A feature that is globally present in many samples can still
1730
- # appear only once inside a given sample chunk. Early filtering at
1731
- # size>=2 causes irreversible loss and underestimates the final
1732
- # consensus count (observed ~296 vs 950 for KD). We defer filtering
1733
- # strictly to the final global min_samples.
1734
-
1735
1138
  # For parallel processing, feature data is already extracted
1736
1139
  if isinstance(chunk_data, list):
1737
1140
  # Extract feature_uids and data from serialized format for parallel processing
@@ -1749,11 +1152,14 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
1749
1152
  if feature_data:
1750
1153
  feature_uids.append(feature_uid)
1751
1154
  feature_data_list.append(feature_data)
1752
- sample_uids.append(chunk_start_idx + handle_data['map_index'] + 1)
1155
+
1156
+ # Use feature_uid to lookup actual sample_uid instead of chunk position
1157
+ actual_sample_uid = feature_data['sample_uid']
1158
+ sample_uids.append(actual_sample_uid)
1753
1159
 
1754
1160
  if not feature_data_list:
1755
1161
  continue
1756
-
1162
+
1757
1163
  # Get RT/MZ from consensus feature data
1758
1164
  consensus_rt = consensus_feature_data['rt']
1759
1165
  consensus_mz = consensus_feature_data['mz']
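The next hunk replaces the bin-based clustering with a HierarchicalAnchorMerger that matches features across chunks via a scipy cKDTree over scaled (RT, m/z) coordinates, then re-checks candidates against the exact tolerances. A minimal, runnable sketch of that pattern (scales and tolerances mirror the values used below; the sample data is made up):

    import numpy as np
    from scipy.spatial import cKDTree

    rt_tol, mz_tol = 5.0, 0.01        # seconds / Da
    rt_scale, mz_scale = 1.0, 100.0   # bring m/z differences onto the RT magnitude

    feats = [{'rt': 120.0, 'mz': 301.120}, {'rt': 121.5, 'mz': 301.121}, {'rt': 400.0, 'mz': 523.300}]
    points = np.array([[f['rt'] * rt_scale, f['mz'] * mz_scale] for f in feats])
    tree = cKDTree(points)
    radius = np.sqrt((rt_tol * rt_scale) ** 2 + (mz_tol * mz_scale) ** 2)

    for i, f in enumerate(feats):
        for j in tree.query_ball_point(points[i], r=radius):
            if i >= j:
                continue
            # confirm with the exact, unscaled tolerances before merging
            if abs(f['rt'] - feats[j]['rt']) <= rt_tol and abs(f['mz'] - feats[j]['mz']) <= mz_tol:
                print(f"cross-chunk match candidate: {i} <-> {j}")
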
@@ -1818,163 +1224,301 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
1818
1224
  study.consensus_mapping_df = pl.DataFrame()
1819
1225
  return
1820
1226
 
1821
- # Perform cross-chunk clustering using optimized spatial indexing
1822
- def _cluster_chunk_consensus(chunk_consensus_list: list, rt_tol: float, mz_tol: float) -> list:
1823
- """Cluster chunk consensus features using interval overlap (no over-relaxation).
1824
-
1825
- A union is formed if either centroids are within tolerance OR their RT / m/z
1826
- intervals (expanded by tolerance) overlap, and they originate from different chunks.
1227
+ # CROSS-CHUNK DECHUNKING ALGORITHMS
1228
+ # Multiple algorithms available for combining chunk results
1229
+
1230
+ class HierarchicalAnchorMerger:
1231
+ """
1232
+ Hierarchical Anchor Merger: Comprehensive cross-chunk feature preservation.
1233
+ Uses Union-Find clustering for transitive matching across multiple chunks.
1827
1234
  """
1828
- if not chunk_consensus_list:
1829
- return []
1830
-
1831
- n_features = len(chunk_consensus_list)
1832
-
1833
- # Spatial bins using strict tolerances (improves candidate reduction without recall loss)
1834
- rt_bin_size = rt_tol if rt_tol > 0 else 1.0
1835
- mz_bin_size = mz_tol if mz_tol > 0 else 0.01
1836
- features_by_bin = defaultdict(list)
1837
-
1838
- for i, cf in enumerate(chunk_consensus_list):
1839
- rt_bin = int(cf['rt'] / rt_bin_size)
1840
- mz_bin = int(cf['mz'] / mz_bin_size)
1841
- features_by_bin[(rt_bin, mz_bin)].append(i)
1842
-
1843
- class UF:
1844
- def __init__(study, n):
1845
- study.p = list(range(n))
1846
- study.r = [0]*n
1847
- def find(study, x):
1848
- if study.p[x] != x:
1849
- study.p[x] = study.find(study.p[x])
1850
- return study.p[x]
1851
- def union(study, a,b):
1852
- pa, pb = study.find(a), study.find(b)
1853
- if pa == pb:
1854
- return
1855
- if study.r[pa] < study.r[pb]:
1856
- pa, pb = pb, pa
1857
- study.p[pb] = pa
1858
- if study.r[pa] == study.r[pb]:
1859
- study.r[pa] += 1
1860
-
1861
- uf = UF(n_features)
1862
- checked = set()
1863
- for (rtb, mzb), idxs in features_by_bin.items():
1864
- for dr in (-1,0,1):
1865
- for dm in (-1,0,1):
1866
- neigh = (rtb+dr, mzb+dm)
1867
- if neigh not in features_by_bin:
1235
+ def __init__(self, rt_tol: float, mz_tol: float):
1236
+ self.rt_tol = rt_tol
1237
+ self.mz_tol = mz_tol
1238
+
1239
+ def merge(self, chunk_consensus_list: list) -> list:
1240
+ """Fixed hierarchical merging with union-find clustering for complete feature preservation"""
1241
+ if not chunk_consensus_list:
1242
+ return []
1243
+
1244
+ study.logger.debug(f"FIXED HierarchicalAnchorMerger: processing {len(chunk_consensus_list)} chunk features")
1245
+
1246
+ # Union-Find data structure for transitive clustering
1247
+ class UnionFind:
1248
+ def __init__(self, n):
1249
+ self.parent = list(range(n))
1250
+ self.rank = [0] * n
1251
+
1252
+ def find(self, x):
1253
+ if self.parent[x] != x:
1254
+ self.parent[x] = self.find(self.parent[x]) # Path compression
1255
+ return self.parent[x]
1256
+
1257
+ def union(self, x, y):
1258
+ px, py = self.find(x), self.find(y)
1259
+ if px == py:
1260
+ return False # Already in same component
1261
+ # Union by rank for balanced trees
1262
+ if self.rank[px] < self.rank[py]:
1263
+ px, py = py, px
1264
+ self.parent[py] = px
1265
+ if self.rank[px] == self.rank[py]:
1266
+ self.rank[px] += 1
1267
+ return True # Union was performed
1268
+
1269
+ n_features = len(chunk_consensus_list)
1270
+ uf = UnionFind(n_features)
1271
+ merges_made = 0
1272
+
1273
+ # Optimized cross-chunk feature matching using KD-tree spatial indexing
1274
+
1275
+ # Proper dimensional scaling for RT vs m/z
1276
+ rt_scale = 1.0 # RT in seconds (1-30 min range)
1277
+ mz_scale = 100.0 # m/z in Da (100-1000 range) - scale to match RT magnitude
1278
+
1279
+ # Build spatial index with scaled coordinates
1280
+ points = np.array([[f['rt'] * rt_scale, f['mz'] * mz_scale] for f in chunk_consensus_list])
1281
+ tree = cKDTree(points, balanced_tree=True, compact_nodes=True)
1282
+
1283
+ # Calculate proper Euclidean radius in scaled space
1284
+ scaled_rt_tol = self.rt_tol * rt_scale
1285
+ scaled_mz_tol = self.mz_tol * mz_scale
1286
+ radius = np.sqrt(scaled_rt_tol**2 + scaled_mz_tol**2)
1287
+
1288
+ # Efficient neighbor search for feature matching
1289
+ for i in range(n_features):
1290
+ feature_i = chunk_consensus_list[i]
1291
+ chunk_i = feature_i.get('chunk_idx', -1)
1292
+
1293
+ # Query spatial index for nearby features
1294
+ neighbor_indices = tree.query_ball_point(points[i], r=radius, p=2)
1295
+
1296
+ for j in neighbor_indices:
1297
+ if i >= j: # Skip duplicates and self
1868
1298
  continue
1869
- for i in idxs:
1870
- for j in features_by_bin[neigh]:
1871
- if i >= j:
1872
- continue
1873
- pair = (i,j)
1874
- if pair in checked:
1875
- continue
1876
- checked.add(pair)
1877
- a = chunk_consensus_list[i]
1878
- b = chunk_consensus_list[j]
1879
- if a['chunk_idx'] == b['chunk_idx']:
1880
- continue
1881
-
1882
- # Primary check: centroid distance (strict)
1883
- centroid_close = (abs(a['rt']-b['rt']) <= rt_tol and abs(a['mz']-b['mz']) <= mz_tol)
1884
-
1885
- # Secondary check: interval overlap (more conservative)
1886
- # Only allow interval overlap if centroids are reasonably close (within 2x tolerance)
1887
- centroids_reasonable = (abs(a['rt']-b['rt']) <= 2 * rt_tol and abs(a['mz']-b['mz']) <= 2 * mz_tol)
1888
- if centroids_reasonable:
1889
- rt_overlap = (a['rt_min'] - rt_tol/2) <= (b['rt_max'] + rt_tol/2) and (b['rt_min'] - rt_tol/2) <= (a['rt_max'] + rt_tol/2)
1890
- mz_overlap = (a['mz_min'] - mz_tol/2) <= (b['mz_max'] + mz_tol/2) and (b['mz_min'] - mz_tol/2) <= (a['mz_max'] + mz_tol/2)
1891
- else:
1892
- rt_overlap = mz_overlap = False
1893
-
1894
- if centroid_close or (rt_overlap and mz_overlap):
1895
- uf.union(i,j)
1896
-
1897
- groups_by_root = defaultdict(list)
1898
- for i in range(n_features):
1899
- groups_by_root[uf.find(i)].append(chunk_consensus_list[i])
1900
- return list(groups_by_root.values())
1901
- # (Obsolete relaxed + centroid stitching code removed.)
1902
-
1903
- # --- Stage 1: initial cross-chunk clustering of chunk consensus features ---
1904
- initial_groups = _cluster_chunk_consensus(all_chunk_consensus, params.rt_tol, params.mz_tol)
1905
-
1906
- # --- Stage 2: centroid refinement (lightweight second pass) ---
1907
- def _refine_groups(groups: list, rt_tol: float, mz_tol: float) -> list:
1908
- """Refine groups by clustering group centroids (single-link) under same tolerances.
1909
-
1910
- This reconciles borderline splits left after interval-overlap clustering without
1911
- re-introducing broad over-merging. Works on group centroids only (low cost).
1299
+
1300
+ feature_j = chunk_consensus_list[j]
1301
+ chunk_j = feature_j.get('chunk_idx', -1)
1302
+
1303
+ # Skip features from same chunk (already clustered within chunk)
1304
+ if chunk_i == chunk_j:
1305
+ continue
1306
+
1307
+ # Confirm candidates against the exact RT and m/z tolerances (the scaled radius is only a pre-filter)
1308
+ rt_diff = abs(feature_i['rt'] - feature_j['rt'])
1309
+ mz_diff = abs(feature_i['mz'] - feature_j['mz'])
1310
+
1311
+ if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
1312
+ if uf.union(i, j): # Merge if not already connected
1313
+ merges_made += 1
1314
+
1315
+ study.logger.debug(f"FIXED HierarchicalAnchorMerger: made {merges_made} cross-chunk merges")
1316
+
1317
+ # Group features by their connected component
1318
+ clusters = {}
1319
+ for i in range(n_features):
1320
+ root = uf.find(i)
1321
+ if root not in clusters:
1322
+ clusters[root] = []
1323
+ clusters[root].append(chunk_consensus_list[i])
1324
+
1325
+ # Merge each cluster into a single consensus feature
1326
+ result = []
1327
+ for cluster_features in clusters.values():
1328
+ merged = self._merge_cluster(cluster_features)
1329
+ result.append(merged)
1330
+
1331
+ study.logger.debug(f"FIXED HierarchicalAnchorMerger: output {len(result)} merged features (from {n_features} inputs)")
1332
+
1333
+ # Sanity check: merging can only reduce the feature count, never increase it
1334
+ if len(result) > len(chunk_consensus_list):
1335
+ study.logger.warning(f"HierarchicalAnchorMerger: more outputs than inputs ({len(result)} > {n_features})")
1336
+
1337
+ return result
1338
+
1339
+ def _merge_cluster(self, cluster: list) -> dict:
1340
+ """Merge cluster using sample-weighted consensus with robust error handling"""
1341
+ if len(cluster) == 1:
1342
+ return cluster[0] # No merging needed for single feature
1343
+
1344
+ # Calculate weights robustly to prevent division by zero
1345
+ weights = []
1346
+ for c in cluster:
1347
+ sample_count = c.get('sample_count', 0)
1348
+ # Use minimum weight of 1 to prevent zero weights
1349
+ weights.append(max(sample_count, 1))
1350
+
1351
+ total_weight = sum(weights)
1352
+ # Fallback for edge cases
1353
+ if total_weight == 0:
1354
+ total_weight = len(cluster)
1355
+ weights = [1] * len(cluster)
1356
+
1357
+ # Weighted consensus for RT/mz coordinates
1358
+ merged = {
1359
+ 'consensus_id': cluster[0]['consensus_id'], # Use first feature's ID
1360
+ 'chunk_indices': [c.get('chunk_idx', 0) for c in cluster],
1361
+ 'mz': sum(c['mz'] * w for c, w in zip(cluster, weights)) / total_weight,
1362
+ 'rt': sum(c['rt'] * w for c, w in zip(cluster, weights)) / total_weight,
1363
+ 'intensity': sum(c.get('intensity', 0) for c in cluster),
1364
+ 'quality': sum(c.get('quality', 1) * w for c, w in zip(cluster, weights)) / total_weight,
1365
+ 'feature_uids': [],
1366
+ 'feature_data_list': [],
1367
+ 'sample_uids': [],
1368
+ 'sample_count': 0
1369
+ }
1370
+
1371
+ # Aggregate all features and samples from all chunks
1372
+ all_feature_uids = []
1373
+ all_feature_data = []
1374
+ all_sample_uids = []
1375
+
1376
+ for chunk in cluster:
1377
+ # Collect feature UIDs
1378
+ chunk_feature_uids = chunk.get('feature_uids', [])
1379
+ all_feature_uids.extend(chunk_feature_uids)
1380
+
1381
+ # Collect feature data
1382
+ chunk_feature_data = chunk.get('feature_data_list', [])
1383
+ all_feature_data.extend(chunk_feature_data)
1384
+
1385
+ # Collect sample UIDs
1386
+ chunk_sample_uids = chunk.get('sample_uids', [])
1387
+ all_sample_uids.extend(chunk_sample_uids)
1388
+
1389
+ # Remove duplicates properly and count unique samples
1390
+ merged['feature_uids'] = list(set(all_feature_uids))
1391
+ merged['feature_data_list'] = all_feature_data # Keep all feature data
1392
+ merged['sample_uids'] = list(set(all_sample_uids)) # Unique sample UIDs only
1393
+ merged['sample_count'] = len(merged['sample_uids']) # Count of unique samples
1394
+
1395
+ return merged
1396
+
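For readers tracing the dechunking logic, the two methods above boil down to: link chunk-level features whose RT and m/z both fall within tolerance, take connected components with union-find, then collapse each component with a sample-count-weighted average (for example, RT 100 s seen in 4 samples merged with RT 103 s seen in 1 sample gives (100*4 + 103*1)/5 = 100.6 s). Below is a minimal standalone sketch of the clustering step; the function name, tolerances, and demo values are illustrative and not part of the masster API.

    # Illustrative sketch (not masster code): union-find clustering of (rt, mz) features
    # under rectangular tolerances, as used for cross-chunk merging above.
    def cluster_features(features, rt_tol=5.0, mz_tol=0.01):
        n = len(features)
        parent = list(range(n))

        def find(x):
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving keeps trees shallow
                x = parent[x]
            return x

        def union(a, b):
            ra, rb = find(a), find(b)
            if ra != rb:
                parent[rb] = ra

        for i in range(n):
            for j in range(i + 1, n):
                if (abs(features[i]['rt'] - features[j]['rt']) <= rt_tol
                        and abs(features[i]['mz'] - features[j]['mz']) <= mz_tol):
                    union(i, j)

        groups = {}
        for i in range(n):
            groups.setdefault(find(i), []).append(features[i])
        return list(groups.values())

    # Two nearby features collapse into one group; the third stays separate.
    demo = [{'rt': 100.0, 'mz': 300.100}, {'rt': 102.0, 'mz': 300.105}, {'rt': 250.0, 'mz': 400.2}]
    print([len(g) for g in cluster_features(demo)])  # -> [2, 1]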
1397
+ class KDTreeSpatialMerger:
1912
1398
  """
1913
- if len(groups) <= 1:
1914
- return groups
1915
- # Build centroid list
1916
- centroids = [] # (idx, rt, mz)
1917
- for gi, g in enumerate(groups):
1918
- if not g:
1919
- continue
1920
- rt_vals = [cf['rt'] for cf in g]
1921
- mz_vals = [cf['mz'] for cf in g]
1922
- if not rt_vals or not mz_vals:
1923
- continue
1924
- centroids.append((gi, float(np.mean(rt_vals)), float(np.mean(mz_vals))))
1925
- if len(centroids) <= 1:
1926
- return groups
1927
-
1928
- # Spatial binning for centroid clustering
1929
- rt_bin = rt_tol if rt_tol > 0 else 1.0
1930
- mz_bin = mz_tol if mz_tol > 0 else 0.01
1931
- bins = defaultdict(list)
1932
- for idx, rt_c, mz_c in centroids:
1933
- bins[(int(rt_c/rt_bin), int(mz_c/mz_bin))].append((idx, rt_c, mz_c))
1934
-
1935
- # Union-Find over group indices
1936
- parent = list(range(len(groups)))
1937
- rank = [0]*len(groups)
1938
- def find(x):
1939
- if parent[x] != x:
1940
- parent[x] = find(parent[x])
1941
- return parent[x]
1942
- def union(a,b):
1943
- pa, pb = find(a), find(b)
1944
- if pa == pb:
1945
- return
1946
- if rank[pa] < rank[pb]:
1947
- pa, pb = pb, pa
1948
- parent[pb] = pa
1949
- if rank[pa] == rank[pb]:
1950
- rank[pa] += 1
1951
-
1952
- checked = set()
1953
- for (rb, mb), items in bins.items():
1954
- for dr in (-1,0,1):
1955
- for dm in (-1,0,1):
1956
- neigh_key = (rb+dr, mb+dm)
1957
- if neigh_key not in bins:
1399
+ KD-tree spatial merger: tolerance-scaled neighbor search over chunk consensus features, processing high-sample features first.
1400
+ """
1401
+ def __init__(self, rt_tol: float, mz_tol: float):
1402
+ self.rt_tol = rt_tol
1403
+ self.mz_tol = mz_tol
1404
+
1405
+ def merge(self, chunk_consensus_list: list) -> list:
1406
+ """KD-tree based spatial merging"""
1407
+ if not chunk_consensus_list:
1408
+ return []
1409
+
1410
+ try:
1411
+ from scipy.spatial import cKDTree
1412
+ import numpy as np
1413
+ except ImportError:
1414
+ # Fallback to simple clustering if scipy not available
1415
+ return self._fallback_merge(chunk_consensus_list)
1416
+
1417
+ # Build spatial index
1418
+ points = np.array([[c['rt'], c['mz']] for c in chunk_consensus_list])
1419
+ tree = cKDTree(points)
1420
+
1421
+ # Scale tolerances for KD-tree query
1422
+ rt_scale = 1.0 / self.rt_tol if self.rt_tol > 0 else 1.0
1423
+ mz_scale = 1.0 / self.mz_tol if self.mz_tol > 0 else 1.0
1424
+ scaled_points = points * np.array([rt_scale, mz_scale])
1425
+ scaled_tree = cKDTree(scaled_points)
1426
+
1427
+ clusters = []
1428
+ used = set()
1429
+
1430
+ # Process features present in many samples (>= 100) first so they anchor clusters
1431
+ high_sample_indices = [i for i, c in enumerate(chunk_consensus_list) if c['sample_count'] >= 100]
1432
+ remaining_indices = [i for i in range(len(chunk_consensus_list)) if i not in high_sample_indices]
1433
+
1434
+ for idx in high_sample_indices + remaining_indices:
1435
+ if idx in used:
1436
+ continue
1437
+
1438
+ # Find neighbors in scaled space
1439
+ neighbors = scaled_tree.query_ball_point(scaled_points[idx], r=1.0)
1440
+ cluster_indices = [i for i in neighbors if i not in used and i != idx]
1441
+ cluster_indices.append(idx)
1442
+
1443
+ if cluster_indices:
1444
+ cluster = [chunk_consensus_list[i] for i in cluster_indices]
1445
+ clusters.append(self._merge_cluster(cluster))
1446
+ used.update(cluster_indices)
1447
+
1448
+ return clusters
1449
+
1450
+ def _fallback_merge(self, chunk_consensus_list: list) -> list:
1451
+ """Simple distance-based fallback when scipy unavailable"""
1452
+ clusters = []
1453
+ used = set()
1454
+
1455
+ for i, anchor in enumerate(chunk_consensus_list):
1456
+ if i in used:
1457
+ continue
1458
+
1459
+ cluster = [anchor]
1460
+ used.add(i)
1461
+
1462
+ for j, candidate in enumerate(chunk_consensus_list):
1463
+ if j in used or j == i:
1958
1464
  continue
1959
- for (gi, rt_i, mz_i) in items:
1960
- for (gj, rt_j, mz_j) in bins[neigh_key]:
1961
- if gi >= gj:
1962
- continue
1963
- pair = (gi, gj)
1964
- if pair in checked:
1965
- continue
1966
- checked.add(pair)
1967
- if abs(rt_i-rt_j) <= rt_tol and abs(mz_i-mz_j) <= mz_tol:
1968
- union(gi, gj)
1969
-
1970
- merged = defaultdict(list)
1971
- for gi, g in enumerate(groups):
1972
- merged[find(gi)].extend(g)
1973
- return list(merged.values())
1974
-
1975
- refined_groups = _refine_groups(initial_groups, params.rt_tol, params.mz_tol)
1976
-
1977
- # --- Stage 3: build final consensus feature metadata and mapping ---
1465
+
1466
+ rt_diff = abs(candidate['rt'] - anchor['rt'])
1467
+ mz_diff = abs(candidate['mz'] - anchor['mz'])
1468
+
1469
+ if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
1470
+ cluster.append(candidate)
1471
+ used.add(j)
1472
+
1473
+ clusters.append(self._merge_cluster(cluster))
1474
+
1475
+ return clusters
1476
+
1477
+ def _merge_cluster(self, cluster: list) -> dict:
1478
+ """Merge cluster with intensity-weighted consensus"""
1479
+ if len(cluster) == 1:
1480
+ return cluster[0]
1481
+
1482
+ # Weight by intensity for spatial accuracy
1483
+ total_intensity = sum(c['intensity'] for c in cluster)
1484
+
1485
+ merged = {
1486
+ 'consensus_id': cluster[0]['consensus_id'],
1487
+ 'chunk_indices': [c['chunk_idx'] for c in cluster],
1488
+ 'mz': sum(c['mz'] * c['intensity'] for c in cluster) / total_intensity,
1489
+ 'rt': sum(c['rt'] * c['intensity'] for c in cluster) / total_intensity,
1490
+ 'intensity': total_intensity,
1491
+ 'quality': sum(c['quality'] for c in cluster) / len(cluster),
1492
+ 'feature_uids': [],
1493
+ 'feature_data_list': [],
1494
+ 'sample_uids': [],
1495
+ 'sample_count': 0
1496
+ }
1497
+
1498
+ # Aggregate features
1499
+ for chunk in cluster:
1500
+ merged['feature_uids'].extend(chunk['feature_uids'])
1501
+ merged['feature_data_list'].extend(chunk['feature_data_list'])
1502
+ merged['sample_uids'].extend(chunk['sample_uids'])
1503
+
1504
+ merged['feature_uids'] = list(set(merged['feature_uids']))
1505
+ merged['sample_count'] = len(set(merged['sample_uids']))
1506
+
1507
+ return merged
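A note on the tolerance scaling used by KDTreeSpatialMerger above: dividing RT by rt_tol and m/z by mz_tol and querying a Euclidean radius of 1.0 accepts slightly fewer pairs than the rectangular check |dRT| <= rt_tol and |dm/z| <= mz_tol, because the unit circle sits inside the unit square. One way to make a KD-tree query match the rectangular tolerances exactly is a Chebyshev (p=inf) ball query on the scaled coordinates, sketched below with made-up values; this is an illustration, not the package implementation.

    import numpy as np
    from scipy.spatial import cKDTree

    def box_neighbors(rts, mzs, rt_tol=5.0, mz_tol=0.01):
        # Scale each axis by 1/tolerance so r=1.0 with the max-norm equals the tolerance box.
        pts = np.column_stack([np.asarray(rts) / rt_tol, np.asarray(mzs) / mz_tol])
        tree = cKDTree(pts)
        # For each point, indices of all points within both tolerances (self included).
        return tree.query_ball_point(pts, r=1.0, p=np.inf)

    rts = [100.0, 104.0, 250.0]
    mzs = [300.100, 300.108, 400.200]
    print(box_neighbors(rts, mzs))  # points 0 and 1 see each other; point 2 only itself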
1508
+ # Select the dechunking algorithm based on params.dechunking
1509
+ if params.dechunking == "hierarchical":
1510
+ merger = HierarchicalAnchorMerger(params.rt_tol, params.mz_tol)
1511
+ final_consensus = merger.merge(all_chunk_consensus)
1512
+ elif params.dechunking == "kdtree":
1513
+ merger = KDTreeSpatialMerger(params.rt_tol, params.mz_tol)
1514
+ final_consensus = merger.merge(all_chunk_consensus)
1515
+ else:
1516
+ raise ValueError(f"Invalid dechunking method '{params.dechunking}'. Must be one of: ['hierarchical', 'kdtree']")
1517
+
1518
+ # --- Cross-chunk clustering using the selected dechunking algorithm ---
1519
+ # The selected merger already returns final consensus features, so no further refinement pass is needed.
1520
+ # Wrap each merged feature in a single-element "group" for compatibility with the downstream code.
1521
+ refined_groups = [[feature] for feature in final_consensus]
1978
1522
  consensus_metadata = []
1979
1523
  consensus_mapping_list = []
1980
1524
  consensus_uid_counter = 0
@@ -2011,7 +1555,6 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
2011
1555
 
2012
1556
  number_samples = len(sample_uids_acc)
2013
1557
 
2014
- # NOTE: Don't filter by min_samples here - let _finalize_merge handle it
2015
1558
  # This allows proper cross-chunk consensus building before final filtering
2016
1559
 
2017
1560
  metadata = _calculate_consensus_statistics(
@@ -2028,13 +1571,29 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
2028
1571
  cached_valid_adducts=cached_valid_adducts,
2029
1572
  )
2030
1573
 
2031
- # Validate RT spread doesn't exceed tolerance (with some flexibility for chunked merge)
1574
+ # Validate RT and m/z spread don't exceed tolerance limits
2032
1575
  rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
2033
- max_allowed_spread = params.rt_tol * 2 # Allow 2x tolerance for chunked method
1576
+ mz_spread = metadata.get('mz_max', 0) - metadata.get('mz_min', 0)
1577
+ max_allowed_rt_spread = params.rt_tol * 2 # Allow 2x tolerance for chunked method
1578
+ max_allowed_mz_spread = params.mz_tol * 2 # Enforce strict m/z spread limit
1579
+
1580
+ skip_feature = False
1581
+ skip_reason = ""
2034
1582
 
2035
- if rt_spread > max_allowed_spread:
2036
- # Skip consensus features with excessive RT spread
2037
- study.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
1583
+ if rt_spread > max_allowed_rt_spread:
1584
+ skip_feature = True
1585
+ skip_reason = f"RT spread {rt_spread:.3f}s > {max_allowed_rt_spread:.3f}s"
1586
+
1587
+ if mz_spread > max_allowed_mz_spread:
1588
+ skip_feature = True
1589
+ if skip_reason:
1590
+ skip_reason += f" AND m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
1591
+ else:
1592
+ skip_reason = f"m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
1593
+
1594
+ if skip_feature:
1595
+ # Skip consensus features with excessive spread
1596
+ study.logger.debug(f"Skipping consensus feature {consensus_uid_counter}: {skip_reason}")
2038
1597
  consensus_uid_counter += 1
2039
1598
  continue
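To make the new gate concrete with hypothetical numbers: with rt_tol = 5 s and mz_tol = 0.01 Da the allowed spreads are 10 s and 0.02 Da, so a candidate spanning 12 s in RT is skipped even if its m/z spread is fine, and the log message names whichever limits were exceeded. A minimal sketch of the same check as a standalone function (names and values are illustrative):

    def spread_check(rt_min, rt_max, mz_min, mz_max, rt_tol, mz_tol):
        # Returns (keep, reason); mirrors the 2x-tolerance gate used for chunked merges.
        reasons = []
        if rt_max - rt_min > 2 * rt_tol:
            reasons.append(f"RT spread {rt_max - rt_min:.3f}s > {2 * rt_tol:.3f}s")
        if mz_max - mz_min > 2 * mz_tol:
            reasons.append(f"m/z spread {mz_max - mz_min:.4f} Da > {2 * mz_tol:.4f} Da")
        return (len(reasons) == 0), " AND ".join(reasons)

    print(spread_check(100.0, 112.0, 300.100, 300.112, rt_tol=5.0, mz_tol=0.01))
    # -> (False, 'RT spread 12.000s > 10.000s')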
2040
1599
 
@@ -2043,6 +1602,7 @@ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaul
2043
1602
  # Build mapping rows (deduplicated)
2044
1603
  for fid, fd in feature_data_acc.items():
2045
1604
  samp_uid = fd.get('sample_uid') or fd.get('sample_id') or fd.get('sample')
1605
+
2046
1606
  # If absent we attempt to derive from original group sample_uids pairing
2047
1607
  # but most feature_data rows should include sample_uid already.
2048
1608
  if samp_uid is None:
@@ -2238,80 +1798,6 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
2238
1798
  }
2239
1799
 
2240
1800
 
2241
- def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) -> list:
2242
- """
2243
- Cluster consensus features from different chunks based on RT and m/z similarity.
2244
-
2245
- Args:
2246
- features: List of feature dictionaries with 'mz', 'rt', 'id' keys
2247
- rt_tol: RT tolerance in seconds
2248
- mz_tol: m/z tolerance in Da
2249
-
2250
- Returns:
2251
- List of groups, where each group is a list of feature dictionaries
2252
- """
2253
- if not features:
2254
- return []
2255
-
2256
- # Use Union-Find for efficient clustering
2257
- class UnionFind:
2258
- def __init__(study, n):
2259
- study.parent = list(range(n))
2260
- study.rank = [0] * n
2261
-
2262
- def find(study, x):
2263
- if study.parent[x] != x:
2264
- study.parent[x] = study.find(study.parent[x])
2265
- return study.parent[x]
2266
-
2267
- def union(study, x, y):
2268
- px, py = study.find(x), study.find(y)
2269
- if px == py:
2270
- return
2271
- if study.rank[px] < study.rank[py]:
2272
- px, py = py, px
2273
- study.parent[py] = px
2274
- if study.rank[px] == study.rank[py]:
2275
- study.rank[px] += 1
2276
-
2277
- n_features = len(features)
2278
- uf = UnionFind(n_features)
2279
-
2280
- # Build distance matrix and cluster features within tolerance
2281
- for i in range(n_features):
2282
- for j in range(i + 1, n_features):
2283
- feat_i = features[i]
2284
- feat_j = features[j]
2285
-
2286
- # Skip if features are from the same chunk (they're already processed)
2287
- if feat_i['chunk_idx'] == feat_j['chunk_idx']:
2288
- continue
2289
-
2290
- mz_diff = abs(feat_i['mz'] - feat_j['mz'])
2291
- rt_diff = abs(feat_i['rt'] - feat_j['rt'])
2292
-
2293
- # Cluster if within tolerance
2294
- if mz_diff <= mz_tol and rt_diff <= rt_tol:
2295
- uf.union(i, j)
2296
-
2297
- # Extract groups
2298
- groups_by_root = {}
2299
- for i in range(n_features):
2300
- root = uf.find(i)
2301
- if root not in groups_by_root:
2302
- groups_by_root[root] = []
2303
- groups_by_root[root].append(features[i])
2304
-
2305
- return list(groups_by_root.values())
2306
-
2307
-
2308
- def _reset_consensus_data(study):
2309
- """Reset consensus-related DataFrames at the start of merge."""
2310
- study.consensus_df = pl.DataFrame()
2311
- study.consensus_ms2 = pl.DataFrame()
2312
- study.consensus_mapping_df = pl.DataFrame()
2313
-
2314
-
2315
1801
  def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
2316
1802
  """Extract consensus features and build metadata."""
2317
1803
  # create a dict to map uid to feature_uid using study.features_df
@@ -2324,7 +1810,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2324
1810
  study.logger.debug(f"Found {imax} feature groups by clustering.")
2325
1811
 
2326
1812
  # Pre-build fast lookup tables for features_df data using optimized approach
2327
- features_lookup = _optimized_feature_lookup(study, study.features_df)
1813
+ features_lookup = __merge_feature_lookup(study, study.features_df)
2328
1814
 
2329
1815
  # create a list to store the consensus mapping
2330
1816
  consensus_mapping = []
@@ -2752,7 +2238,11 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2752
2238
  pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
2753
2239
  )
2754
2240
 
2755
- study.consensus_map = consensus_map
2241
+ # Log final counts
2242
+ study.logger.info(
2243
+ f"Extracted {len(study.consensus_df)} consensus features with "
2244
+ f"at least {min_samples} samples."
2245
+ )
2756
2246
 
2757
2247
 
2758
2248
  def _perform_adduct_grouping(study, rt_tol, mz_tol):
@@ -2775,7 +2265,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2775
2265
  )
2776
2266
 
2777
2267
  # Use optimized adduct grouping
2778
- adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
2268
+ adduct_group_list, adduct_of_list = __merge_adduct_grouping(
2779
2269
  study, consensus_data, rt_tol, mz_tol
2780
2270
  )
2781
2271
 
@@ -2802,70 +2292,186 @@ def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> i
2802
2292
  if len(study.consensus_df) < 2:
2803
2293
  return 0
2804
2294
 
2805
- # Extract consensus feature data
2806
- consensus_data = []
2807
- for row in study.consensus_df.iter_rows(named=True):
2808
- consensus_data.append({
2809
- 'consensus_uid': row['consensus_uid'],
2810
- 'mz': row['mz'],
2811
- 'rt': row['rt']
2812
- })
2813
-
2814
- # Build spatial index using bins
2815
- rt_bin_size = rt_tol / 2
2816
- mz_bin_size = mz_tol / 2
2817
-
2818
- bins = defaultdict(list)
2819
- for feature in consensus_data:
2820
- rt_bin = int(feature['rt'] / rt_bin_size)
2821
- mz_bin = int(feature['mz'] / mz_bin_size)
2822
- bins[(rt_bin, mz_bin)].append(feature)
2295
+ # Extract consensus feature coordinates efficiently
2296
+ feature_coords = study.consensus_df.select([
2297
+ pl.col("consensus_uid"),
2298
+ pl.col("mz"),
2299
+ pl.col("rt")
2300
+ ]).to_numpy()
2823
2301
 
2824
- processed_features = set()
2302
+ n_features = len(feature_coords)
2303
+ processed = [False] * n_features
2825
2304
  tight_clusters_count = 0
2826
2305
 
2827
- for bin_key, bin_features in bins.items():
2828
- if len(bin_features) < 2:
2306
+ # Greedy pairwise tolerance check over the feature coordinates
2307
+ for i in range(n_features):
2308
+ if processed[i]:
2829
2309
  continue
2830
2310
 
2831
- # Check neighboring bins for additional features
2832
- rt_bin, mz_bin = bin_key
2833
- all_nearby_features = list(bin_features)
2834
-
2835
- # Check 8 neighboring bins
2836
- for drt in [-1, 0, 1]:
2837
- for dmz in [-1, 0, 1]:
2838
- if drt == 0 and dmz == 0:
2839
- continue
2840
- neighbor_key = (rt_bin + drt, mz_bin + dmz)
2841
- if neighbor_key in bins:
2842
- all_nearby_features.extend(bins[neighbor_key])
2311
+ # Find all features within tolerance of feature i
2312
+ cluster_members = [i]
2313
+ rt_i, mz_i = feature_coords[i][2], feature_coords[i][1]
2843
2314
 
2844
- # Filter to features within actual tolerances and not yet processed
2845
- valid_cluster_features = []
2846
- for feature in all_nearby_features:
2847
- if feature['consensus_uid'] in processed_features:
2315
+ for j in range(i + 1, n_features):
2316
+ if processed[j]:
2848
2317
  continue
2849
2318
 
2850
- # Check if this feature is within tolerances of any bin feature
2851
- for bin_feature in bin_features:
2852
- rt_diff = abs(feature['rt'] - bin_feature['rt'])
2853
- mz_diff = abs(feature['mz'] - bin_feature['mz'])
2854
-
2855
- if rt_diff <= rt_tol and mz_diff <= mz_tol:
2856
- valid_cluster_features.append(feature)
2857
- break
2319
+ rt_j, mz_j = feature_coords[j][2], feature_coords[j][1]
2320
+
2321
+ if abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol:
2322
+ cluster_members.append(j)
2858
2323
 
2859
- # Count as tight cluster if we have multiple features
2860
- if len(valid_cluster_features) >= 2:
2324
+ # Mark cluster as tight if it has 2+ members
2325
+ if len(cluster_members) >= 2:
2861
2326
  tight_clusters_count += 1
2862
- for feature in valid_cluster_features:
2863
- processed_features.add(feature['consensus_uid'])
2327
+ for idx in cluster_members:
2328
+ processed[idx] = True
2864
2329
 
2865
2330
  return tight_clusters_count
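The rewritten _count_tight_clusters above compares pairs in a plain Python loop. If this ever becomes a hotspot, one possible alternative (memory permitting, since it builds n-by-n matrices) is to compute both difference matrices with NumPy broadcasting; the sketch below is illustrative and reuses the function's default tolerances.

    import numpy as np

    def tight_pair_mask(rts, mzs, rt_tol=0.3, mz_tol=0.04):
        # Boolean n x n mask: True where two features fall within both tolerances.
        rts = np.asarray(rts, dtype=float)
        mzs = np.asarray(mzs, dtype=float)
        mask = (np.abs(rts[:, None] - rts[None, :]) <= rt_tol) & \
               (np.abs(mzs[:, None] - mzs[None, :]) <= mz_tol)
        np.fill_diagonal(mask, False)  # ignore self-pairs
        return mask

    mask = tight_pair_mask([10.0, 10.1, 50.0], [200.00, 200.01, 300.00])
    print(mask.any(axis=1))  # -> [ True  True False]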
2866
2331
 
2867
2332
 
2868
- def _consensus_cleanup(study, rt_tol, mz_tol):
2333
+ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
2334
+ """
2335
+ Merge partial consensus features that likely represent the same compound but were
2336
+ split across chunks. This is specifically for chunked methods.
2337
+ """
2338
+ if len(study.consensus_df) == 0:
2339
+ return
2340
+
2341
+ initial_count = len(study.consensus_df)
2342
+ study.logger.debug(f"Post-processing chunked results: merging partial consensus features from {initial_count} features")
2343
+
2344
+ # Convert to list of dictionaries for easier processing
2345
+ consensus_features = []
2346
+ for row in study.consensus_df.iter_rows(named=True):
2347
+ consensus_features.append({
2348
+ 'consensus_uid': row['consensus_uid'],
2349
+ 'rt': row['rt'],
2350
+ 'mz': row['mz'],
2351
+ 'number_samples': row.get('number_samples', 0),
2352
+ 'inty_mean': row.get('inty_mean', 0.0)
2353
+ })
2354
+
2355
+ # Use Union-Find to group features that should be merged
2356
+ class UnionFind:
2357
+ def __init__(self, n):
2358
+ self.parent = list(range(n))
2359
+
2360
+ def find(self, x):
2361
+ if self.parent[x] != x:
2362
+ self.parent[x] = self.find(self.parent[x])
2363
+ return self.parent[x]
2364
+
2365
+ def union(self, x, y):
2366
+ px, py = self.find(x), self.find(y)
2367
+ if px != py:
2368
+ self.parent[py] = px
2369
+
2370
+ n_features = len(consensus_features)
2371
+ uf = UnionFind(n_features)
2372
+
2373
+ # Find features that should be merged using original tolerances
2374
+ for i in range(n_features):
2375
+ for j in range(i + 1, n_features):
2376
+ feature_a = consensus_features[i]
2377
+ feature_b = consensus_features[j]
2378
+
2379
+ rt_diff = abs(feature_a['rt'] - feature_b['rt'])
2380
+ mz_diff = abs(feature_a['mz'] - feature_b['mz'])
2381
+
2382
+ # Merge if within tolerance
2383
+ if rt_diff <= rt_tol and mz_diff <= mz_tol:
2384
+ uf.union(i, j)
2385
+
2386
+ # Group features by their root
2387
+ groups = {}
2388
+ for i, feature in enumerate(consensus_features):
2389
+ root = uf.find(i)
2390
+ if root not in groups:
2391
+ groups[root] = []
2392
+ groups[root].append(consensus_features[i])
2393
+
2394
+ # Create merged features
2395
+ merged_features = []
2396
+ merged_mapping_data = []
2397
+ uids_to_remove = set()
2398
+
2399
+ for group in groups.values():
2400
+ if len(group) < 2:
2401
+ # Single feature, keep as is
2402
+ continue
2403
+ else:
2404
+ # Multiple features, merge them
2405
+ # Find best representative feature (highest sample count, then intensity)
2406
+ best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
2407
+
2408
+ # Calculate merged properties
2409
+ total_samples = sum(f['number_samples'] for f in group)
2410
+ weighted_rt = sum(f['rt'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['rt']
2411
+ weighted_mz = sum(f['mz'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['mz']
2412
+ mean_intensity = sum(f['inty_mean'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['inty_mean']
2413
+
2414
+ # Keep the best feature's UID but update its properties
2415
+ merged_features.append({
2416
+ 'consensus_uid': best_feature['consensus_uid'],
2417
+ 'rt': weighted_rt,
2418
+ 'mz': weighted_mz,
2419
+ 'number_samples': total_samples,
2420
+ 'inty_mean': mean_intensity
2421
+ })
2422
+
2423
+ # Mark other features for removal
2424
+ for f in group:
2425
+ if f['consensus_uid'] != best_feature['consensus_uid']:
2426
+ uids_to_remove.add(f['consensus_uid'])
2427
+
2428
+ if merged_features:
2429
+ study.logger.debug(f"Merging {len(merged_features)} groups of partial consensus features")
2430
+
2431
+ # Update consensus_df with merged features
2432
+ for merged_feature in merged_features:
2433
+ study.consensus_df = study.consensus_df.with_columns([
2434
+ pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2435
+ .then(pl.lit(merged_feature['rt']))
2436
+ .otherwise(pl.col('rt'))
2437
+ .alias('rt'),
2438
+
2439
+ pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2440
+ .then(pl.lit(merged_feature['mz']))
2441
+ .otherwise(pl.col('mz'))
2442
+ .alias('mz'),
2443
+
2444
+ pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2445
+ .then(pl.lit(merged_feature['number_samples']))
2446
+ .otherwise(pl.col('number_samples'))
2447
+ .alias('number_samples'),
2448
+
2449
+ pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2450
+ .then(pl.lit(merged_feature['inty_mean']))
2451
+ .otherwise(pl.col('inty_mean'))
2452
+ .alias('inty_mean')
2453
+ ])
2454
+
2455
+ # Remove duplicate features
2456
+ if uids_to_remove:
2457
+ study.consensus_df = study.consensus_df.filter(
2458
+ ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2459
+ )
2460
+
2461
+ # Also update consensus_mapping_df - reassign mappings from removed UIDs
2462
+ if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2463
+ study.consensus_mapping_df = study.consensus_mapping_df.with_columns(
2464
+ pl.when(pl.col('consensus_uid').is_in(list(uids_to_remove)))
2465
+ .then(pl.lit(None)) # Will be handled by subsequent operations
2466
+ .otherwise(pl.col('consensus_uid'))
2467
+ .alias('consensus_uid')
2468
+ )
2469
+
2470
+ final_count = len(study.consensus_df)
2471
+ study.logger.debug(f"Partial consensus merging: {initial_count} → {final_count} features")
2472
+
2473
+
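The per-feature updates in _merge_partial_consensus_features rely on polars when/then/otherwise chains. A compact, self-contained illustration of that update pattern on a toy DataFrame (column names mirror those in the diff; the data is made up):

    import polars as pl

    df = pl.DataFrame({
        "consensus_uid": [1, 2, 3],
        "rt": [100.0, 101.5, 250.0],
        "number_samples": [4, 3, 7],
    })
    merged = {"consensus_uid": 1, "rt": 100.7, "number_samples": 7}

    # Rewrite only the row whose consensus_uid matches; leave the others untouched.
    df = df.with_columns([
        pl.when(pl.col("consensus_uid") == merged["consensus_uid"])
          .then(pl.lit(merged["rt"]))
          .otherwise(pl.col("rt"))
          .alias("rt"),
        pl.when(pl.col("consensus_uid") == merged["consensus_uid"])
          .then(pl.lit(merged["number_samples"]))
          .otherwise(pl.col("number_samples"))
          .alias("number_samples"),
    ])
    print(df)

One design note: because each merged feature triggers its own with_columns pass, very large merge sets could also be handled with a single join against a DataFrame of replacement values; the per-feature form above simply matches what the diff does.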
2474
+ def __consensus_cleanup(study, rt_tol, mz_tol):
2869
2475
  """
2870
2476
  Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
2871
2477
 
@@ -3091,7 +2697,7 @@ def _consensus_cleanup(study, rt_tol, mz_tol):
3091
2697
  study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
3092
2698
 
3093
2699
 
3094
- def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2700
+ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
3095
2701
  """
3096
2702
  Identify coeluting consensus features by characteristic mass shifts between adducts
3097
2703
  and update their adduct information accordingly.
@@ -3378,7 +2984,7 @@ def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
3378
2984
  study.logger.debug("No consensus features updated based on mass shift analysis")
3379
2985
 
3380
2986
 
3381
- def _finalize_merge(study, link_ms2, min_samples):
2987
+ def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
3382
2988
  """Complete the merge process with final calculations and cleanup."""
3383
2989
  import polars as pl
3384
2990
 
@@ -3438,12 +3044,13 @@ def _finalize_merge(study, link_ms2, min_samples):
3438
3044
  )
3439
3045
 
3440
3046
  # add iso data from raw files.
3441
- study.find_iso()
3442
3047
  if link_ms2:
3443
3048
  study.find_ms2()
3049
+ if extract_ms1:
3050
+ study.find_iso()
3444
3051
 
3445
3052
 
3446
- def _optimized_feature_lookup(study_obj, features_df):
3053
+ def __merge_feature_lookup(study_obj, features_df):
3447
3054
  """
3448
3055
  Optimized feature lookup creation using Polars operations.
3449
3056
  """
@@ -3452,7 +3059,7 @@ def _optimized_feature_lookup(study_obj, features_df):
3452
3059
 
3453
3060
  # Use Polars select for faster conversion
3454
3061
  feature_columns = [
3455
- "feature_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
3062
+ "feature_uid", "sample_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
3456
3063
  "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
3457
3064
  "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
3458
3065
  "ms2_scans", "adduct", "adduct_mass"
@@ -3476,12 +3083,12 @@ def _optimized_feature_lookup(study_obj, features_df):
3476
3083
  return features_lookup
3477
3084
 
3478
3085
 
3479
- def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
3086
+ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
3480
3087
  """
3481
3088
  Optimized O(n log n) adduct grouping using spatial indexing.
3482
3089
 
3483
3090
  Args:
3484
- study_obj: Study object with logger
3091
+ study: Study object with logger
3485
3092
  consensus_data: List of consensus feature dictionaries
3486
3093
  rt_tol: RT tolerance in minutes
3487
3094
  mz_tol: m/z tolerance in Da
@@ -3494,9 +3101,9 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
3494
3101
 
3495
3102
  n_features = len(consensus_data)
3496
3103
  if n_features > 10000:
3497
- study_obj.logger.info(f"Adduct grouping for {n_features} consensus features...")
3104
+ study.logger.info(f"Adduct grouping for {n_features} consensus features...")
3498
3105
  else:
3499
- study_obj.logger.debug(f"Adduct grouping for {n_features} consensus features...")
3106
+ study.logger.debug(f"Adduct grouping for {n_features} consensus features...")
3500
3107
 
3501
3108
  # Build spatial index using RT and neutral mass as coordinates
3502
3109
  features_by_mass = defaultdict(list)
@@ -3567,14 +3174,14 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
3567
3174
  groups_by_root = defaultdict(list)
3568
3175
  for i, (uid, rt, mass, inty, adduct, _) in enumerate(valid_features):
3569
3176
  root = uf.find(i)
3570
- groups_by_root[root].append((uid, rt, mass, inty, adduct))
3177
+ groups_by_root[root].append(valid_features[i])
3571
3178
 
3572
3179
  groups = {}
3573
3180
  group_id = 1
3574
3181
  assigned_groups = {}
3575
3182
 
3576
3183
  for group_members in groups_by_root.values():
3577
- member_uids = [uid for uid, _, _, _, _ in group_members]
3184
+ member_uids = [uid for uid, _, _, _, _, _ in group_members]
3578
3185
 
3579
3186
  for uid in member_uids:
3580
3187
  assigned_groups[uid] = group_id
@@ -3632,8 +3239,8 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
3632
3239
  adduct_of_list.append(adduct_of)
3633
3240
 
3634
3241
  if n_features > 10000:
3635
- study_obj.logger.info("Adduct grouping completed.")
3242
+ study.logger.info("Adduct grouping completed.")
3636
3243
  else:
3637
- study_obj.logger.debug("Adduct grouping completed.")
3244
+ study.logger.debug("Adduct grouping completed.")
3638
3245
 
3639
3246
  return adduct_group_list, adduct_of_list
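Finally, for the adduct grouping touched in this last hunk: the docstring and comments describe an O(n log n) approach that bins features by neutral mass and RT and compares only features in the same or adjacent bins before union-find grouping. A generic bin-and-scan sketch of that idea follows; the function name, tolerances, and demo tuples are illustrative and not the masster implementation.

    from collections import defaultdict

    def candidate_adduct_pairs(features, rt_tol=5.0, mass_tol=0.01):
        # features: list of (uid, rt, neutral_mass) tuples
        bins = defaultdict(list)
        for idx, (_, _, mass) in enumerate(features):
            bins[int(mass / mass_tol)].append(idx)

        seen, pairs = set(), []
        for b, idxs in bins.items():
            for nb in (b - 1, b, b + 1):  # same bin plus both neighbouring bins
                for i in idxs:
                    for j in bins.get(nb, []):
                        if i >= j or (i, j) in seen:
                            continue
                        seen.add((i, j))
                        uid_i, rt_i, m_i = features[i]
                        uid_j, rt_j, m_j = features[j]
                        if abs(rt_i - rt_j) <= rt_tol and abs(m_i - m_j) <= mass_tol:
                            pairs.append((uid_i, uid_j))
        return pairs

    feats = [("A", 100.0, 180.0634), ("B", 101.0, 180.0635), ("C", 300.0, 255.2330)]
    print(candidate_adduct_pairs(feats))  # -> [('A', 'B')]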