masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/merge.py CHANGED
@@ -19,196 +19,195 @@ from masster.study.defaults import merge_defaults
19
19
  def _process_kd_chunk_parallel(chunk_data):
20
20
  """
21
21
  Process a single KD chunk in parallel by reconstructing FeatureMaps from features_df slice.
22
-
22
+
23
23
  Args:
24
24
  chunk_data: Dictionary containing chunk processing parameters
25
-
25
+
26
26
  Returns:
27
27
  Tuple of (chunk_start_idx, serialized_consensus_features)
28
28
  """
29
29
  import pyopenms as oms
30
-
31
- chunk_start_idx = chunk_data['chunk_start_idx']
32
- chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
33
- chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
34
- params_dict = chunk_data['params']
35
-
30
+
31
+ chunk_start_idx = chunk_data["chunk_start_idx"]
32
+ chunk_features_data = chunk_data["chunk_features_data"] # List of feature dicts
33
+ chunk_samples_data = chunk_data["chunk_samples_data"] # List of sample dicts
34
+ params_dict = chunk_data["params"]
35
+
36
36
  # Reconstruct FeatureMaps from features data for each sample in the chunk
37
37
  chunk_maps = []
38
-
38
+
39
39
  for sample_data in chunk_samples_data:
40
- sample_uid = sample_data['sample_uid']
41
-
40
+ sample_uid = sample_data["sample_uid"]
41
+
42
42
  # Filter features for this specific sample
43
- sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
44
-
43
+ sample_features = [f for f in chunk_features_data if f["sample_uid"] == sample_uid]
44
+
45
45
  # Create FeatureMap for this sample
46
46
  feature_map = oms.FeatureMap()
47
-
47
+
48
48
  # Add each feature to the map
49
49
  for feature_dict in sample_features:
50
50
  feature = oms.Feature()
51
- feature.setRT(float(feature_dict['rt']))
52
- feature.setMZ(float(feature_dict['mz']))
53
- feature.setIntensity(float(feature_dict['inty']))
54
- feature.setCharge(int(feature_dict.get('charge', 0)))
55
-
51
+ feature.setRT(float(feature_dict["rt"]))
52
+ feature.setMZ(float(feature_dict["mz"]))
53
+ feature.setIntensity(float(feature_dict["inty"]))
54
+ feature.setCharge(int(feature_dict.get("charge", 0)))
55
+
56
56
  # Set unique ID using feature_id for mapping back
57
- feature.setUniqueId(int(feature_dict['feature_id']))
58
-
57
+ feature.setUniqueId(int(feature_dict["feature_id"]))
58
+
59
59
  feature_map.push_back(feature)
60
-
60
+
61
61
  chunk_maps.append(feature_map)
62
-
62
+
63
63
  # Create the chunk consensus map
64
64
  chunk_consensus_map = oms.ConsensusMap()
65
-
65
+
66
66
  # Set up file descriptions for chunk
67
67
  file_descriptions = chunk_consensus_map.getColumnHeaders()
68
68
  for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
69
69
  file_description = file_descriptions.get(j, oms.ColumnHeader())
70
- file_description.filename = sample_data['sample_name']
70
+ file_description.filename = sample_data["sample_name"]
71
71
  file_description.size = feature_map.size()
72
72
  file_description.unique_id = feature_map.getUniqueId()
73
73
  file_descriptions[j] = file_description
74
-
74
+
75
75
  chunk_consensus_map.setColumnHeaders(file_descriptions)
76
-
76
+
77
77
  # Use KD algorithm for chunk
78
78
  grouper = oms.FeatureGroupingAlgorithmKD()
79
79
  chunk_params = grouper.getParameters()
80
80
  chunk_params.setValue("mz_unit", "Da")
81
- chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
81
+ chunk_params.setValue("nr_partitions", params_dict["nr_partitions"])
82
82
  chunk_params.setValue("warp:enabled", "true")
83
- chunk_params.setValue("warp:rt_tol", params_dict['rt_tol'])
84
- chunk_params.setValue("warp:mz_tol", params_dict['mz_tol'])
85
- chunk_params.setValue("link:rt_tol", params_dict['rt_tol'])
86
- chunk_params.setValue("link:mz_tol", params_dict['mz_tol'])
87
- chunk_params.setValue("link:min_rel_cc_size", params_dict['min_rel_cc_size'])
88
- chunk_params.setValue("link:max_pairwise_log_fc", params_dict['max_pairwise_log_fc'])
89
- chunk_params.setValue("link:max_nr_conflicts", params_dict['max_nr_conflicts'])
90
-
83
+ chunk_params.setValue("warp:rt_tol", params_dict["rt_tol"])
84
+ chunk_params.setValue("warp:mz_tol", params_dict["mz_tol"])
85
+ chunk_params.setValue("link:rt_tol", params_dict["rt_tol"])
86
+ chunk_params.setValue("link:mz_tol", params_dict["mz_tol"])
87
+ chunk_params.setValue("link:min_rel_cc_size", params_dict["min_rel_cc_size"])
88
+ chunk_params.setValue("link:max_pairwise_log_fc", params_dict["max_pairwise_log_fc"])
89
+ chunk_params.setValue("link:max_nr_conflicts", params_dict["max_nr_conflicts"])
90
+
91
91
  grouper.setParameters(chunk_params)
92
92
  grouper.group(chunk_maps, chunk_consensus_map)
93
-
93
+
94
94
  # Serialize the consensus map result for cross-process communication
95
95
  consensus_features = []
96
96
  for consensus_feature in chunk_consensus_map:
97
97
  feature_data = {
98
- 'rt': consensus_feature.getRT(),
99
- 'mz': consensus_feature.getMZ(),
100
- 'intensity': consensus_feature.getIntensity(),
101
- 'quality': consensus_feature.getQuality(),
102
- 'unique_id': str(consensus_feature.getUniqueId()),
103
- 'features': []
98
+ "rt": consensus_feature.getRT(),
99
+ "mz": consensus_feature.getMZ(),
100
+ "intensity": consensus_feature.getIntensity(),
101
+ "quality": consensus_feature.getQuality(),
102
+ "unique_id": str(consensus_feature.getUniqueId()),
103
+ "features": [],
104
104
  }
105
-
105
+
106
106
  # Get constituent features
107
107
  for feature_handle in consensus_feature.getFeatureList():
108
108
  feature_handle_data = {
109
- 'unique_id': str(feature_handle.getUniqueId()),
110
- 'map_index': feature_handle.getMapIndex()
109
+ "unique_id": str(feature_handle.getUniqueId()),
110
+ "map_index": feature_handle.getMapIndex(),
111
111
  }
112
- feature_data['features'].append(feature_handle_data)
113
-
112
+ feature_data["features"].append(feature_handle_data)
113
+
114
114
  consensus_features.append(feature_data)
115
-
115
+
116
116
  return chunk_start_idx, consensus_features
117
117
 
118
118
 
119
119
  def _process_qt_chunk_parallel(chunk_data):
120
120
  """
121
121
  Process a single QT chunk in parallel by reconstructing FeatureMaps from features_df slice.
122
-
122
+
123
123
  Args:
124
124
  chunk_data: Dictionary containing chunk processing parameters
125
-
125
+
126
126
  Returns:
127
127
  Tuple of (chunk_start_idx, serialized_consensus_features)
128
128
  """
129
129
  import pyopenms as oms
130
-
131
- chunk_start_idx = chunk_data['chunk_start_idx']
132
- chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
133
- chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
134
- params_dict = chunk_data['params']
135
-
130
+
131
+ chunk_start_idx = chunk_data["chunk_start_idx"]
132
+ chunk_features_data = chunk_data["chunk_features_data"] # List of feature dicts
133
+ chunk_samples_data = chunk_data["chunk_samples_data"] # List of sample dicts
134
+ params_dict = chunk_data["params"]
135
+
136
136
  # Reconstruct FeatureMaps from features data for each sample in the chunk
137
137
  chunk_maps = []
138
-
138
+
139
139
  for sample_data in chunk_samples_data:
140
- sample_uid = sample_data['sample_uid']
141
-
140
+ sample_uid = sample_data["sample_uid"]
141
+
142
142
  # Filter features for this specific sample
143
- sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
144
-
143
+ sample_features = [f for f in chunk_features_data if f["sample_uid"] == sample_uid]
144
+
145
145
  # Create FeatureMap for this sample
146
146
  feature_map = oms.FeatureMap()
147
-
147
+
148
148
  # Add each feature to the map
149
149
  for feature_dict in sample_features:
150
150
  feature = oms.Feature()
151
- feature.setRT(float(feature_dict['rt']))
152
- feature.setMZ(float(feature_dict['mz']))
153
- feature.setIntensity(float(feature_dict['inty']))
154
- feature.setCharge(int(feature_dict.get('charge', 0)))
155
-
151
+ feature.setRT(float(feature_dict["rt"]))
152
+ feature.setMZ(float(feature_dict["mz"]))
153
+ feature.setIntensity(float(feature_dict["inty"]))
154
+ feature.setCharge(int(feature_dict.get("charge", 0)))
155
+
156
156
  # Set unique ID using feature_id for mapping back
157
- feature.setUniqueId(int(feature_dict['feature_id']))
158
-
157
+ feature.setUniqueId(int(feature_dict["feature_id"]))
158
+
159
159
  feature_map.push_back(feature)
160
-
160
+
161
161
  chunk_maps.append(feature_map)
162
-
162
+
163
163
  # Create the chunk consensus map
164
164
  chunk_consensus_map = oms.ConsensusMap()
165
-
165
+
166
166
  # Set up file descriptions for chunk
167
167
  file_descriptions = chunk_consensus_map.getColumnHeaders()
168
168
  for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
169
169
  file_description = file_descriptions.get(j, oms.ColumnHeader())
170
- file_description.filename = sample_data['sample_name']
170
+ file_description.filename = sample_data["sample_name"]
171
171
  file_description.size = feature_map.size()
172
172
  file_description.unique_id = feature_map.getUniqueId()
173
173
  file_descriptions[j] = file_description
174
-
174
+
175
175
  chunk_consensus_map.setColumnHeaders(file_descriptions)
176
-
176
+
177
177
  # Use QT algorithm for chunk
178
178
  grouper = oms.FeatureGroupingAlgorithmQT()
179
179
  chunk_params = grouper.getParameters()
180
- chunk_params.setValue("distance_RT:max_difference", params_dict['rt_tol'])
181
- chunk_params.setValue("distance_MZ:max_difference", params_dict['mz_tol'])
180
+ chunk_params.setValue("distance_RT:max_difference", params_dict["rt_tol"])
181
+ chunk_params.setValue("distance_MZ:max_difference", params_dict["mz_tol"])
182
182
  chunk_params.setValue("distance_MZ:unit", "Da")
183
183
  chunk_params.setValue("ignore_charge", "true")
184
- chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
185
-
184
+ chunk_params.setValue("nr_partitions", params_dict["nr_partitions"])
186
185
 
187
186
  grouper.setParameters(chunk_params)
188
187
  grouper.group(chunk_maps, chunk_consensus_map)
189
-
188
+
190
189
  # Serialize the consensus map result for cross-process communication
191
190
  consensus_features = []
192
191
  for consensus_feature in chunk_consensus_map:
193
192
  feature_data = {
194
- 'rt': consensus_feature.getRT(),
195
- 'mz': consensus_feature.getMZ(),
196
- 'intensity': consensus_feature.getIntensity(),
197
- 'quality': consensus_feature.getQuality(),
198
- 'unique_id': str(consensus_feature.getUniqueId()),
199
- 'features': []
193
+ "rt": consensus_feature.getRT(),
194
+ "mz": consensus_feature.getMZ(),
195
+ "intensity": consensus_feature.getIntensity(),
196
+ "quality": consensus_feature.getQuality(),
197
+ "unique_id": str(consensus_feature.getUniqueId()),
198
+ "features": [],
200
199
  }
201
-
200
+
202
201
  # Get constituent features
203
202
  for feature_handle in consensus_feature.getFeatureList():
204
203
  feature_handle_data = {
205
- 'unique_id': str(feature_handle.getUniqueId()),
206
- 'map_index': feature_handle.getMapIndex()
204
+ "unique_id": str(feature_handle.getUniqueId()),
205
+ "map_index": feature_handle.getMapIndex(),
207
206
  }
208
- feature_data['features'].append(feature_handle_data)
209
-
207
+ feature_data["features"].append(feature_handle_data)
208
+
210
209
  consensus_features.append(feature_data)
211
-
210
+
212
211
  return chunk_start_idx, consensus_features
213
212
 
214
213
 
@@ -225,7 +224,7 @@ def merge(study, **kwargs) -> None:
225
224
  Parameters from merge_defaults class:
226
225
  - method : str, default 'kd'
227
226
  Merge algorithm: 'kd', 'qt', 'kd_chunked', 'qt_chunked'
228
- - min_samples : int, default 2
227
+ - min_samples : int, default 2
229
228
  Minimum number of samples for consensus feature
230
229
  - rt_tol : float, default 5.0
231
230
  RT tolerance in seconds
@@ -261,7 +260,7 @@ def merge(study, **kwargs) -> None:
261
260
  - Uses spatial partitioning for efficient feature matching
262
261
 
263
262
  **QT (Quality Threshold)**
264
- - Thorough O(n²) clustering algorithm
263
+ - Thorough O(n²) clustering algorithm
265
264
  - Most accurate but slowest method
266
265
  - Recommended for small datasets (<1,000 samples)
267
266
  - Guarantees quality threshold constraints
@@ -326,7 +325,7 @@ def merge(study, **kwargs) -> None:
326
325
  study.merge(method='qt', rt_tol=2.0, mz_tol=0.005, min_samples=5)
327
326
 
328
327
  Large dataset with parallel processing:
329
- study.merge(method='kd_chunked', threads=8, chunk_size=500,
328
+ study.merge(method='kd_chunked', threads=8, chunk_size=500,
330
329
  dechunking='hierarchical')
331
330
 
332
331
  Custom tolerances for specific instrument:
@@ -341,11 +340,11 @@ def merge(study, **kwargs) -> None:
341
340
  - Adduct relationships are identified and stored after merging
342
341
  """
343
342
  # Initialize with defaults and override with kwargs
344
- params = merge_defaults()
345
-
343
+ params = merge_defaults()
344
+
346
345
  # Handle 'params' keyword argument specifically (like merge does)
347
- if 'params' in kwargs:
348
- provided_params = kwargs.pop('params')
346
+ if "params" in kwargs:
347
+ provided_params = kwargs.pop("params")
349
348
  if isinstance(provided_params, merge_defaults):
350
349
  params = provided_params
351
350
  study.logger.debug("Using provided merge_defaults parameters from 'params' argument")
@@ -370,71 +369,69 @@ def merge(study, **kwargs) -> None:
370
369
 
371
370
  # Backward compatibility: Map old method names to new names
372
371
  method_mapping = {
373
- 'qtchunked': 'qt_chunked', # QT chunked variants
374
- 'qt-chunked': 'qt_chunked',
375
- 'kdchunked': 'kd_chunked', # KD chunked variants
376
- 'kd-chunked': 'kd_chunked'
372
+ "qtchunked": "qt_chunked", # QT chunked variants
373
+ "qt-chunked": "qt_chunked",
374
+ "kdchunked": "kd_chunked", # KD chunked variants
375
+ "kd-chunked": "kd_chunked",
377
376
  }
378
-
377
+
379
378
  if params.method in method_mapping:
380
379
  old_method = params.method
381
380
  params.method = method_mapping[old_method]
382
381
  study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
383
-
382
+
384
383
  # Validate method
385
- if params.method not in ['kd', 'qt', 'kd_chunked', 'qt_chunked']:
384
+ if params.method not in ["kd", "qt", "kd_chunked", "qt_chunked"]:
386
385
  raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['kd', 'qt', 'kd_chunked', 'qt_chunked']")
387
-
386
+
388
387
  # Check if chunked method is advisable for large datasets
389
- num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
388
+ num_samples = len(study.samples_df) if hasattr(study, "samples_df") and study.samples_df is not None else 0
390
389
  if num_samples == 0:
391
390
  raise ValueError("No samples loaded in study. Load features before merging.")
392
- if params.method == 'kd' and num_samples > params.chunk_size:
393
- params.method = 'kd_chunked'
391
+ if params.method == "kd" and num_samples > params.chunk_size:
392
+ params.method = "kd_chunked"
394
393
  study.logger.info(
395
394
  f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
396
395
  )
397
- if params.method == 'qt' and num_samples > params.chunk_size:
398
- params.method = 'qt_chunked'
396
+ if params.method == "qt" and num_samples > params.chunk_size:
397
+ params.method = "qt_chunked"
399
398
  study.logger.info(
400
399
  f"Switching to chunked method for large dataset ({num_samples} samples > chunk_size {params.chunk_size})"
401
400
  )
402
401
 
403
402
  if num_samples > 500:
404
- if params.method not in {'kd_chunked', 'qt_chunked'}:
403
+ if params.method not in {"kd_chunked", "qt_chunked"}:
405
404
  study.logger.warning(
406
405
  f"Large dataset detected ({num_samples} samples > 500). Consider dropping chunk_size to 500 to use chunked methods."
407
406
  )
408
-
407
+
409
408
  # Persist last used params for diagnostics
410
409
  try:
411
410
  study._merge_params_last = params.to_dict()
412
411
  except Exception:
413
412
  study._merge_params_last = {}
414
-
413
+
415
414
  # Store merge parameters in history
416
415
  try:
417
- if hasattr(study, 'store_history'):
418
- study.update_history(['merge'], params.to_dict())
416
+ if hasattr(study, "store_history"):
417
+ study.update_history(["merge"], params.to_dict())
419
418
  else:
420
419
  study.logger.warning("History storage not available - parameters not saved to history")
421
420
  except Exception as e:
422
421
  study.logger.warning(f"Failed to store merge parameters in history: {e}")
423
-
422
+
424
423
  # Ensure feature maps are available for merging (regenerate if needed)
425
424
  if len(study.features_maps) < len(study.samples_df):
426
425
  study.features_maps = []
427
426
  # Feature maps will be generated on-demand within each merge method
428
-
427
+
429
428
  study.logger.info(
430
- f"Merging samples using {params.method}, min_samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
431
- )
429
+ f"Merging samples using {params.method}, min_samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
430
+ )
432
431
  if "chunked" in params.method:
433
- study.logger.info(
434
- f"threads={params.threads}, chunk_size={params.chunk_size}, dechunking='{params.dechunking}'"
435
- )
436
-
437
- # Initialize
432
+ study.logger.info(f"threads={params.threads}, chunk_size={params.chunk_size}, dechunking='{params.dechunking}'")
433
+
434
+ # Initialize
438
435
  study.consensus_df = pl.DataFrame()
439
436
  study.consensus_ms2 = pl.DataFrame()
440
437
  study.consensus_mapping_df = pl.DataFrame()
@@ -451,67 +448,67 @@ def merge(study, **kwargs) -> None:
451
448
  except Exception as e:
452
449
  study.logger.warning(f"Could not retrieve study adducts: {e}")
453
450
  cached_valid_adducts = set()
454
-
451
+
455
452
  # Always allow '?' adducts
456
453
  cached_valid_adducts.add("?")
457
-
458
- # Route to algorithm implementation
459
- if params.method == 'kd':
454
+
455
+ # Route to algorithm implementation
456
+ if params.method == "kd":
460
457
  consensus_map = _merge_kd(study, params)
461
458
  # Extract consensus features
462
459
  _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
463
- elif params.method == 'qt':
460
+ elif params.method == "qt":
464
461
  consensus_map = _merge_qt(study, params)
465
462
  # Extract consensus features
466
463
  _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
467
- elif params.method == 'kd_chunked':
464
+ elif params.method == "kd_chunked":
468
465
  consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
469
466
  # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
470
- elif params.method == 'qt_chunked':
467
+ elif params.method == "qt_chunked":
471
468
  consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
472
469
  # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
473
-
470
+
474
471
  # Enhanced post-clustering to merge over-segmented features (for non-chunked methods)
475
472
  # Chunked methods already perform their own cross-chunk consensus building
476
- if params.method in ['qt', 'kd']:
473
+ if params.method in ["qt", "kd"]:
477
474
  __consensus_cleanup(study, params.rt_tol, params.mz_tol)
478
-
475
+
479
476
  # Perform adduct grouping
480
477
  _perform_adduct_grouping(study, params.rt_tol, params.mz_tol)
481
-
478
+
482
479
  # Identify coeluting consensus features by mass shifts and update adduct information
483
480
  __identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
484
-
481
+
485
482
  # Post-processing for chunked methods: merge partial consensus features
486
- if params.method in ['qt_chunked', 'kd_chunked']:
483
+ if params.method in ["qt_chunked", "kd_chunked"]:
487
484
  _merge_partial_consensus_features(study, params.rt_tol, params.mz_tol)
488
-
485
+
489
486
  # Finalize merge: filter by min_samples and add isotope/MS2 data
490
487
  __finalize_merge(study, params.link_ms2, params.extract_ms1, params.min_samples)
491
488
 
492
489
 
493
490
  def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
494
491
  """KD-tree based merge (fast, recommended)"""
495
-
492
+
496
493
  # Generate temporary feature maps on-demand from features_df
497
494
  temp_feature_maps = _generate_feature_maps_on_demand(study)
498
-
495
+
499
496
  consensus_map = oms.ConsensusMap()
500
497
  file_descriptions = consensus_map.getColumnHeaders()
501
-
498
+
502
499
  for i, feature_map in enumerate(temp_feature_maps):
503
500
  file_description = file_descriptions.get(i, oms.ColumnHeader())
504
501
  file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
505
502
  file_description.size = feature_map.size()
506
503
  file_description.unique_id = feature_map.getUniqueId()
507
504
  file_descriptions[i] = file_description
508
-
505
+
509
506
  consensus_map.setColumnHeaders(file_descriptions)
510
-
507
+
511
508
  # Configure KD algorithm
512
509
  grouper = oms.FeatureGroupingAlgorithmKD()
513
510
  params_oms = grouper.getParameters()
514
-
511
+
515
512
  params_oms.setValue("mz_unit", "Da")
516
513
  params_oms.setValue("nr_partitions", params.nr_partitions)
517
514
  params_oms.setValue("warp:enabled", "true")
@@ -519,10 +516,10 @@ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
519
516
  params_oms.setValue("warp:mz_tol", params.mz_tol)
520
517
  params_oms.setValue("link:rt_tol", params.rt_tol)
521
518
  params_oms.setValue("link:mz_tol", params.mz_tol)
522
-
519
+
523
520
  grouper.setParameters(params_oms)
524
521
  grouper.group(temp_feature_maps, consensus_map)
525
-
522
+
526
523
  return consensus_map
527
524
 
528
525
 
@@ -530,49 +527,49 @@ def _generate_feature_maps_on_demand(study):
530
527
  """
531
528
  Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
532
529
  Returns temporary feature maps that are not cached in the study.
533
-
530
+
534
531
  Args:
535
532
  study: Study object containing samples
536
-
533
+
537
534
  Returns:
538
535
  list: List of temporary FeatureMap objects
539
536
  """
540
537
  import polars as pl
541
538
  import pyopenms as oms
542
539
  import numpy as np
543
-
540
+
544
541
  # Check if we should use Sample-level loading instead of features_df
545
542
  use_sample_loading = True # Default to Sample-level loading as requested
546
-
543
+
547
544
  # Use Sample-level loading if requested and samples_df is available
548
- #if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
545
+ # if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
549
546
  # study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
550
547
  # return _generate_feature_maps_from_samples(study)
551
-
548
+
552
549
  # Fallback to original features_df approach
553
550
  if study.features_df is None or len(study.features_df) == 0:
554
551
  study.logger.error("No features_df available for generating feature maps")
555
552
  return []
556
-
553
+
557
554
  temp_feature_maps = []
558
555
  n_samples = len(study.samples_df)
559
556
  n_features = len(study.features_df)
560
-
557
+
561
558
  # Performance optimization: use efficient polars groupby for large datasets
562
559
  use_groupby_optimization = n_features > 5000
563
560
  if use_groupby_optimization:
564
561
  study.logger.debug(f"Using polars groupby optimization for {n_features} features across {n_samples} samples")
565
-
562
+
566
563
  # Pre-group features by sample_uid - this is much more efficient than repeated filtering
567
564
  features_by_sample = study.features_df.group_by("sample_uid").agg([
568
565
  pl.col("feature_id"),
569
- pl.col("mz"),
566
+ pl.col("mz"),
570
567
  pl.col("rt"),
571
568
  pl.col("inty"),
572
569
  pl.col("quality").fill_null(1.0),
573
- pl.col("charge").fill_null(0)
570
+ pl.col("charge").fill_null(0),
574
571
  ])
575
-
572
+
576
573
  # Convert to dictionary for fast lookups
577
574
  sample_feature_dict = {}
578
575
  for row in features_by_sample.iter_rows(named=True):
@@ -584,31 +581,31 @@ def _generate_feature_maps_on_demand(study):
584
581
  "rt": np.array(row["rt"]),
585
582
  "inty": np.array(row["inty"]),
586
583
  "quality": np.array(row["quality"]),
587
- "charge": np.array(row["charge"])
584
+ "charge": np.array(row["charge"]),
588
585
  }
589
-
586
+
590
587
  # Process each sample in order
591
588
  for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
592
589
  sample_uid = row_dict["sample_uid"]
593
-
590
+
594
591
  if use_groupby_optimization:
595
592
  # Use pre-grouped data with vectorized operations
596
593
  if sample_uid not in sample_feature_dict:
597
594
  feature_map = oms.FeatureMap()
598
595
  temp_feature_maps.append(feature_map)
599
596
  continue
600
-
597
+
601
598
  sample_data = sample_feature_dict[sample_uid]
602
599
  n_sample_features = len(sample_data["feature_id"])
603
-
600
+
604
601
  if n_sample_features == 0:
605
602
  feature_map = oms.FeatureMap()
606
603
  temp_feature_maps.append(feature_map)
607
604
  continue
608
-
605
+
609
606
  # Create new FeatureMap
610
607
  feature_map = oms.FeatureMap()
611
-
608
+
612
609
  # Use vectorized data directly (no conversion needed)
613
610
  for i in range(n_sample_features):
614
611
  try:
@@ -626,14 +623,14 @@ def _generate_feature_maps_on_demand(study):
626
623
  else:
627
624
  # Use original polars-based approach for smaller datasets
628
625
  sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
629
-
626
+
630
627
  # Create new FeatureMap
631
628
  feature_map = oms.FeatureMap()
632
-
629
+
633
630
  # Convert DataFrame features to OpenMS Features
634
631
  for feature_row in sample_features.iter_rows(named=True):
635
632
  feature = oms.Feature()
636
-
633
+
637
634
  # Set properties from DataFrame (handle missing values gracefully)
638
635
  try:
639
636
  feature.setUniqueId(int(feature_row["feature_id"]))
@@ -642,45 +639,45 @@ def _generate_feature_maps_on_demand(study):
642
639
  feature.setIntensity(float(feature_row["inty"]))
643
640
  feature.setOverallQuality(float(feature_row["quality"]))
644
641
  feature.setCharge(int(feature_row["charge"]))
645
-
642
+
646
643
  # Add to feature map
647
644
  feature_map.push_back(feature)
648
645
  except (ValueError, TypeError) as e:
649
646
  study.logger.warning(f"Skipping feature due to conversion error: {e}")
650
647
  continue
651
-
648
+
652
649
  temp_feature_maps.append(feature_map)
653
-
650
+
654
651
  study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df")
655
652
  return temp_feature_maps
656
653
 
657
654
 
658
655
  def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
659
656
  """QT (Quality Threshold) based merge"""
660
-
657
+
661
658
  # Generate temporary feature maps on-demand from features_df
662
659
  temp_feature_maps = _generate_feature_maps_on_demand(study)
663
-
660
+
664
661
  n_samples = len(temp_feature_maps)
665
662
  if n_samples > 1000:
666
663
  study.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
667
-
664
+
668
665
  consensus_map = oms.ConsensusMap()
669
666
  file_descriptions = consensus_map.getColumnHeaders()
670
-
667
+
671
668
  for i, feature_map in enumerate(temp_feature_maps):
672
669
  file_description = file_descriptions.get(i, oms.ColumnHeader())
673
670
  file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
674
671
  file_description.size = feature_map.size()
675
672
  file_description.unique_id = feature_map.getUniqueId()
676
673
  file_descriptions[i] = file_description
677
-
674
+
678
675
  consensus_map.setColumnHeaders(file_descriptions)
679
-
676
+
680
677
  # Configure QT algorithm
681
678
  grouper = oms.FeatureGroupingAlgorithmQT()
682
679
  params_oms = grouper.getParameters()
683
-
680
+
684
681
  params_oms.setValue("distance_RT:max_difference", params.rt_tol)
685
682
  params_oms.setValue("distance_MZ:max_difference", params.mz_tol)
686
683
  params_oms.setValue("distance_MZ:unit", "Da") # QT now uses Da like all other methods
@@ -689,16 +686,18 @@ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
689
686
 
690
687
  grouper.setParameters(params_oms)
691
688
  grouper.group(temp_feature_maps, consensus_map)
692
-
689
+
693
690
  return consensus_map
694
691
 
695
692
 
696
- def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
693
+ def _merge_kd_chunked(
694
+ study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None
695
+ ) -> oms.ConsensusMap:
697
696
  """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
698
-
697
+
699
698
  # Generate temporary feature maps on-demand from features_df
700
699
  temp_feature_maps = _generate_feature_maps_on_demand(study)
701
-
700
+
702
701
  n_samples = len(temp_feature_maps)
703
702
  if n_samples <= params.chunk_size:
704
703
  study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
@@ -706,23 +705,31 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
706
705
  # Extract consensus features to populate consensus_df for chunked method consistency
707
706
  _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
708
707
  return consensus_map
709
-
708
+
710
709
  # Process in chunks
711
710
  chunks = []
712
711
  for i in range(0, n_samples, params.chunk_size):
713
712
  chunk_end = min(i + params.chunk_size, n_samples)
714
713
  chunks.append((i, temp_feature_maps[i:chunk_end]))
715
-
716
- study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
717
-
714
+
715
+ study.logger.debug(
716
+ f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)"
717
+ )
718
+
718
719
  # Process each chunk to create chunk consensus maps
719
720
  chunk_consensus_maps = []
720
-
721
+
721
722
  if params.threads is None:
722
723
  # Sequential processing (original behavior)
723
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
724
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(
725
+ tqdm(
726
+ chunks,
727
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk",
728
+ disable=study.log_level not in ["TRACE", "DEBUG", "INFO"],
729
+ )
730
+ ):
724
731
  chunk_consensus_map = oms.ConsensusMap()
725
-
732
+
726
733
  # Set up file descriptions for chunk
727
734
  file_descriptions = chunk_consensus_map.getColumnHeaders()
728
735
  for j, feature_map in enumerate(chunk_maps):
@@ -731,9 +738,9 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
731
738
  file_description.size = feature_map.size()
732
739
  file_description.unique_id = feature_map.getUniqueId()
733
740
  file_descriptions[j] = file_description
734
-
741
+
735
742
  chunk_consensus_map.setColumnHeaders(file_descriptions)
736
-
743
+
737
744
  # Use KD algorithm for chunk
738
745
  grouper = oms.FeatureGroupingAlgorithmKD()
739
746
  chunk_params = grouper.getParameters()
@@ -747,16 +754,16 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
747
754
  chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
748
755
  chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
749
756
  chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
750
-
757
+
751
758
  grouper.setParameters(chunk_params)
752
759
  grouper.group(chunk_maps, chunk_consensus_map)
753
-
760
+
754
761
  chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
755
-
762
+
756
763
  else:
757
764
  # Parallel processing
758
- #study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
759
-
765
+ # study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
766
+
760
767
  # Prepare chunk data for parallel processing using features_df slices
761
768
  chunk_data_list = []
762
769
  for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
@@ -765,58 +772,65 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
765
772
  chunk_samples_df_rows = []
766
773
  for j in range(len(chunk_maps)):
767
774
  sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
768
- chunk_sample_uids.append(sample_row['sample_uid'])
775
+ chunk_sample_uids.append(sample_row["sample_uid"])
769
776
  chunk_samples_df_rows.append(sample_row)
770
-
777
+
771
778
  # Create a DataFrame for this chunk's samples
772
779
  chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
773
-
780
+
774
781
  # Filter features_df for this chunk's samples and select only necessary columns
775
- chunk_features_df = study.features_df.filter(
776
- pl.col('sample_uid').is_in(chunk_sample_uids)
777
- ).select([
778
- 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
782
+ chunk_features_df = study.features_df.filter(pl.col("sample_uid").is_in(chunk_sample_uids)).select([
783
+ "sample_uid",
784
+ "rt",
785
+ "mz",
786
+ "inty",
787
+ "charge",
788
+ "feature_id",
779
789
  ])
780
-
790
+
781
791
  # Convert DataFrames to serializable format (lists of dicts)
782
792
  chunk_features_data = chunk_features_df.to_dicts()
783
793
  chunk_samples_data = chunk_samples_df.to_dicts()
784
-
794
+
785
795
  chunk_data = {
786
- 'chunk_start_idx': chunk_start_idx,
787
- 'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
788
- 'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
789
- 'params': {
790
- 'nr_partitions': params.nr_partitions,
791
- 'rt_tol': params.rt_tol,
792
- 'mz_tol': params.mz_tol,
793
- 'min_rel_cc_size': params.min_rel_cc_size,
794
- 'max_pairwise_log_fc': params.max_pairwise_log_fc,
795
- 'max_nr_conflicts': params.max_nr_conflicts
796
- }
796
+ "chunk_start_idx": chunk_start_idx,
797
+ "chunk_features_data": chunk_features_data, # List of dicts instead of DataFrame
798
+ "chunk_samples_data": chunk_samples_data, # List of dicts instead of DataFrame
799
+ "params": {
800
+ "nr_partitions": params.nr_partitions,
801
+ "rt_tol": params.rt_tol,
802
+ "mz_tol": params.mz_tol,
803
+ "min_rel_cc_size": params.min_rel_cc_size,
804
+ "max_pairwise_log_fc": params.max_pairwise_log_fc,
805
+ "max_nr_conflicts": params.max_nr_conflicts,
806
+ },
797
807
  }
798
808
  chunk_data_list.append(chunk_data)
799
-
809
+
800
810
  # Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
801
811
  try:
802
812
  with ProcessPoolExecutor(max_workers=params.threads) as executor:
803
813
  # Submit all chunk processing tasks
804
- future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
805
- for i, chunk_data in enumerate(chunk_data_list)}
806
-
814
+ future_to_chunk = {
815
+ executor.submit(_process_kd_chunk_parallel, chunk_data): i
816
+ for i, chunk_data in enumerate(chunk_data_list)
817
+ }
818
+
807
819
  # Collect results with progress tracking
808
820
  completed_chunks = 0
809
821
  total_chunks = len(chunk_data_list)
810
822
  serialized_chunk_results = []
811
-
823
+
812
824
  for future in as_completed(future_to_chunk):
813
825
  chunk_idx = future_to_chunk[future]
814
826
  try:
815
827
  chunk_start_idx, consensus_features = future.result()
816
828
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
817
829
  completed_chunks += 1
818
- n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
819
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
830
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
831
+ study.logger.info(
832
+ f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
833
+ )
820
834
  except Exception as exc:
821
835
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
822
836
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -825,60 +839,71 @@ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
825
839
  else:
826
840
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
827
841
  raise exc
828
-
842
+
829
843
  except (RuntimeError, OSError, BrokenProcessPool) as e:
830
844
  # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
831
- if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
832
- "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
845
+ if (
846
+ "freeze_support" in str(e)
847
+ or "spawn" in str(e)
848
+ or "bootstrapping" in str(e)
849
+ or "process pool" in str(e).lower()
850
+ or "Windows multiprocessing failure" in str(e)
851
+ ):
833
852
  study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
834
853
  study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
835
-
854
+
836
855
  with ThreadPoolExecutor(max_workers=params.threads) as executor:
837
856
  # Submit all chunk processing tasks
838
- future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
839
- for i, chunk_data in enumerate(chunk_data_list)}
840
-
857
+ future_to_chunk = {
858
+ executor.submit(_process_kd_chunk_parallel, chunk_data): i
859
+ for i, chunk_data in enumerate(chunk_data_list)
860
+ }
861
+
841
862
  # Collect results with progress tracking
842
863
  completed_chunks = 0
843
864
  total_chunks = len(chunk_data_list)
844
865
  serialized_chunk_results = []
845
-
866
+
846
867
  for future in as_completed(future_to_chunk):
847
868
  chunk_idx = future_to_chunk[future]
848
869
  try:
849
870
  chunk_start_idx, consensus_features = future.result()
850
871
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
851
872
  completed_chunks += 1
852
- n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
853
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
873
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
874
+ study.logger.info(
875
+ f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
876
+ )
854
877
  except Exception as exc:
855
878
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
856
879
  raise exc
857
880
  else:
858
881
  # Re-raise other exceptions
859
882
  raise
860
-
861
- # Store serialized results for _merge_chunk_results to handle directly
883
+
884
+ # Store serialized results for _merge_chunk_results to handle directly
862
885
  chunk_consensus_maps = []
863
886
  for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
864
887
  # Store serialized data directly for _merge_chunk_results to handle
865
888
  chunk_consensus_maps.append((chunk_start_idx, consensus_features))
866
-
867
- # Merge chunk results with proper cross-chunk consensus building
889
+
890
+ # Merge chunk results with proper cross-chunk consensus building
868
891
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
869
892
  _dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
870
-
893
+
871
894
  # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
872
895
  consensus_map = oms.ConsensusMap()
873
896
  return consensus_map
874
897
 
875
898
 
876
- def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
899
+ def _merge_qt_chunked(
900
+ study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None
901
+ ) -> oms.ConsensusMap:
877
902
  """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
878
-
903
+
879
904
  # Generate temporary feature maps on-demand from features_df
880
905
  temp_feature_maps = _generate_feature_maps_on_demand(study)
881
-
906
+
882
907
  n_samples = len(temp_feature_maps)
883
908
  if n_samples <= params.chunk_size:
884
909
  study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
@@ -886,23 +911,31 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
886
911
  # Extract consensus features to populate consensus_df for chunked method consistency
887
912
  _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
888
913
  return consensus_map
889
-
914
+
890
915
  # Process in chunks
891
916
  chunks = []
892
917
  for i in range(0, n_samples, params.chunk_size):
893
918
  chunk_end = min(i + params.chunk_size, n_samples)
894
919
  chunks.append((i, temp_feature_maps[i:chunk_end]))
895
-
896
- study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
897
-
920
+
921
+ study.logger.debug(
922
+ f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)"
923
+ )
924
+
898
925
  # Process each chunk to create chunk consensus maps
899
926
  chunk_consensus_maps = []
900
-
927
+
901
928
  if params.threads is None:
902
929
  # Sequential processing (original behavior)
903
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
930
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(
931
+ tqdm(
932
+ chunks,
933
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk",
934
+ disable=study.log_level not in ["TRACE", "DEBUG", "INFO"],
935
+ )
936
+ ):
904
937
  chunk_consensus_map = oms.ConsensusMap()
905
-
938
+
906
939
  # Set up file descriptions for chunk
907
940
  file_descriptions = chunk_consensus_map.getColumnHeaders()
908
941
  for j, feature_map in enumerate(chunk_maps):
@@ -911,9 +944,9 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
911
944
  file_description.size = feature_map.size()
912
945
  file_description.unique_id = feature_map.getUniqueId()
913
946
  file_descriptions[j] = file_description
914
-
947
+
915
948
  chunk_consensus_map.setColumnHeaders(file_descriptions)
916
-
949
+
917
950
  # Use QT algorithm for chunk (main difference from KD chunked)
918
951
  grouper = oms.FeatureGroupingAlgorithmQT()
919
952
  chunk_params = grouper.getParameters()
@@ -922,16 +955,16 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
922
955
  chunk_params.setValue("distance_MZ:unit", "Da")
923
956
  chunk_params.setValue("ignore_charge", "true")
924
957
  chunk_params.setValue("nr_partitions", params.nr_partitions)
925
-
958
+
926
959
  grouper.setParameters(chunk_params)
927
960
  grouper.group(chunk_maps, chunk_consensus_map)
928
-
961
+
929
962
  chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
930
-
963
+
931
964
  else:
932
965
  # Parallel processing
933
- #study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
934
-
966
+ # study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
967
+
935
968
  # Prepare chunk data for parallel processing using features_df slices
936
969
  chunk_data_list = []
937
970
  for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
@@ -940,58 +973,65 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
940
973
  chunk_samples_df_rows = []
941
974
  for j in range(len(chunk_maps)):
942
975
  sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
943
- chunk_sample_uids.append(sample_row['sample_uid'])
976
+ chunk_sample_uids.append(sample_row["sample_uid"])
944
977
  chunk_samples_df_rows.append(sample_row)
945
-
978
+
946
979
  # Create a DataFrame for this chunk's samples
947
980
  chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
948
-
981
+
949
982
  # Filter features_df for this chunk's samples and select only necessary columns
950
- chunk_features_df = study.features_df.filter(
951
- pl.col('sample_uid').is_in(chunk_sample_uids)
952
- ).select([
953
- 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
983
+ chunk_features_df = study.features_df.filter(pl.col("sample_uid").is_in(chunk_sample_uids)).select([
984
+ "sample_uid",
985
+ "rt",
986
+ "mz",
987
+ "inty",
988
+ "charge",
989
+ "feature_id",
954
990
  ])
955
-
991
+
956
992
  # Convert DataFrames to serializable format (lists of dicts)
957
993
  chunk_features_data = chunk_features_df.to_dicts()
958
994
  chunk_samples_data = chunk_samples_df.to_dicts()
959
-
995
+
960
996
  chunk_data = {
961
- 'chunk_start_idx': chunk_start_idx,
962
- 'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
963
- 'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
964
- 'params': {
965
- 'nr_partitions': params.nr_partitions,
966
- 'rt_tol': params.rt_tol,
967
- 'mz_tol': params.mz_tol,
968
- }
997
+ "chunk_start_idx": chunk_start_idx,
998
+ "chunk_features_data": chunk_features_data, # List of dicts instead of DataFrame
999
+ "chunk_samples_data": chunk_samples_data, # List of dicts instead of DataFrame
1000
+ "params": {
1001
+ "nr_partitions": params.nr_partitions,
1002
+ "rt_tol": params.rt_tol,
1003
+ "mz_tol": params.mz_tol,
1004
+ },
969
1005
  }
970
1006
  chunk_data_list.append(chunk_data)
971
-
1007
+
972
1008
  # Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
973
1009
  executor_class = ProcessPoolExecutor
974
1010
  executor_name = "processes"
975
-
1011
+
976
1012
  try:
977
1013
  with ProcessPoolExecutor(max_workers=params.threads) as executor:
978
1014
  # Submit all chunk processing tasks
979
- future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
980
- for i, chunk_data in enumerate(chunk_data_list)}
981
-
1015
+ future_to_chunk = {
1016
+ executor.submit(_process_qt_chunk_parallel, chunk_data): i
1017
+ for i, chunk_data in enumerate(chunk_data_list)
1018
+ }
1019
+
982
1020
  # Collect results with progress tracking
983
1021
  completed_chunks = 0
984
1022
  total_chunks = len(chunk_data_list)
985
1023
  serialized_chunk_results = []
986
-
1024
+
987
1025
  for future in as_completed(future_to_chunk):
988
1026
  chunk_idx = future_to_chunk[future]
989
1027
  try:
990
1028
  chunk_start_idx, consensus_features = future.result()
991
1029
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
992
1030
  completed_chunks += 1
993
- n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
994
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1031
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
1032
+ study.logger.info(
1033
+ f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
1034
+ )
995
1035
  except Exception as exc:
996
1036
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
997
1037
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
@@ -1000,64 +1040,75 @@ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cac
1000
1040
  else:
1001
1041
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1002
1042
  raise exc
1003
-
1043
+
1004
1044
  except (RuntimeError, OSError, BrokenProcessPool) as e:
1005
1045
  # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1006
- if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1007
- "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1046
+ if (
1047
+ "freeze_support" in str(e)
1048
+ or "spawn" in str(e)
1049
+ or "bootstrapping" in str(e)
1050
+ or "process pool" in str(e).lower()
1051
+ or "Windows multiprocessing failure" in str(e)
1052
+ ):
1008
1053
  study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1009
1054
  study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1010
-
1055
+
1011
1056
  with ThreadPoolExecutor(max_workers=params.threads) as executor:
1012
1057
  # Submit all chunk processing tasks
1013
- future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
1014
- for i, chunk_data in enumerate(chunk_data_list)}
1015
-
1058
+ future_to_chunk = {
1059
+ executor.submit(_process_qt_chunk_parallel, chunk_data): i
1060
+ for i, chunk_data in enumerate(chunk_data_list)
1061
+ }
1062
+
1016
1063
  # Collect results with progress tracking
1017
1064
  completed_chunks = 0
1018
1065
  total_chunks = len(chunk_data_list)
1019
1066
  serialized_chunk_results = []
1020
-
1067
+
1021
1068
  for future in as_completed(future_to_chunk):
1022
1069
  chunk_idx = future_to_chunk[future]
1023
1070
  try:
1024
1071
  chunk_start_idx, consensus_features = future.result()
1025
1072
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1026
1073
  completed_chunks += 1
1027
- n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1028
- study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1074
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]["chunk_samples_data"])
1075
+ study.logger.info(
1076
+ f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})"
1077
+ )
1029
1078
  except Exception as exc:
1030
1079
  study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1031
1080
  raise exc
1032
1081
  else:
1033
1082
  # Re-raise other exceptions
1034
1083
  raise
1035
-
1036
- # Store serialized results for _merge_chunk_results to handle directly
1084
+
1085
+ # Store serialized results for _merge_chunk_results to handle directly
1037
1086
  chunk_consensus_maps = []
1038
1087
  for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
1039
1088
  # Store serialized data directly for _merge_chunk_results to handle
1040
1089
  chunk_consensus_maps.append((chunk_start_idx, consensus_features))
1041
-
1042
- # Merge chunk results with proper cross-chunk consensus building
1090
+
1091
+ # Merge chunk results with proper cross-chunk consensus building
1043
1092
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1044
1093
  _dechunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1045
-
1094
+
1046
1095
  # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1047
1096
  consensus_map = oms.ConsensusMap()
1048
1097
  return consensus_map
1049
1098
 
1050
1099
 
1051
- def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1100
+ def _dechunk_results(
1101
+ study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None
1102
+ ) -> None:
1052
1103
  """
1053
1104
  Scalable aggregation of chunk consensus maps into final consensus_df.
1054
-
1105
+
1055
1106
  This function implements cross-chunk consensus building by:
1056
1107
  1. Extracting feature_uids from each chunk consensus map
1057
1108
  2. Aggregating features close in RT/m/z across chunks
1058
1109
  3. Building consensus_df and consensus_mapping_df directly
1059
1110
  """
1060
-
1111
+
1061
1112
  if len(chunk_consensus_maps) == 1:
1062
1113
  # Single chunk case - just extract using the true global min_samples.
1063
1114
  # No need for permissive threshold because we are not discarding singletons pre-aggregation.
@@ -1069,19 +1120,16 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1069
1120
  cached_valid_adducts,
1070
1121
  )
1071
1122
  return
1072
-
1123
+
1073
1124
  # Build feature_uid to feature_data lookup for fast access
1074
- feature_uid_map = {
1075
- row["feature_id"]: row["feature_uid"]
1076
- for row in study.features_df.iter_rows(named=True)
1077
- }
1078
-
1125
+ feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in study.features_df.iter_rows(named=True)}
1126
+
1079
1127
  features_lookup = __merge_feature_lookup(study, study.features_df)
1080
-
1128
+
1081
1129
  # Extract all consensus features from chunks with their feature_uids
1082
1130
  all_chunk_consensus = []
1083
1131
  consensus_id_counter = 0
1084
-
1132
+
1085
1133
  for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
1086
1134
  # Handle both ConsensusMap objects (sequential) and serialized data (parallel)
1087
1135
  if isinstance(chunk_data, list):
@@ -1091,45 +1139,45 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1091
1139
  # Sequential processing: chunk_data is a ConsensusMap object
1092
1140
  chunk_consensus_map = chunk_data
1093
1141
  consensus_features_data = []
1094
-
1142
+
1095
1143
  # Extract data from ConsensusMap and convert to serialized format
1096
1144
  for consensus_feature in chunk_consensus_map:
1097
1145
  # Extract feature_uids from this consensus feature
1098
1146
  feature_uids = []
1099
1147
  feature_data_list = []
1100
1148
  sample_uids = []
1101
-
1149
+
1102
1150
  for feature_handle in consensus_feature.getFeatureList():
1103
1151
  fuid = str(feature_handle.getUniqueId())
1104
1152
  if fuid not in feature_uid_map:
1105
1153
  continue
1106
-
1154
+
1107
1155
  feature_uid = feature_uid_map[fuid]
1108
1156
  feature_data = features_lookup.get(feature_uid)
1109
1157
  if feature_data:
1110
1158
  feature_uids.append(feature_uid)
1111
1159
  feature_data_list.append(feature_data)
1112
-
1160
+
1113
1161
  # Use feature_uid to lookup actual sample_uid instead of chunk position
1114
- actual_sample_uid = feature_data['sample_uid']
1162
+ actual_sample_uid = feature_data["sample_uid"]
1115
1163
  sample_uids.append(actual_sample_uid)
1116
1164
 
1117
1165
  if not feature_data_list:
1118
1166
  # No retrievable feature metadata (possible stale map reference) -> skip
1119
1167
  continue
1120
-
1168
+
1121
1169
  # Convert ConsensusFeature to serialized format
1122
1170
  consensus_feature_data = {
1123
- 'rt': consensus_feature.getRT(),
1124
- 'mz': consensus_feature.getMZ(),
1125
- 'intensity': consensus_feature.getIntensity(),
1126
- 'quality': consensus_feature.getQuality(),
1127
- 'feature_uids': feature_uids,
1128
- 'feature_data_list': feature_data_list,
1129
- 'sample_uids': sample_uids
1171
+ "rt": consensus_feature.getRT(),
1172
+ "mz": consensus_feature.getMZ(),
1173
+ "intensity": consensus_feature.getIntensity(),
1174
+ "quality": consensus_feature.getQuality(),
1175
+ "feature_uids": feature_uids,
1176
+ "feature_data_list": feature_data_list,
1177
+ "sample_uids": sample_uids,
1130
1178
  }
1131
1179
  consensus_features_data.append(consensus_feature_data)
1132
-
1180
+
1133
1181
  # Process the consensus features (now all in serialized format)
1134
1182
  for consensus_feature_data in consensus_features_data:
1135
1183
  # For parallel processing, feature data is already extracted
@@ -1138,44 +1186,44 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1138
1186
  feature_uids = []
1139
1187
  feature_data_list = []
1140
1188
  sample_uids = []
1141
-
1142
- for handle_data in consensus_feature_data['features']:
1143
- fuid = str(handle_data['unique_id'])
1189
+
1190
+ for handle_data in consensus_feature_data["features"]:
1191
+ fuid = str(handle_data["unique_id"])
1144
1192
  if fuid not in feature_uid_map:
1145
1193
  continue
1146
-
1194
+
1147
1195
  feature_uid = feature_uid_map[fuid]
1148
1196
  feature_data = features_lookup.get(feature_uid)
1149
1197
  if feature_data:
1150
1198
  feature_uids.append(feature_uid)
1151
1199
  feature_data_list.append(feature_data)
1152
-
1200
+
1153
1201
  # Use feature_uid to lookup actual sample_uid instead of chunk position
1154
- actual_sample_uid = feature_data['sample_uid']
1202
+ actual_sample_uid = feature_data["sample_uid"]
1155
1203
  sample_uids.append(actual_sample_uid)
1156
-
1204
+
1157
1205
  if not feature_data_list:
1158
1206
  continue
1159
-
1207
+
1160
1208
  # Get RT/MZ from consensus feature data
1161
- consensus_rt = consensus_feature_data['rt']
1162
- consensus_mz = consensus_feature_data['mz']
1163
- consensus_intensity = consensus_feature_data['intensity']
1164
- consensus_quality = consensus_feature_data['quality']
1209
+ consensus_rt = consensus_feature_data["rt"]
1210
+ consensus_mz = consensus_feature_data["mz"]
1211
+ consensus_intensity = consensus_feature_data["intensity"]
1212
+ consensus_quality = consensus_feature_data["quality"]
1165
1213
  else:
1166
1214
  # Sequential processing: data is already extracted above
1167
- feature_uids = consensus_feature_data['feature_uids']
1168
- feature_data_list = consensus_feature_data['feature_data_list']
1169
- sample_uids = consensus_feature_data['sample_uids']
1170
- consensus_rt = consensus_feature_data['rt']
1171
- consensus_mz = consensus_feature_data['mz']
1172
- consensus_intensity = consensus_feature_data['intensity']
1173
- consensus_quality = consensus_feature_data['quality']
1215
+ feature_uids = consensus_feature_data["feature_uids"]
1216
+ feature_data_list = consensus_feature_data["feature_data_list"]
1217
+ sample_uids = consensus_feature_data["sample_uids"]
1218
+ consensus_rt = consensus_feature_data["rt"]
1219
+ consensus_mz = consensus_feature_data["mz"]
1220
+ consensus_intensity = consensus_feature_data["intensity"]
1221
+ consensus_quality = consensus_feature_data["quality"]
1174
1222
 
1175
1223
  if not feature_data_list:
1176
1224
  # No retrievable feature metadata (possible stale map reference) -> skip
1177
1225
  continue
1178
-
1226
+
1179
1227
  # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
1180
1228
  rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
1181
1229
  mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
@@ -1189,30 +1237,31 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1189
1237
  mz_max_local = max(mz_vals_local)
1190
1238
  else:
1191
1239
  mz_min_local = mz_max_local = consensus_mz
1192
-
1240
+
1193
1241
  # Store chunk consensus with feature tracking
1194
1242
  # Generate unique 16-character consensus_id string
1195
1243
  import uuid
1196
- consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
1197
-
1244
+
1245
+ consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
1246
+
1198
1247
  chunk_consensus_data = {
1199
- 'consensus_id': consensus_id_str,
1200
- 'chunk_idx': chunk_idx,
1201
- 'chunk_start_idx': chunk_start_idx,
1202
- 'mz': consensus_mz,
1203
- 'rt': consensus_rt,
1204
- 'mz_min': mz_min_local,
1205
- 'mz_max': mz_max_local,
1206
- 'rt_min': rt_min_local,
1207
- 'rt_max': rt_max_local,
1208
- 'intensity': consensus_intensity,
1209
- 'quality': consensus_quality,
1210
- 'feature_uids': feature_uids,
1211
- 'feature_data_list': feature_data_list,
1212
- 'sample_uids': sample_uids,
1213
- 'sample_count': len(feature_data_list)
1248
+ "consensus_id": consensus_id_str,
1249
+ "chunk_idx": chunk_idx,
1250
+ "chunk_start_idx": chunk_start_idx,
1251
+ "mz": consensus_mz,
1252
+ "rt": consensus_rt,
1253
+ "mz_min": mz_min_local,
1254
+ "mz_max": mz_max_local,
1255
+ "rt_min": rt_min_local,
1256
+ "rt_max": rt_max_local,
1257
+ "intensity": consensus_intensity,
1258
+ "quality": consensus_quality,
1259
+ "feature_uids": feature_uids,
1260
+ "feature_data_list": feature_data_list,
1261
+ "sample_uids": sample_uids,
1262
+ "sample_count": len(feature_data_list),
1214
1263
  }
1215
-
1264
+
1216
1265
  all_chunk_consensus.append(chunk_consensus_data)
1217
1266
 
1218
1267
  if not all_chunk_consensus:
@@ -1220,37 +1269,38 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1220
1269
  study.consensus_df = pl.DataFrame()
1221
1270
  study.consensus_mapping_df = pl.DataFrame()
1222
1271
  return
1223
-
1272
+
1224
1273
  # CROSS-CHUNK DECHUNKING ALGORITHMS
1225
1274
  # Multiple algorithms available for combining chunk results
1226
-
1275
+
1227
1276
  class HierarchicalAnchorMerger:
1228
1277
  """
1229
1278
  Hierarchical Anchor Merger: Comprehensive cross-chunk feature preservation.
1230
1279
  Uses Union-Find clustering for transitive matching across multiple chunks.
1231
1280
  """
1281
+
1232
1282
  def __init__(self, rt_tol: float, mz_tol: float):
1233
1283
  self.rt_tol = rt_tol
1234
1284
  self.mz_tol = mz_tol
1235
-
1285
+
1236
1286
  def merge(self, chunk_consensus_list: list) -> list:
1237
1287
  """Fixed hierarchical merging with union-find clustering for complete feature preservation"""
1238
1288
  if not chunk_consensus_list:
1239
1289
  return []
1240
-
1290
+
1241
1291
  study.logger.debug(f"FIXED HierarchicalAnchorMerger: processing {len(chunk_consensus_list)} chunk features")
1242
-
1292
+
1243
1293
  # Union-Find data structure for transitive clustering
1244
1294
  class UnionFind:
1245
1295
  def __init__(self, n):
1246
1296
  self.parent = list(range(n))
1247
1297
  self.rank = [0] * n
1248
-
1298
+
1249
1299
  def find(self, x):
1250
1300
  if self.parent[x] != x:
1251
1301
  self.parent[x] = self.find(self.parent[x]) # Path compression
1252
1302
  return self.parent[x]
1253
-
1303
+
1254
1304
  def union(self, x, y):
1255
1305
  px, py = self.find(x), self.find(y)
1256
1306
  if px == py:
@@ -1262,55 +1312,55 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1262
1312
  if self.rank[px] == self.rank[py]:
1263
1313
  self.rank[px] += 1
1264
1314
  return True # Union was performed
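
The reason for a union-find structure here is transitivity: if feature A matches B and B matches C within tolerance, all three should land in one cluster even when A and C by themselves are too far apart. A minimal, standalone illustration of that behaviour (plain Python, independent of the merger class; the path-halving find below is a common variant of the path compression used above):

# Illustrative example (not part of the diff): transitive clustering with union-find.
parent = list(range(3))          # three features: A=0, B=1, C=2

def find(x):
    while parent[x] != x:
        parent[x] = parent[parent[x]]  # path halving
        x = parent[x]
    return x

def union(x, y):
    parent[find(y)] = find(x)

rt = [100.0, 100.9, 101.8]       # rt_tol = 1.0 s: A-B and B-C match, A-C does not
for i in range(3):
    for j in range(i + 1, 3):
        if abs(rt[i] - rt[j]) <= 1.0:
            union(i, j)

print({find(i) for i in range(3)})   # one root -> A, B and C form a single cluster
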
1265
-
1315
+
1266
1316
  n_features = len(chunk_consensus_list)
1267
1317
  uf = UnionFind(n_features)
1268
1318
  merges_made = 0
1269
-
1319
+
1270
1320
  # Optimized cross-chunk feature matching using KD-tree spatial indexing
1271
-
1321
+
1272
1322
  # Proper dimensional scaling for RT vs m/z
1273
- rt_scale = 1.0 # RT in seconds (1-30 min range)
1323
+ rt_scale = 1.0 # RT in seconds (1-30 min range)
1274
1324
  mz_scale = 100.0 # m/z in Da (100-1000 range) - scale to match RT magnitude
1275
-
1325
+
1276
1326
  # Build spatial index with scaled coordinates
1277
- points = np.array([[f['rt'] * rt_scale, f['mz'] * mz_scale] for f in chunk_consensus_list])
1327
+ points = np.array([[f["rt"] * rt_scale, f["mz"] * mz_scale] for f in chunk_consensus_list])
1278
1328
  tree = cKDTree(points, balanced_tree=True, compact_nodes=True)
1279
-
1329
+
1280
1330
  # Calculate proper Euclidean radius in scaled space
1281
1331
  scaled_rt_tol = self.rt_tol * rt_scale
1282
- scaled_mz_tol = self.mz_tol * mz_scale
1332
+ scaled_mz_tol = self.mz_tol * mz_scale
1283
1333
  radius = np.sqrt(scaled_rt_tol**2 + scaled_mz_tol**2)
1284
-
1334
+
1285
1335
  # Efficient neighbor search for feature matching
1286
1336
  for i in range(n_features):
1287
1337
  feature_i = chunk_consensus_list[i]
1288
- chunk_i = feature_i.get('chunk_idx', -1)
1289
-
1338
+ chunk_i = feature_i.get("chunk_idx", -1)
1339
+
1290
1340
  # Query spatial index for nearby features
1291
1341
  neighbor_indices = tree.query_ball_point(points[i], r=radius, p=2)
1292
-
1342
+
1293
1343
  for j in neighbor_indices:
1294
1344
  if i >= j: # Skip duplicates and self
1295
1345
  continue
1296
-
1346
+
1297
1347
  feature_j = chunk_consensus_list[j]
1298
- chunk_j = feature_j.get('chunk_idx', -1)
1299
-
1348
+ chunk_j = feature_j.get("chunk_idx", -1)
1349
+
1300
1350
  # Skip features from same chunk (already clustered within chunk)
1301
1351
  if chunk_i == chunk_j:
1302
1352
  continue
1303
-
1353
+
1304
1354
  # Verify with precise original tolerances (more accurate than scaled)
1305
- rt_diff = abs(feature_i['rt'] - feature_j['rt'])
1306
- mz_diff = abs(feature_i['mz'] - feature_j['mz'])
1307
-
1355
+ rt_diff = abs(feature_i["rt"] - feature_j["rt"])
1356
+ mz_diff = abs(feature_i["mz"] - feature_j["mz"])
1357
+
1308
1358
  if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
1309
1359
  if uf.union(i, j): # Merge if not already connected
1310
1360
  merges_made += 1
1311
-
1361
+
1312
1362
  study.logger.debug(f"FIXED HierarchicalAnchorMerger: made {merges_made} cross-chunk merges")
1313
-
1363
+
1314
1364
  # Group features by their connected component
1315
1365
  clusters = {}
1316
1366
  for i in range(n_features):
@@ -1318,190 +1368,196 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1318
1368
  if root not in clusters:
1319
1369
  clusters[root] = []
1320
1370
  clusters[root].append(chunk_consensus_list[i])
1321
-
1371
+
1322
1372
  # Merge each cluster into a single consensus feature
1323
1373
  result = []
1324
1374
  for cluster_features in clusters.values():
1325
1375
  merged = self._merge_cluster(cluster_features)
1326
1376
  result.append(merged)
1327
-
1328
- study.logger.debug(f"FIXED HierarchicalAnchorMerger: output {len(result)} merged features (from {n_features} inputs)")
1329
-
1377
+
1378
+ study.logger.debug(
1379
+ f"FIXED HierarchicalAnchorMerger: output {len(result)} merged features (from {n_features} inputs)"
1380
+ )
1381
+
1330
1382
  # VERIFICATION: Ensure we haven't lost features
1331
1383
  if len(result) > len(chunk_consensus_list):
1332
- study.logger.warning(f"FIXED HierarchicalAnchorMerger: More outputs than inputs ({len(result)} > {n_features})")
1333
-
1384
+ study.logger.warning(
1385
+ f"FIXED HierarchicalAnchorMerger: More outputs than inputs ({len(result)} > {n_features})"
1386
+ )
1387
+
1334
1388
  return result
1335
-
1389
+
1336
1390
  def _merge_cluster(self, cluster: list) -> dict:
1337
1391
  """Merge cluster using sample-weighted consensus with robust error handling"""
1338
1392
  if len(cluster) == 1:
1339
1393
  return cluster[0] # No merging needed for single feature
1340
-
1394
+
1341
1395
  # Calculate weights robustly to prevent division by zero
1342
1396
  weights = []
1343
1397
  for c in cluster:
1344
- sample_count = c.get('sample_count', 0)
1398
+ sample_count = c.get("sample_count", 0)
1345
1399
  # Use minimum weight of 1 to prevent zero weights
1346
1400
  weights.append(max(sample_count, 1))
1347
-
1401
+
1348
1402
  total_weight = sum(weights)
1349
1403
  # Fallback for edge cases
1350
1404
  if total_weight == 0:
1351
1405
  total_weight = len(cluster)
1352
1406
  weights = [1] * len(cluster)
1353
-
1407
+
1354
1408
  # Weighted consensus for RT/mz coordinates
1355
1409
  merged = {
1356
- 'consensus_id': cluster[0]['consensus_id'], # Use first feature's ID
1357
- 'chunk_indices': [c.get('chunk_idx', 0) for c in cluster],
1358
- 'mz': sum(c['mz'] * w for c, w in zip(cluster, weights)) / total_weight,
1359
- 'rt': sum(c['rt'] * w for c, w in zip(cluster, weights)) / total_weight,
1360
- 'intensity': sum(c.get('intensity', 0) for c in cluster),
1361
- 'quality': sum(c.get('quality', 1) * w for c, w in zip(cluster, weights)) / total_weight,
1362
- 'feature_uids': [],
1363
- 'feature_data_list': [],
1364
- 'sample_uids': [],
1365
- 'sample_count': 0
1410
+ "consensus_id": cluster[0]["consensus_id"], # Use first feature's ID
1411
+ "chunk_indices": [c.get("chunk_idx", 0) for c in cluster],
1412
+ "mz": sum(c["mz"] * w for c, w in zip(cluster, weights)) / total_weight,
1413
+ "rt": sum(c["rt"] * w for c, w in zip(cluster, weights)) / total_weight,
1414
+ "intensity": sum(c.get("intensity", 0) for c in cluster),
1415
+ "quality": sum(c.get("quality", 1) * w for c, w in zip(cluster, weights)) / total_weight,
1416
+ "feature_uids": [],
1417
+ "feature_data_list": [],
1418
+ "sample_uids": [],
1419
+ "sample_count": 0,
1366
1420
  }
1367
-
1421
+
1368
1422
  # Aggregate all features and samples from all chunks
1369
1423
  all_feature_uids = []
1370
1424
  all_feature_data = []
1371
1425
  all_sample_uids = []
1372
-
1426
+
1373
1427
  for chunk in cluster:
1374
1428
  # Collect feature UIDs
1375
- chunk_feature_uids = chunk.get('feature_uids', [])
1429
+ chunk_feature_uids = chunk.get("feature_uids", [])
1376
1430
  all_feature_uids.extend(chunk_feature_uids)
1377
-
1431
+
1378
1432
  # Collect feature data
1379
- chunk_feature_data = chunk.get('feature_data_list', [])
1433
+ chunk_feature_data = chunk.get("feature_data_list", [])
1380
1434
  all_feature_data.extend(chunk_feature_data)
1381
-
1435
+
1382
1436
  # Collect sample UIDs
1383
- chunk_sample_uids = chunk.get('sample_uids', [])
1437
+ chunk_sample_uids = chunk.get("sample_uids", [])
1384
1438
  all_sample_uids.extend(chunk_sample_uids)
1385
-
1439
+
1386
1440
  # Remove duplicates properly and count unique samples
1387
- merged['feature_uids'] = list(set(all_feature_uids))
1388
- merged['feature_data_list'] = all_feature_data # Keep all feature data
1389
- merged['sample_uids'] = list(set(all_sample_uids)) # Unique sample UIDs only
1390
- merged['sample_count'] = len(merged['sample_uids']) # Count of unique samples
1391
-
1441
+ merged["feature_uids"] = list(set(all_feature_uids))
1442
+ merged["feature_data_list"] = all_feature_data # Keep all feature data
1443
+ merged["sample_uids"] = list(set(all_sample_uids)) # Unique sample UIDs only
1444
+ merged["sample_count"] = len(merged["sample_uids"]) # Count of unique samples
1445
+
1392
1446
  return merged
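
As a quick numeric check of the sample-weighted consensus computed above: if one chunk contributes a feature at m/z 300.1620 observed in 3 samples and another chunk the same feature at 300.1628 observed in 1 sample, the merged coordinate is pulled toward the better-supported chunk. The values below are invented for illustration:

# Illustrative example (not the package's code): sample-count-weighted consensus.
cluster = [
    {"mz": 300.1620, "rt": 120.0, "sample_count": 3},
    {"mz": 300.1628, "rt": 120.4, "sample_count": 1},
]
weights = [max(c["sample_count"], 1) for c in cluster]   # never zero
total = sum(weights)
mz = sum(c["mz"] * w for c, w in zip(cluster, weights)) / total
rt = sum(c["rt"] * w for c, w in zip(cluster, weights)) / total
print(f"{mz:.4f} {rt:.1f}")   # 300.1622 120.1 - weighted toward the 3-sample chunk
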
1393
-
1447
+
1394
1448
  class KDTreeSpatialMerger:
1395
1449
  """
1396
1450
  KD-Tree Spatial Merger: Optimized for high-sample features.
1397
1451
  """
1452
+
1398
1453
  def __init__(self, rt_tol: float, mz_tol: float):
1399
1454
  self.rt_tol = rt_tol
1400
1455
  self.mz_tol = mz_tol
1401
-
1456
+
1402
1457
  def merge(self, chunk_consensus_list: list) -> list:
1403
1458
  """KD-tree based spatial merging"""
1404
1459
  if not chunk_consensus_list:
1405
1460
  return []
1406
-
1461
+
1407
1462
  try:
1408
1463
  from scipy.spatial import cKDTree
1409
1464
  import numpy as np
1410
1465
  except ImportError:
1411
1466
  # Fallback to simple clustering if scipy not available
1412
1467
  return self._fallback_merge(chunk_consensus_list)
1413
-
1468
+
1414
1469
  # Build spatial index
1415
- points = np.array([[c['rt'], c['mz']] for c in chunk_consensus_list])
1470
+ points = np.array([[c["rt"], c["mz"]] for c in chunk_consensus_list])
1416
1471
  tree = cKDTree(points)
1417
-
1472
+
1418
1473
  # Scale tolerances for KD-tree query
1419
1474
  rt_scale = 1.0 / self.rt_tol if self.rt_tol > 0 else 1.0
1420
1475
  mz_scale = 1.0 / self.mz_tol if self.mz_tol > 0 else 1.0
1421
1476
  scaled_points = points * np.array([rt_scale, mz_scale])
1422
1477
  scaled_tree = cKDTree(scaled_points)
1423
-
1478
+
1424
1479
  clusters = []
1425
1480
  used = set()
1426
-
1481
+
1427
1482
  # Priority processing for high-sample features
1428
- high_sample_indices = [i for i, c in enumerate(chunk_consensus_list) if c['sample_count'] >= 100]
1483
+ high_sample_indices = [i for i, c in enumerate(chunk_consensus_list) if c["sample_count"] >= 100]
1429
1484
  remaining_indices = [i for i in range(len(chunk_consensus_list)) if i not in high_sample_indices]
1430
-
1485
+
1431
1486
  for idx in high_sample_indices + remaining_indices:
1432
1487
  if idx in used:
1433
1488
  continue
1434
-
1489
+
1435
1490
  # Find neighbors in scaled space
1436
1491
  neighbors = scaled_tree.query_ball_point(scaled_points[idx], r=1.0)
1437
1492
  cluster_indices = [i for i in neighbors if i not in used and i != idx]
1438
1493
  cluster_indices.append(idx)
1439
-
1494
+
1440
1495
  if cluster_indices:
1441
1496
  cluster = [chunk_consensus_list[i] for i in cluster_indices]
1442
1497
  clusters.append(self._merge_cluster(cluster))
1443
1498
  used.update(cluster_indices)
1444
-
1499
+
1445
1500
  return clusters
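
A slightly different scaling trick is used here than in the hierarchical merger: each axis is divided by its own tolerance, so one unit of scaled distance corresponds to one tolerance and a fixed query radius of 1.0 approximates "within rt_tol and within mz_tol". The idea in isolation, with invented coordinates (scipy assumed available, as above):

# Illustrative sketch (not the package's code): tolerance-normalized coordinates.
import numpy as np
from scipy.spatial import cKDTree

rt_tol, mz_tol = 2.0, 0.01
feats = np.array([[120.0, 300.1612], [121.5, 300.1618], [120.2, 300.20]])

scaled = feats / np.array([rt_tol, mz_tol])      # 1 unit == 1 tolerance on each axis
tree = cKDTree(scaled)
neighbors = tree.query_ball_point(scaled[0], r=1.0)
print(sorted(neighbors))   # [0, 1]: the third point is about 4 tolerances away in m/z
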
1446
-
1501
+
1447
1502
  def _fallback_merge(self, chunk_consensus_list: list) -> list:
1448
1503
  """Simple distance-based fallback when scipy unavailable"""
1449
1504
  clusters = []
1450
1505
  used = set()
1451
-
1506
+
1452
1507
  for i, anchor in enumerate(chunk_consensus_list):
1453
1508
  if i in used:
1454
1509
  continue
1455
-
1510
+
1456
1511
  cluster = [anchor]
1457
1512
  used.add(i)
1458
-
1513
+
1459
1514
  for j, candidate in enumerate(chunk_consensus_list):
1460
1515
  if j in used or j == i:
1461
1516
  continue
1462
-
1463
- rt_diff = abs(candidate['rt'] - anchor['rt'])
1464
- mz_diff = abs(candidate['mz'] - anchor['mz'])
1465
-
1517
+
1518
+ rt_diff = abs(candidate["rt"] - anchor["rt"])
1519
+ mz_diff = abs(candidate["mz"] - anchor["mz"])
1520
+
1466
1521
  if rt_diff <= self.rt_tol and mz_diff <= self.mz_tol:
1467
1522
  cluster.append(candidate)
1468
1523
  used.add(j)
1469
-
1524
+
1470
1525
  clusters.append(self._merge_cluster(cluster))
1471
-
1526
+
1472
1527
  return clusters
1473
-
1528
+
1474
1529
  def _merge_cluster(self, cluster: list) -> dict:
1475
1530
  """Merge cluster with intensity-weighted consensus"""
1476
1531
  if len(cluster) == 1:
1477
1532
  return cluster[0]
1478
-
1533
+
1479
1534
  # Weight by intensity for spatial accuracy
1480
- total_intensity = sum(c['intensity'] for c in cluster)
1481
-
1535
+ total_intensity = sum(c["intensity"] for c in cluster)
1536
+
1482
1537
  merged = {
1483
- 'consensus_id': cluster[0]['consensus_id'],
1484
- 'chunk_indices': [c['chunk_idx'] for c in cluster],
1485
- 'mz': sum(c['mz'] * c['intensity'] for c in cluster) / total_intensity,
1486
- 'rt': sum(c['rt'] * c['intensity'] for c in cluster) / total_intensity,
1487
- 'intensity': total_intensity,
1488
- 'quality': sum(c['quality'] for c in cluster) / len(cluster),
1489
- 'feature_uids': [],
1490
- 'feature_data_list': [],
1491
- 'sample_uids': [],
1492
- 'sample_count': 0
1538
+ "consensus_id": cluster[0]["consensus_id"],
1539
+ "chunk_indices": [c["chunk_idx"] for c in cluster],
1540
+ "mz": sum(c["mz"] * c["intensity"] for c in cluster) / total_intensity,
1541
+ "rt": sum(c["rt"] * c["intensity"] for c in cluster) / total_intensity,
1542
+ "intensity": total_intensity,
1543
+ "quality": sum(c["quality"] for c in cluster) / len(cluster),
1544
+ "feature_uids": [],
1545
+ "feature_data_list": [],
1546
+ "sample_uids": [],
1547
+ "sample_count": 0,
1493
1548
  }
1494
-
1549
+
1495
1550
  # Aggregate features
1496
1551
  for chunk in cluster:
1497
- merged['feature_uids'].extend(chunk['feature_uids'])
1498
- merged['feature_data_list'].extend(chunk['feature_data_list'])
1499
- merged['sample_uids'].extend(chunk['sample_uids'])
1500
-
1501
- merged['feature_uids'] = list(set(merged['feature_uids']))
1502
- merged['sample_count'] = len(set(merged['sample_uids']))
1503
-
1552
+ merged["feature_uids"].extend(chunk["feature_uids"])
1553
+ merged["feature_data_list"].extend(chunk["feature_data_list"])
1554
+ merged["sample_uids"].extend(chunk["sample_uids"])
1555
+
1556
+ merged["feature_uids"] = list(set(merged["feature_uids"]))
1557
+ merged["sample_count"] = len(set(merged["sample_uids"]))
1558
+
1504
1559
  return merged
1560
+
1505
1561
  # SELECT DECHUNKING ALGORITHM BASED ON PARAMETER
1506
1562
  if params.dechunking == "hierarchical":
1507
1563
  merger = HierarchicalAnchorMerger(params.rt_tol, params.mz_tol)
@@ -1523,7 +1579,7 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1523
1579
  for group in refined_groups:
1524
1580
  if not group:
1525
1581
  continue
1526
-
1582
+
1527
1583
  # Aggregate underlying feature data (deduplicated by feature_uid)
1528
1584
  feature_data_acc = {}
1529
1585
  sample_uids_acc = set()
@@ -1533,25 +1589,25 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1533
1589
  quality_values_chunk = []
1534
1590
 
1535
1591
  for cf in group:
1536
- rt_values_chunk.append(cf['rt'])
1537
- mz_values_chunk.append(cf['mz'])
1538
- intensity_values_chunk.append(cf.get('intensity', 0.0) or 0.0)
1539
- quality_values_chunk.append(cf.get('quality', 1.0) or 1.0)
1540
-
1541
- for fd, samp_uid in zip(cf['feature_data_list'], cf['sample_uids']):
1542
- fid = fd.get('feature_uid') or fd.get('uid') or fd.get('feature_id')
1592
+ rt_values_chunk.append(cf["rt"])
1593
+ mz_values_chunk.append(cf["mz"])
1594
+ intensity_values_chunk.append(cf.get("intensity", 0.0) or 0.0)
1595
+ quality_values_chunk.append(cf.get("quality", 1.0) or 1.0)
1596
+
1597
+ for fd, samp_uid in zip(cf["feature_data_list"], cf["sample_uids"]):
1598
+ fid = fd.get("feature_uid") or fd.get("uid") or fd.get("feature_id")
1543
1599
  # feature_uid expected in fd under 'feature_uid'; fallback attempts just in case
1544
1600
  if fid is None:
1545
1601
  continue
1546
1602
  if fid not in feature_data_acc:
1547
1603
  feature_data_acc[fid] = fd
1548
1604
  sample_uids_acc.add(samp_uid)
1549
-
1605
+
1550
1606
  if not feature_data_acc:
1551
1607
  continue
1552
1608
 
1553
1609
  number_samples = len(sample_uids_acc)
1554
-
1610
+
1555
1611
  # This allows proper cross-chunk consensus building before final filtering
1556
1612
 
1557
1613
  metadata = _calculate_consensus_statistics(
@@ -1567,46 +1623,46 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1567
1623
  cached_adducts_df=cached_adducts_df,
1568
1624
  cached_valid_adducts=cached_valid_adducts,
1569
1625
  )
1570
-
1626
+
1571
1627
  # Validate RT and m/z spread don't exceed tolerance limits
1572
- rt_spread = metadata.get('rt_max', 0) - metadata.get('rt_min', 0)
1573
- mz_spread = metadata.get('mz_max', 0) - metadata.get('mz_min', 0)
1628
+ rt_spread = metadata.get("rt_max", 0) - metadata.get("rt_min", 0)
1629
+ mz_spread = metadata.get("mz_max", 0) - metadata.get("mz_min", 0)
1574
1630
  max_allowed_rt_spread = params.rt_tol * 2 # Allow 2x tolerance for chunked method
1575
1631
  max_allowed_mz_spread = params.mz_tol * 2 # Enforce strict m/z spread limit
1576
-
1632
+
1577
1633
  skip_feature = False
1578
1634
  skip_reason = ""
1579
-
1635
+
1580
1636
  if rt_spread > max_allowed_rt_spread:
1581
1637
  skip_feature = True
1582
1638
  skip_reason = f"RT spread {rt_spread:.3f}s > {max_allowed_rt_spread:.3f}s"
1583
-
1639
+
1584
1640
  if mz_spread > max_allowed_mz_spread:
1585
1641
  skip_feature = True
1586
1642
  if skip_reason:
1587
1643
  skip_reason += f" AND m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
1588
1644
  else:
1589
1645
  skip_reason = f"m/z spread {mz_spread:.4f} Da > {max_allowed_mz_spread:.4f} Da"
1590
-
1646
+
1591
1647
  if skip_feature:
1592
1648
  # Skip consensus features with excessive spread
1593
1649
  study.logger.debug(f"Skipping consensus feature {consensus_uid_counter}: {skip_reason}")
1594
1650
  consensus_uid_counter += 1
1595
1651
  continue
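
The gate above only keeps a stitched consensus feature if the RT and m/z ranges reported in its metadata stay within twice the clustering tolerances; anything wider is treated as an over-merge across chunks and dropped. A tiny numeric illustration with invented values:

# Illustrative check (not the package's code): 2x-tolerance spread gate.
rt_tol, mz_tol = 2.0, 0.01
rt_spread, mz_spread = 123.5 - 118.0, 300.1655 - 300.1615
keep = rt_spread <= 2 * rt_tol and mz_spread <= 2 * mz_tol
print(keep)   # False: RT spread 5.5 s exceeds the 4.0 s limit, even though m/z is fine
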
1596
-
1652
+
1597
1653
  consensus_metadata.append(metadata)
1598
1654
 
1599
1655
  # Build mapping rows (deduplicated)
1600
1656
  for fid, fd in feature_data_acc.items():
1601
- samp_uid = fd.get('sample_uid') or fd.get('sample_id') or fd.get('sample')
1602
-
1657
+ samp_uid = fd.get("sample_uid") or fd.get("sample_id") or fd.get("sample")
1658
+
1603
1659
  # If absent we attempt to derive from original group sample_uids pairing
1604
1660
  # but most feature_data rows should include sample_uid already.
1605
1661
  if samp_uid is None:
1606
1662
  # fallback: search for cf containing this fid
1607
1663
  for cf in group:
1608
- for fd2, samp2 in zip(cf['feature_data_list'], cf['sample_uids']):
1609
- f2id = fd2.get('feature_uid') or fd2.get('uid') or fd2.get('feature_id')
1664
+ for fd2, samp2 in zip(cf["feature_data_list"], cf["sample_uids"]):
1665
+ f2id = fd2.get("feature_uid") or fd2.get("uid") or fd2.get("feature_id")
1610
1666
  if f2id == fid:
1611
1667
  samp_uid = samp2
1612
1668
  break
@@ -1615,9 +1671,9 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1615
1671
  if samp_uid is None:
1616
1672
  continue
1617
1673
  consensus_mapping_list.append({
1618
- 'consensus_uid': consensus_uid_counter,
1619
- 'sample_uid': samp_uid,
1620
- 'feature_uid': fid,
1674
+ "consensus_uid": consensus_uid_counter,
1675
+ "sample_uid": samp_uid,
1676
+ "feature_uid": fid,
1621
1677
  })
1622
1678
 
1623
1679
  consensus_uid_counter += 1
@@ -1628,9 +1684,9 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1628
1684
 
1629
1685
  # Ensure mapping only contains features from retained consensus_df
1630
1686
  if len(study.consensus_df) > 0:
1631
- valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
1687
+ valid_consensus_ids = set(study.consensus_df["consensus_uid"].to_list())
1632
1688
  study.consensus_mapping_df = study.consensus_mapping_df.filter(
1633
- pl.col('consensus_uid').is_in(list(valid_consensus_ids))
1689
+ pl.col("consensus_uid").is_in(list(valid_consensus_ids))
1634
1690
  )
1635
1691
  else:
1636
1692
  study.consensus_mapping_df = pl.DataFrame()
@@ -1640,28 +1696,36 @@ def _dechunk_results(study, chunk_consensus_maps: list, params: merge_defaults,
1640
1696
  return
1641
1697
 
1642
1698
 
1643
- def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
1644
- rt_values: list, mz_values: list,
1645
- intensity_values: list, quality_values: list,
1646
- number_features: int | None = None, number_samples: int | None = None,
1647
- cached_adducts_df=None, cached_valid_adducts=None) -> dict:
1699
+ def _calculate_consensus_statistics(
1700
+ study_obj,
1701
+ consensus_uid: int,
1702
+ feature_data_list: list,
1703
+ rt_values: list,
1704
+ mz_values: list,
1705
+ intensity_values: list,
1706
+ quality_values: list,
1707
+ number_features: int | None = None,
1708
+ number_samples: int | None = None,
1709
+ cached_adducts_df=None,
1710
+ cached_valid_adducts=None,
1711
+ ) -> dict:
1648
1712
  """
1649
1713
  Calculate comprehensive statistics for a consensus feature from aggregated feature data.
1650
-
1714
+
1651
1715
  Args:
1652
1716
  consensus_uid: Unique ID for this consensus feature
1653
1717
  feature_data_list: List of individual feature dictionaries
1654
1718
  rt_values: RT values from chunk consensus features
1655
- mz_values: m/z values from chunk consensus features
1719
+ mz_values: m/z values from chunk consensus features
1656
1720
  intensity_values: Intensity values from chunk consensus features
1657
1721
  quality_values: Quality values from chunk consensus features
1658
-
1722
+
1659
1723
  Returns:
1660
1724
  Dictionary with consensus feature metadata
1661
1725
  """
1662
1726
  if not feature_data_list:
1663
1727
  return {}
1664
-
1728
+
1665
1729
  # Convert feature data to numpy arrays for vectorized computation
1666
1730
  rt_feat_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
1667
1731
  mz_feat_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
@@ -1671,41 +1735,51 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1671
1735
  mz_start_values = np.array([fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None])
1672
1736
  mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
1673
1737
  inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
1674
- coherence_values = np.array([fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None])
1675
- prominence_values = np.array([fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None])
1676
- prominence_scaled_values = np.array([fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None])
1677
- height_scaled_values = np.array([fd.get("chrom_prominence_scaled", 0) for fd in feature_data_list if fd.get("chrom_prominence_scaled") is not None])
1738
+ coherence_values = np.array([
1739
+ fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
1740
+ ])
1741
+ prominence_values = np.array([
1742
+ fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
1743
+ ])
1744
+ prominence_scaled_values = np.array([
1745
+ fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
1746
+ ])
1747
+ height_scaled_values = np.array([
1748
+ fd.get("chrom_prominence_scaled", 0)
1749
+ for fd in feature_data_list
1750
+ if fd.get("chrom_prominence_scaled") is not None
1751
+ ])
1678
1752
  iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
1679
1753
  charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
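
Each of these arrays is built with the same guard: rows with a missing field are skipped, and the summary statistic is only taken when something survived (the "if len(...) > 0 else 0.0" pattern used when the metadata dictionary is assembled below). In isolation, with invented intensities:

# Illustrative example (not the package's code): None-safe aggregation.
import numpy as np

feature_data_list = [{"inty": 1.5e5}, {"inty": None}, {"inty": 2.5e5}, {}]
inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
inty_mean = round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0
print(inty_mean)   # 200000.0 - the None and missing entries are ignored
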
1680
-
1754
+
1681
1755
  # Process adducts with cached validation
1682
1756
  all_adducts = []
1683
1757
  valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
1684
1758
  valid_adducts.add("?") # Always allow '?' adducts
1685
-
1759
+
1686
1760
  for fd in feature_data_list:
1687
1761
  adduct = fd.get("adduct")
1688
1762
  if adduct is not None:
1689
1763
  # Only include adducts that are valid (from cached study adducts or contain '?')
1690
1764
  if adduct in valid_adducts or "?" in adduct:
1691
1765
  all_adducts.append(adduct)
1692
-
1766
+
1693
1767
  # Calculate adduct consensus
1694
1768
  adduct_values = []
1695
1769
  adduct_top = None
1696
1770
  adduct_charge_top = None
1697
1771
  adduct_mass_neutral_top = None
1698
1772
  adduct_mass_shift_top = None
1699
-
1773
+
1700
1774
  if all_adducts:
1701
1775
  adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
1702
1776
  total_count = sum(adduct_counts.values())
1703
1777
  for adduct, count in adduct_counts.items():
1704
1778
  percentage = (count / total_count) * 100 if total_count > 0 else 0
1705
1779
  adduct_values.append([str(adduct), int(count), float(round(percentage, 2))])
1706
-
1780
+
1707
1781
  adduct_values.sort(key=lambda x: x[1], reverse=True)
1708
-
1782
+
1709
1783
  if adduct_values:
1710
1784
  adduct_top = adduct_values[0][0]
1711
1785
  # Try to get charge and mass shift from cached study adducts
@@ -1719,7 +1793,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1719
1793
  adduct_charge_top = adduct_row["charge"]
1720
1794
  adduct_mass_shift_top = adduct_row["mass_shift"]
1721
1795
  adduct_found = True
1722
-
1796
+
1723
1797
  if not adduct_found:
1724
1798
  # Set default charge and mass shift for top adduct
1725
1799
  adduct_charge_top = 1
@@ -1735,26 +1809,27 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1735
1809
  adduct_top = "[M+?]1+"
1736
1810
  adduct_charge_top = 1
1737
1811
  adduct_mass_shift_top = 1.007825
1738
-
1812
+
1739
1813
  adduct_values = [[adduct_top, 1, 100.0]]
1740
-
1814
+
1741
1815
  # Calculate neutral mass
1742
1816
  consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
1743
1817
  if adduct_charge_top and adduct_mass_shift_top is not None:
1744
1818
  adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
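
The neutral-mass line above is simply mass_neutral = mz * |charge| - mass_shift, with the shift taken from the top-ranked adduct. A quick numeric check for a singly charged species, using the same 1.007825 Da shift the fallback branch assigns; the observed m/z is invented:

# Illustrative example (not the package's code): neutral mass from the top adduct.
consensus_mz = 301.1698          # observed [M+H]+ m/z (invented)
adduct_charge_top = 1
adduct_mass_shift_top = 1.007825 # hydrogen shift used by the fallback above
neutral = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
print(round(neutral, 4))         # 300.162
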
1745
-
1819
+
1746
1820
  # Calculate MS2 count
1747
1821
  ms2_count = 0
1748
1822
  for fd in feature_data_list:
1749
1823
  ms2_scans = fd.get("ms2_scans")
1750
1824
  if ms2_scans is not None:
1751
1825
  ms2_count += len(ms2_scans)
1752
-
1826
+
1753
1827
  # Build consensus metadata
1754
1828
  # Generate unique 16-character consensus_id string
1755
1829
  import uuid
1756
- consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
1757
-
1830
+
1831
+ consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
1832
+
1758
1833
  return {
1759
1834
  "consensus_uid": int(consensus_uid),
1760
1835
  "consensus_id": consensus_id_str, # Use unique 16-char string ID
@@ -1777,8 +1852,12 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1777
1852
  "bl": -1.0,
1778
1853
  "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
1779
1854
  "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
1780
- "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3) if len(prominence_scaled_values) > 0 else 0.0,
1781
- "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3) if len(height_scaled_values) > 0 else 0.0,
1855
+ "chrom_prominence_scaled_mean": round(float(np.mean(prominence_scaled_values)), 3)
1856
+ if len(prominence_scaled_values) > 0
1857
+ else 0.0,
1858
+ "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
1859
+ if len(height_scaled_values) > 0
1860
+ else 0.0,
1782
1861
  "iso": None, # Will be filled by find_iso() function
1783
1862
  "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
1784
1863
  "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
@@ -1799,10 +1878,7 @@ def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_
1799
1878
  def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
1800
1879
  """Extract consensus features and build metadata."""
1801
1880
  # create a dict to map uid to feature_uid using study.features_df
1802
- feature_uid_map = {
1803
- row["feature_id"]: row["feature_uid"]
1804
- for row in study.features_df.iter_rows(named=True)
1805
- }
1881
+ feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in study.features_df.iter_rows(named=True)}
1806
1882
  imax = consensus_map.size()
1807
1883
 
1808
1884
  study.logger.debug(f"Found {imax} feature groups by clustering.")
@@ -1862,67 +1938,31 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
1862
1938
  [fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None],
1863
1939
  )
1864
1940
  rt_start_values = np.array(
1865
- [
1866
- fd.get("rt_start", 0)
1867
- for fd in feature_data_list
1868
- if fd.get("rt_start") is not None
1869
- ],
1941
+ [fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None],
1870
1942
  )
1871
1943
  rt_end_values = np.array(
1872
- [
1873
- fd.get("rt_end", 0)
1874
- for fd in feature_data_list
1875
- if fd.get("rt_end") is not None
1876
- ],
1944
+ [fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None],
1877
1945
  )
1878
1946
  rt_delta_values = np.array(
1879
- [
1880
- fd.get("rt_delta", 0)
1881
- for fd in feature_data_list
1882
- if fd.get("rt_delta") is not None
1883
- ],
1947
+ [fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None],
1884
1948
  )
1885
1949
  mz_start_values = np.array(
1886
- [
1887
- fd.get("mz_start", 0)
1888
- for fd in feature_data_list
1889
- if fd.get("mz_start") is not None
1890
- ],
1950
+ [fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None],
1891
1951
  )
1892
1952
  mz_end_values = np.array(
1893
- [
1894
- fd.get("mz_end", 0)
1895
- for fd in feature_data_list
1896
- if fd.get("mz_end") is not None
1897
- ],
1953
+ [fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None],
1898
1954
  )
1899
1955
  inty_values = np.array(
1900
- [
1901
- fd.get("inty", 0)
1902
- for fd in feature_data_list
1903
- if fd.get("inty") is not None
1904
- ],
1956
+ [fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None],
1905
1957
  )
1906
1958
  coherence_values = np.array(
1907
- [
1908
- fd.get("chrom_coherence", 0)
1909
- for fd in feature_data_list
1910
- if fd.get("chrom_coherence") is not None
1911
- ],
1959
+ [fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None],
1912
1960
  )
1913
1961
  prominence_values = np.array(
1914
- [
1915
- fd.get("chrom_prominence", 0)
1916
- for fd in feature_data_list
1917
- if fd.get("chrom_prominence") is not None
1918
- ],
1962
+ [fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None],
1919
1963
  )
1920
1964
  prominence_scaled_values = np.array(
1921
- [
1922
- fd.get("chrom_height_scaled", 0)
1923
- for fd in feature_data_list
1924
- if fd.get("chrom_height_scaled") is not None
1925
- ],
1965
+ [fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None],
1926
1966
  )
1927
1967
  height_scaled_values = np.array(
1928
1968
  [
@@ -1935,11 +1975,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
1935
1975
  [fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None],
1936
1976
  )
1937
1977
  charge_values = np.array(
1938
- [
1939
- fd.get("charge", 0)
1940
- for fd in feature_data_list
1941
- if fd.get("charge") is not None
1942
- ],
1978
+ [fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None],
1943
1979
  )
1944
1980
 
1945
1981
  # adduct_values
@@ -1967,9 +2003,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
1967
2003
  # Calculate adduct_values for the consensus feature
1968
2004
  adduct_values = []
1969
2005
  if all_adducts:
1970
- adduct_counts = {
1971
- adduct: all_adducts.count(adduct) for adduct in set(all_adducts)
1972
- }
2006
+ adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
1973
2007
  total_count = sum(adduct_counts.values())
1974
2008
  for adduct, count in adduct_counts.items():
1975
2009
  percentage = (count / total_count) * 100 if total_count > 0 else 0
@@ -2055,11 +2089,7 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2055
2089
  element,
2056
2090
  1.007825,
2057
2091
  ) # Default to H if unknown
2058
- mass_shift = (
2059
- base_mass * multiplier
2060
- if sign == "+"
2061
- else -base_mass * multiplier
2062
- )
2092
+ mass_shift = base_mass * multiplier if sign == "+" else -base_mass * multiplier
2063
2093
  adduct_mass_shift_top = mass_shift
2064
2094
  else:
2065
2095
  # Default fallback
@@ -2083,13 +2113,9 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2083
2113
  consensus_adduct_values = [[adduct_top, 1, 100.0]]
2084
2114
 
2085
2115
  # Calculate neutral mass from consensus mz (for both cases)
2086
- consensus_mz = (
2087
- round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
2088
- )
2116
+ consensus_mz = round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0
2089
2117
  if adduct_charge_top and adduct_mass_shift_top is not None:
2090
- adduct_mass_neutral_top = (
2091
- consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
2092
- )
2118
+ adduct_mass_neutral_top = consensus_mz * abs(adduct_charge_top) - adduct_mass_shift_top
2093
2119
 
2094
2120
  # Calculate number of MS2 spectra
2095
2121
  ms2_count = 0
@@ -2100,7 +2126,8 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2100
2126
 
2101
2127
  # Generate unique 16-character consensus_id string (UUID-based)
2102
2128
  import uuid
2103
- consensus_id_str = str(uuid.uuid4()).replace('-', '')[:16]
2129
+
2130
+ consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
2104
2131
 
2105
2132
  metadata_list.append(
2106
2133
  {
@@ -2109,48 +2136,20 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2109
2136
  "quality": round(float(feature.getQuality()), 3),
2110
2137
  "number_samples": len(feature_data_list),
2111
2138
  # "number_ext": int(len(features_list)),
2112
- "rt": round(float(np.mean(rt_values)), 4)
2113
- if len(rt_values) > 0
2114
- else 0.0,
2115
- "mz": round(float(np.mean(mz_values)), 4)
2116
- if len(mz_values) > 0
2117
- else 0.0,
2118
- "rt_min": round(float(np.min(rt_values)), 3)
2119
- if len(rt_values) > 0
2120
- else 0.0,
2121
- "rt_max": round(float(np.max(rt_values)), 3)
2122
- if len(rt_values) > 0
2123
- else 0.0,
2124
- "rt_mean": round(float(np.mean(rt_values)), 3)
2125
- if len(rt_values) > 0
2126
- else 0.0,
2127
- "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
2128
- if len(rt_start_values) > 0
2129
- else 0.0,
2130
- "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
2131
- if len(rt_end_values) > 0
2132
- else 0.0,
2133
- "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
2134
- if len(rt_delta_values) > 0
2135
- else 0.0,
2136
- "mz_min": round(float(np.min(mz_values)), 4)
2137
- if len(mz_values) > 0
2138
- else 0.0,
2139
- "mz_max": round(float(np.max(mz_values)), 4)
2140
- if len(mz_values) > 0
2141
- else 0.0,
2142
- "mz_mean": round(float(np.mean(mz_values)), 4)
2143
- if len(mz_values) > 0
2144
- else 0.0,
2145
- "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
2146
- if len(mz_start_values) > 0
2147
- else 0.0,
2148
- "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
2149
- if len(mz_end_values) > 0
2150
- else 0.0,
2151
- "inty_mean": round(float(np.mean(inty_values)), 0)
2152
- if len(inty_values) > 0
2153
- else 0.0,
2139
+ "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
2140
+ "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
2141
+ "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
2142
+ "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
2143
+ "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
2144
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
2145
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
2146
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
2147
+ "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
2148
+ "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
2149
+ "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
2150
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
2151
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
2152
+ "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
2154
2153
  "bl": -1.0,
2155
2154
  "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
2156
2155
  if len(coherence_values) > 0
@@ -2171,25 +2170,17 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2171
2170
  if len(height_scaled_values) > 0
2172
2171
  else 0.0,
2173
2172
  "iso": None, # Will be filled by find_iso() function
2174
- "iso_mean": round(float(np.mean(iso_values)), 2)
2175
- if len(iso_values) > 0
2176
- else 0.0,
2177
- "charge_mean": round(float(np.mean(charge_values)), 2)
2178
- if len(charge_values) > 0
2179
- else 0.0,
2173
+ "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
2174
+ "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
2180
2175
  "number_ms2": int(ms2_count),
2181
- "adducts": consensus_adduct_values
2182
- if consensus_adduct_values
2183
- else [], # Ensure it's always a list
2176
+ "adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
2184
2177
  # New columns for top-ranked adduct information
2185
2178
  "adduct_top": adduct_top,
2186
2179
  "adduct_charge_top": adduct_charge_top,
2187
2180
  "adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
2188
2181
  if adduct_mass_neutral_top is not None
2189
2182
  else None,
2190
- "adduct_mass_shift_top": round(adduct_mass_shift_top, 6)
2191
- if adduct_mass_shift_top is not None
2192
- else None,
2183
+ "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
2193
2184
  # New columns for top-scoring identification results
2194
2185
  "id_top_name": None,
2195
2186
  "id_top_class": None,
@@ -2238,16 +2229,13 @@ def _extract_consensus_features(study, consensus_map, min_samples, cached_adduct
2238
2229
  )
2239
2230
 
2240
2231
  # Log final counts
2241
- study.logger.info(
2242
- f"Extracted {len(study.consensus_df)} consensus features with "
2243
- f"at least {min_samples} samples."
2244
- )
2232
+ study.logger.info(f"Extracted {len(study.consensus_df)} consensus features with at least {min_samples} samples.")
2245
2233
 
2246
2234
 
2247
2235
  def _perform_adduct_grouping(study, rt_tol, mz_tol):
2248
2236
  """Perform adduct grouping on consensus features."""
2249
2237
  import polars as pl
2250
-
2238
+
2251
2239
  # Add adduct grouping and adduct_of assignment
2252
2240
  if len(study.consensus_df) > 0:
2253
2241
  # Get relevant columns for grouping
@@ -2264,9 +2252,7 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2264
2252
  },
2265
2253
  )
2266
2254
 
2267
- adduct_group_list, adduct_of_list = __merge_adduct_grouping(
2268
- study, consensus_data, rt_tol/3, mz_tol
2269
- )
2255
+ adduct_group_list, adduct_of_list = __merge_adduct_grouping(study, consensus_data, rt_tol / 3, mz_tol)
2270
2256
 
2271
2257
  # Add the new columns to consensus_df
2272
2258
  study.consensus_df = study.consensus_df.with_columns(
@@ -2280,52 +2266,48 @@ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2280
2266
  def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2281
2267
  """
2282
2268
  Count consensus features grouped in tight clusters.
2283
-
2269
+
2284
2270
  Args:
2285
2271
  mz_tol: m/z tolerance in Daltons for cluster detection
2286
2272
  rt_tol: RT tolerance in seconds for cluster detection
2287
-
2273
+
2288
2274
  Returns:
2289
2275
  Number of tight clusters found
2290
2276
  """
2291
2277
  if len(study.consensus_df) < 2:
2292
2278
  return 0
2293
-
2279
+
2294
2280
  # Extract consensus feature coordinates efficiently
2295
- feature_coords = study.consensus_df.select([
2296
- pl.col("consensus_uid"),
2297
- pl.col("mz"),
2298
- pl.col("rt")
2299
- ]).to_numpy()
2300
-
2281
+ feature_coords = study.consensus_df.select([pl.col("consensus_uid"), pl.col("mz"), pl.col("rt")]).to_numpy()
2282
+
2301
2283
  n_features = len(feature_coords)
2302
2284
  processed = [False] * n_features
2303
2285
  tight_clusters_count = 0
2304
-
2286
+
2305
2287
  # Use vectorized distance calculations for efficiency
2306
2288
  for i in range(n_features):
2307
2289
  if processed[i]:
2308
2290
  continue
2309
-
2291
+
2310
2292
  # Find all features within tolerance of feature i
2311
2293
  cluster_members = [i]
2312
2294
  rt_i, mz_i = feature_coords[i][2], feature_coords[i][1]
2313
-
2295
+
2314
2296
  for j in range(i + 1, n_features):
2315
2297
  if processed[j]:
2316
2298
  continue
2317
-
2299
+
2318
2300
  rt_j, mz_j = feature_coords[j][2], feature_coords[j][1]
2319
-
2301
+
2320
2302
  if abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol:
2321
2303
  cluster_members.append(j)
2322
-
2304
+
2323
2305
  # Mark cluster as tight if it has 2+ members
2324
2306
  if len(cluster_members) >= 2:
2325
2307
  tight_clusters_count += 1
2326
2308
  for idx in cluster_members:
2327
2309
  processed[idx] = True
2328
-
2310
+
2329
2311
  return tight_clusters_count
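
A small usage illustration of the counting logic above: three features at nearly identical coordinates plus one isolated feature yield exactly one tight cluster, because all close members are marked as processed the first time their cluster is found. The coordinates below are invented; the real function reads them from study.consensus_df:

# Illustrative example (not the package's code): greedy tight-cluster counting.
coords = [(300.1612, 120.0), (300.1615, 120.1), (300.1619, 120.2), (310.2001, 240.0)]  # (mz, rt)
mz_tol, rt_tol = 0.04, 0.3
processed = [False] * len(coords)
tight_clusters = 0
for i, (mz_i, rt_i) in enumerate(coords):
    if processed[i]:
        continue
    members = [i] + [
        j for j, (mz_j, rt_j) in enumerate(coords)
        if j > i and not processed[j] and abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol
    ]
    if len(members) >= 2:
        tight_clusters += 1
        for j in members:
            processed[j] = True
print(tight_clusters)   # 1
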
2330
2312
 
2331
2313
 
@@ -2336,52 +2318,54 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
2336
2318
  """
2337
2319
  if len(study.consensus_df) == 0:
2338
2320
  return
2339
-
2321
+
2340
2322
  initial_count = len(study.consensus_df)
2341
- study.logger.debug(f"Post-processing chunked results: merging partial consensus features from {initial_count} features")
2342
-
2323
+ study.logger.debug(
2324
+ f"Post-processing chunked results: merging partial consensus features from {initial_count} features"
2325
+ )
2326
+
2343
2327
  # Convert to list of dictionaries for easier processing
2344
2328
  consensus_features = []
2345
2329
  for row in study.consensus_df.iter_rows(named=True):
2346
2330
  consensus_features.append({
2347
- 'consensus_uid': row['consensus_uid'],
2348
- 'rt': row['rt'],
2349
- 'mz': row['mz'],
2350
- 'number_samples': row.get('number_samples', 0),
2351
- 'inty_mean': row.get('inty_mean', 0.0)
2331
+ "consensus_uid": row["consensus_uid"],
2332
+ "rt": row["rt"],
2333
+ "mz": row["mz"],
2334
+ "number_samples": row.get("number_samples", 0),
2335
+ "inty_mean": row.get("inty_mean", 0.0),
2352
2336
  })
2353
-
2337
+
2354
2338
  # Use Union-Find to group features that should be merged
2355
2339
  class UnionFind:
2356
2340
  def __init__(self, n):
2357
2341
  self.parent = list(range(n))
2358
-
2342
+
2359
2343
  def find(self, x):
2360
2344
  if self.parent[x] != x:
2361
2345
  self.parent[x] = self.find(self.parent[x])
2362
2346
  return self.parent[x]
2363
-
2347
+
2364
2348
  def union(self, x, y):
2365
2349
  px, py = self.find(x), self.find(y)
2366
2350
  if px != py:
2367
2351
  self.parent[py] = px
2368
-
2352
+
2369
2353
  n_features = len(consensus_features)
2370
2354
  uf = UnionFind(n_features)
2371
-
2355
+
2372
2356
  # Find features that should be merged using original tolerances
2373
2357
  for i in range(n_features):
2374
2358
  for j in range(i + 1, n_features):
2375
2359
  feature_a = consensus_features[i]
2376
2360
  feature_b = consensus_features[j]
2377
-
2378
- rt_diff = abs(feature_a['rt'] - feature_b['rt'])
2379
- mz_diff = abs(feature_a['mz'] - feature_b['mz'])
2380
-
2361
+
2362
+ rt_diff = abs(feature_a["rt"] - feature_b["rt"])
2363
+ mz_diff = abs(feature_a["mz"] - feature_b["mz"])
2364
+
2381
2365
  # Merge if within tolerance
2382
2366
  if rt_diff <= rt_tol and mz_diff <= mz_tol:
2383
2367
  uf.union(i, j)
2384
-
2368
+
2385
2369
  # Group features by their root
2386
2370
  groups = {}
2387
2371
  for i, feature in enumerate(consensus_features):
@@ -2389,12 +2373,12 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
2389
2373
  if root not in groups:
2390
2374
  groups[root] = []
2391
2375
  groups[root].append(consensus_features[i])
2392
-
2376
+
2393
2377
  # Create merged features
2394
2378
  merged_features = []
2395
2379
  merged_mapping_data = []
2396
2380
  uids_to_remove = set()
2397
-
2381
+
2398
2382
  for group in groups.values():
2399
2383
  if len(group) < 2:
2400
2384
  # Single feature, keep as is
@@ -2402,70 +2386,77 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
2402
2386
  else:
2403
2387
  # Multiple features, merge them
2404
2388
  # Find best representative feature (highest sample count, then intensity)
2405
- best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
2406
-
2389
+ best_feature = max(group, key=lambda x: (x["number_samples"], x["inty_mean"]))
2390
+
2407
2391
  # Calculate merged properties
2408
- total_samples = sum(f['number_samples'] for f in group)
2409
- weighted_rt = sum(f['rt'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['rt']
2410
- weighted_mz = sum(f['mz'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['mz']
2411
- mean_intensity = sum(f['inty_mean'] * f['number_samples'] for f in group) / total_samples if total_samples > 0 else best_feature['inty_mean']
2412
-
2392
+ total_samples = sum(f["number_samples"] for f in group)
2393
+ weighted_rt = (
2394
+ sum(f["rt"] * f["number_samples"] for f in group) / total_samples
2395
+ if total_samples > 0
2396
+ else best_feature["rt"]
2397
+ )
2398
+ weighted_mz = (
2399
+ sum(f["mz"] * f["number_samples"] for f in group) / total_samples
2400
+ if total_samples > 0
2401
+ else best_feature["mz"]
2402
+ )
2403
+ mean_intensity = (
2404
+ sum(f["inty_mean"] * f["number_samples"] for f in group) / total_samples
2405
+ if total_samples > 0
2406
+ else best_feature["inty_mean"]
2407
+ )
2408
+
2413
2409
  # Keep the best feature's UID but update its properties
2414
2410
  merged_features.append({
2415
- 'consensus_uid': best_feature['consensus_uid'],
2416
- 'rt': weighted_rt,
2417
- 'mz': weighted_mz,
2418
- 'number_samples': total_samples,
2419
- 'inty_mean': mean_intensity
2411
+ "consensus_uid": best_feature["consensus_uid"],
2412
+ "rt": weighted_rt,
2413
+ "mz": weighted_mz,
2414
+ "number_samples": total_samples,
2415
+ "inty_mean": mean_intensity,
2420
2416
  })
2421
-
2417
+
2422
2418
  # Mark other features for removal
2423
2419
  for f in group:
2424
- if f['consensus_uid'] != best_feature['consensus_uid']:
2425
- uids_to_remove.add(f['consensus_uid'])
2426
-
2420
+ if f["consensus_uid"] != best_feature["consensus_uid"]:
2421
+ uids_to_remove.add(f["consensus_uid"])
2422
+
2427
2423
  if merged_features:
2428
2424
  study.logger.debug(f"Merging {len(merged_features)} groups of partial consensus features")
2429
-
2425
+
2430
2426
  # Update consensus_df with merged features
2431
2427
  for merged_feature in merged_features:
2432
2428
  study.consensus_df = study.consensus_df.with_columns([
2433
- pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2434
- .then(pl.lit(merged_feature['rt']))
2435
- .otherwise(pl.col('rt'))
2436
- .alias('rt'),
2437
-
2438
- pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2439
- .then(pl.lit(merged_feature['mz']))
2440
- .otherwise(pl.col('mz'))
2441
- .alias('mz'),
2442
-
2443
- pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2444
- .then(pl.lit(merged_feature['number_samples']))
2445
- .otherwise(pl.col('number_samples'))
2446
- .alias('number_samples'),
2447
-
2448
- pl.when(pl.col('consensus_uid') == merged_feature['consensus_uid'])
2449
- .then(pl.lit(merged_feature['inty_mean']))
2450
- .otherwise(pl.col('inty_mean'))
2451
- .alias('inty_mean')
2429
+ pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
2430
+ .then(pl.lit(merged_feature["rt"]))
2431
+ .otherwise(pl.col("rt"))
2432
+ .alias("rt"),
2433
+ pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
2434
+ .then(pl.lit(merged_feature["mz"]))
2435
+ .otherwise(pl.col("mz"))
2436
+ .alias("mz"),
2437
+ pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
2438
+ .then(pl.lit(merged_feature["number_samples"]))
2439
+ .otherwise(pl.col("number_samples"))
2440
+ .alias("number_samples"),
2441
+ pl.when(pl.col("consensus_uid") == merged_feature["consensus_uid"])
2442
+ .then(pl.lit(merged_feature["inty_mean"]))
2443
+ .otherwise(pl.col("inty_mean"))
2444
+ .alias("inty_mean"),
2452
2445
  ])
2453
-
2446
+
2454
2447
  # Remove duplicate features
2455
2448
  if uids_to_remove:
2456
- study.consensus_df = study.consensus_df.filter(
2457
- ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2458
- )
2459
-
2449
+ study.consensus_df = study.consensus_df.filter(~pl.col("consensus_uid").is_in(list(uids_to_remove)))
2450
+
2460
2451
  # Also update consensus_mapping_df - reassign mappings from removed UIDs
2461
- if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2452
+ if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
2462
2453
  study.consensus_mapping_df = study.consensus_mapping_df.with_columns(
2463
- pl.when(pl.col('consensus_uid').is_in(list(uids_to_remove)))
2454
+ pl.when(pl.col("consensus_uid").is_in(list(uids_to_remove)))
2464
2455
  .then(pl.lit(None)) # Will be handled by subsequent operations
2465
- .otherwise(pl.col('consensus_uid'))
2466
- .alias('consensus_uid')
2456
+ .otherwise(pl.col("consensus_uid"))
2457
+ .alias("consensus_uid")
2467
2458
  )
2468
-
2459
+
2469
2460
  final_count = len(study.consensus_df)
2470
2461
  study.logger.debug(f"Partial consensus merging: {initial_count} → {final_count} features")
2471
2462
 
@@ -2473,57 +2464,57 @@ def _merge_partial_consensus_features(study, rt_tol, mz_tol):
2473
2464
  def __consensus_cleanup(study, rt_tol, mz_tol):
2474
2465
  """
2475
2466
  Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
2476
-
2467
+
2477
2468
  This function:
2478
- 1. Identifies and merges consensus features that are likely over-segmented
2469
+ 1. Identifies and merges consensus features that are likely over-segmented
2479
2470
  (too many features in very tight m/z and RT windows)
2480
2471
  2. Performs deisotoping to remove +1 and +2 isotopic features
2481
2472
  """
2482
2473
  if len(study.consensus_df) == 0:
2483
2474
  return
2484
-
2475
+
2485
2476
  initial_count = len(study.consensus_df)
2486
-
2477
+
2487
2478
  # Only perform enhanced post-clustering if there are many features
2488
2479
  if initial_count < 50:
2489
2480
  return
2490
-
2481
+
2491
2482
  study.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2492
-
2483
+
2493
2484
  # Find tight clusters using spatial binning
2494
2485
  consensus_data = []
2495
2486
  for row in study.consensus_df.iter_rows(named=True):
2496
2487
  consensus_data.append({
2497
- 'consensus_uid': row['consensus_uid'],
2498
- 'mz': row['mz'],
2499
- 'rt': row['rt'],
2500
- 'inty_mean': row.get('inty_mean', 0),
2501
- 'number_samples': row.get('number_samples', 0)
2488
+ "consensus_uid": row["consensus_uid"],
2489
+ "mz": row["mz"],
2490
+ "rt": row["rt"],
2491
+ "inty_mean": row.get("inty_mean", 0),
2492
+ "number_samples": row.get("number_samples", 0),
2502
2493
  })
2503
-
2494
+
2504
2495
  # Parameters for tight clustering detection - more lenient for effective merging
2505
2496
  tight_rt_tol = min(0.5, rt_tol * 0.5) # More lenient RT tolerance (max 0.5s)
2506
2497
  tight_mz_tol = min(0.05, max(0.03, mz_tol * 2.0)) # More lenient m/z tolerance (min 30 mDa, max 50 mDa)
2507
-
2498
+
2508
2499
  # Build spatial index using smaller RT and m/z bins for better coverage
2509
2500
  rt_bin_size = tight_rt_tol / 4 # Smaller bins to ensure nearby features are captured
2510
2501
  mz_bin_size = tight_mz_tol / 4 # Smaller bins to ensure nearby features are captured
2511
-
2502
+
2512
2503
  bins = defaultdict(list)
2513
2504
  for feature in consensus_data:
2514
- rt_bin = int(feature['rt'] / rt_bin_size)
2515
- mz_bin = int(feature['mz'] / mz_bin_size)
2505
+ rt_bin = int(feature["rt"] / rt_bin_size)
2506
+ mz_bin = int(feature["mz"] / mz_bin_size)
2516
2507
  bins[(rt_bin, mz_bin)].append(feature)
2517
-
2508
+
2518
2509
  # Find clusters that need merging
2519
2510
  merge_groups = []
2520
2511
  processed_uids = set()
2521
-
2512
+
2522
2513
  for bin_key, bin_features in bins.items():
2523
2514
  # Check current bin and extended neighboring bins for complete cluster
2524
2515
  rt_bin, mz_bin = bin_key
2525
2516
  cluster_features = list(bin_features)
2526
-
2517
+
2527
2518
  # Check a larger neighborhood (±2 bins) to ensure we capture all nearby features
2528
2519
  for dr in [-2, -1, 0, 1, 2]:
2529
2520
  for dm in [-2, -1, 0, 1, 2]:
@@ -2532,192 +2523,194 @@ def __consensus_cleanup(study, rt_tol, mz_tol):
2532
2523
  neighbor_key = (rt_bin + dr, mz_bin + dm)
2533
2524
  if neighbor_key in bins:
2534
2525
  cluster_features.extend(bins[neighbor_key])
2535
-
2526
+
2536
2527
  # Remove duplicates
2537
2528
  seen_uids = set()
2538
2529
  unique_features = []
2539
2530
  for f in cluster_features:
2540
- if f['consensus_uid'] not in seen_uids:
2531
+ if f["consensus_uid"] not in seen_uids:
2541
2532
  unique_features.append(f)
2542
- seen_uids.add(f['consensus_uid'])
2543
-
2533
+ seen_uids.add(f["consensus_uid"])
2534
+
2544
2535
  # Only proceed if we have at least 2 features after including neighbors
2545
2536
  if len(unique_features) < 2:
2546
2537
  continue
2547
-
2538
+
2548
2539
  # Calculate cluster bounds
2549
- mzs = [f['mz'] for f in unique_features]
2550
- rts = [f['rt'] for f in unique_features]
2551
-
2540
+ mzs = [f["mz"] for f in unique_features]
2541
+ rts = [f["rt"] for f in unique_features]
2542
+
2552
2543
  mz_spread = max(mzs) - min(mzs)
2553
2544
  rt_spread = max(rts) - min(rts)
2554
-
2545
+
2555
2546
  # Only merge if features are tightly clustered
2556
2547
  if mz_spread <= tight_mz_tol and rt_spread <= tight_rt_tol:
2557
2548
  # Filter out features that were already processed
2558
- uids_in_cluster = {f['consensus_uid'] for f in unique_features}
2559
- unprocessed_features = [f for f in unique_features if f['consensus_uid'] not in processed_uids]
2560
-
2549
+ uids_in_cluster = {f["consensus_uid"] for f in unique_features}
2550
+ unprocessed_features = [f for f in unique_features if f["consensus_uid"] not in processed_uids]
2551
+
2561
2552
  # Only proceed if we have at least 2 unprocessed features that still form a tight cluster
2562
2553
  if len(unprocessed_features) >= 2:
2563
2554
  # Recalculate bounds for unprocessed features only
2564
- unprocessed_mzs = [f['mz'] for f in unprocessed_features]
2565
- unprocessed_rts = [f['rt'] for f in unprocessed_features]
2566
-
2555
+ unprocessed_mzs = [f["mz"] for f in unprocessed_features]
2556
+ unprocessed_rts = [f["rt"] for f in unprocessed_features]
2557
+
2567
2558
  unprocessed_mz_spread = max(unprocessed_mzs) - min(unprocessed_mzs)
2568
2559
  unprocessed_rt_spread = max(unprocessed_rts) - min(unprocessed_rts)
2569
-
2560
+
2570
2561
  # Check if unprocessed features still meet tight clustering criteria
2571
2562
  if unprocessed_mz_spread <= tight_mz_tol and unprocessed_rt_spread <= tight_rt_tol:
2572
2563
  merge_groups.append(unprocessed_features)
2573
- processed_uids.update({f['consensus_uid'] for f in unprocessed_features})
2574
-
2564
+ processed_uids.update({f["consensus_uid"] for f in unprocessed_features})
2565
+
2575
2566
  if not merge_groups:
2576
2567
  return
2577
-
2568
+
2578
2569
  study.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2579
-
2570
+
2580
2571
  # Merge clusters by keeping the most representative feature
2581
2572
  uids_to_remove = set()
2582
-
2573
+
2583
2574
  for group in merge_groups:
2584
2575
  if len(group) < 2:
2585
2576
  continue
2586
-
2577
+
2587
2578
  # Find the most representative feature (highest intensity and sample count)
2588
- best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
2589
-
2579
+ best_feature = max(group, key=lambda x: (x["number_samples"], x["inty_mean"]))
2580
+
2590
2581
  # Mark other features for removal
2591
2582
  for f in group:
2592
- if f['consensus_uid'] != best_feature['consensus_uid']:
2593
- uids_to_remove.add(f['consensus_uid'])
2594
-
2583
+ if f["consensus_uid"] != best_feature["consensus_uid"]:
2584
+ uids_to_remove.add(f["consensus_uid"])
2585
+
2595
2586
  if uids_to_remove:
2596
2587
  # Remove merged features from consensus_df
2597
- study.consensus_df = study.consensus_df.filter(
2598
- ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2599
- )
2600
-
2588
+ study.consensus_df = study.consensus_df.filter(~pl.col("consensus_uid").is_in(list(uids_to_remove)))
2589
+
2601
2590
  # Also update consensus_mapping_df if it exists
2602
- if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2591
+ if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
2603
2592
  study.consensus_mapping_df = study.consensus_mapping_df.filter(
2604
- ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2593
+ ~pl.col("consensus_uid").is_in(list(uids_to_remove))
2605
2594
  )
2606
-
2595
+
2607
2596
  final_count = len(study.consensus_df)
2608
2597
  reduction = initial_count - final_count
2609
2598
  reduction_pct = (reduction / initial_count) * 100
2610
-
2599
+
2611
2600
  if reduction > 0:
2612
- study.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
2613
-
2601
+ study.logger.debug(
2602
+ f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)"
2603
+ )
2604
+
2614
2605
  # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
2615
2606
  pre_deisotoping_count = len(study.consensus_df)
2616
2607
  isotope_uids_to_remove = set()
2617
-
2608
+
2618
2609
  # Use strict tolerances for deisotoping (same as declustering)
2619
2610
  deisotope_rt_tol = min(0.3, rt_tol * 0.3) # Strict RT tolerance for isotope detection
2620
2611
  deisotope_mz_tol = min(0.01, mz_tol * 0.5) # Strict m/z tolerance for isotope detection
2621
-
2612
+
2622
2613
  # Get current consensus data for isotope detection
2623
2614
  current_consensus_data = []
2624
2615
  for row in study.consensus_df.iter_rows(named=True):
2625
2616
  current_consensus_data.append({
2626
- 'consensus_uid': row['consensus_uid'],
2627
- 'mz': row['mz'],
2628
- 'rt': row['rt'],
2629
- 'number_samples': row.get('number_samples', 0)
2617
+ "consensus_uid": row["consensus_uid"],
2618
+ "mz": row["mz"],
2619
+ "rt": row["rt"],
2620
+ "number_samples": row.get("number_samples", 0),
2630
2621
  })
2631
-
2622
+
2632
2623
  # Sort by m/z for efficient searching
2633
- current_consensus_data.sort(key=lambda x: x['mz'])
2624
+ current_consensus_data.sort(key=lambda x: x["mz"])
2634
2625
  n_current = len(current_consensus_data)
2635
-
2626
+
2636
2627
  for i in range(n_current):
2637
2628
  feature_i = current_consensus_data[i]
2638
-
2629
+
2639
2630
  # Skip if already marked for removal
2640
- if feature_i['consensus_uid'] in isotope_uids_to_remove:
2631
+ if feature_i["consensus_uid"] in isotope_uids_to_remove:
2641
2632
  continue
2642
-
2633
+
2643
2634
  # Look for potential +1 and +2 isotopes (higher m/z)
2644
2635
  for j in range(i + 1, n_current):
2645
2636
  feature_j = current_consensus_data[j]
2646
-
2637
+
2647
2638
  # Skip if already marked for removal
2648
- if feature_j['consensus_uid'] in isotope_uids_to_remove:
2639
+ if feature_j["consensus_uid"] in isotope_uids_to_remove:
2649
2640
  continue
2650
-
2651
- mz_diff = feature_j['mz'] - feature_i['mz']
2652
-
2641
+
2642
+ mz_diff = feature_j["mz"] - feature_i["mz"]
2643
+
2653
2644
  # Break if m/z difference is too large (features are sorted by m/z)
2654
2645
  if mz_diff > 2.1: # Beyond +2 isotope range
2655
2646
  break
2656
-
2657
- rt_diff = abs(feature_j['rt'] - feature_i['rt'])
2658
-
2647
+
2648
+ rt_diff = abs(feature_j["rt"] - feature_i["rt"])
2649
+
2659
2650
  # Check for +1 isotope (C13 mass difference ≈ 1.003354 Da)
2660
2651
  if (0.995 <= mz_diff <= 1.011) and rt_diff <= deisotope_rt_tol:
2661
2652
  # Potential +1 isotope - should have fewer samples than main feature
2662
- if feature_j['number_samples'] < feature_i['number_samples']:
2663
- isotope_uids_to_remove.add(feature_j['consensus_uid'])
2653
+ if feature_j["number_samples"] < feature_i["number_samples"]:
2654
+ isotope_uids_to_remove.add(feature_j["consensus_uid"])
2664
2655
  continue
2665
-
2666
- # Check for +2 isotope (2 * C13 mass difference ≈ 2.006708 Da)
2656
+
2657
+ # Check for +2 isotope (2 * C13 mass difference ≈ 2.006708 Da)
2667
2658
  if (1.995 <= mz_diff <= 2.018) and rt_diff <= deisotope_rt_tol:
2668
2659
  # Potential +2 isotope - should have fewer samples than main feature
2669
- if feature_j['number_samples'] < feature_i['number_samples']:
2670
- isotope_uids_to_remove.add(feature_j['consensus_uid'])
2660
+ if feature_j["number_samples"] < feature_i["number_samples"]:
2661
+ isotope_uids_to_remove.add(feature_j["consensus_uid"])
2671
2662
  continue
2672
-
2663
+
2673
2664
  # Remove isotopic features
2674
2665
  if isotope_uids_to_remove:
2675
- study.consensus_df = study.consensus_df.filter(
2676
- ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2677
- )
2678
-
2666
+ study.consensus_df = study.consensus_df.filter(~pl.col("consensus_uid").is_in(list(isotope_uids_to_remove)))
2667
+
2679
2668
  # Also update consensus_mapping_df if it exists
2680
- if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2669
+ if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
2681
2670
  study.consensus_mapping_df = study.consensus_mapping_df.filter(
2682
- ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2671
+ ~pl.col("consensus_uid").is_in(list(isotope_uids_to_remove))
2683
2672
  )
2684
-
2673
+
2685
2674
  post_deisotoping_count = len(study.consensus_df)
2686
2675
  isotope_reduction = pre_deisotoping_count - post_deisotoping_count
2687
-
2676
+
2688
2677
  if isotope_reduction > 0:
2689
- study.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
2690
-
2678
+ study.logger.debug(
2679
+ f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)"
2680
+ )
2681
+
2691
2682
  # Final summary
2692
2683
  final_count = len(study.consensus_df)
2693
2684
  total_reduction = initial_count - final_count
2694
2685
  if total_reduction > 0:
2695
2686
  total_reduction_pct = (total_reduction / initial_count) * 100
2696
- study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
2687
+ study.logger.debug(
2688
+ f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)"
2689
+ )
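The deisotoping pass above keys on fixed m/z windows around the C13 spacing (about 1.003354 Da per extra 13C). A small sketch of that window test with hypothetical differences, assuming singly charged features as the code above does:

    # Sketch: classify an m/z difference as a +1 or +2 isotope candidate (hypothetical values).
    def isotope_window(mz_diff, rt_diff, rt_tol=0.3):
        if rt_diff > rt_tol:
            return None
        if 0.995 <= mz_diff <= 1.011:   # brackets one C13 spacing (~1.0034 Da)
            return "+1"
        if 1.995 <= mz_diff <= 2.018:   # brackets two C13 spacings (~2.0067 Da)
            return "+2"
        return None

    print(isotope_window(1.0034, 0.1))  # +1
    print(isotope_window(2.0067, 0.1))  # +2
    print(isotope_window(0.5000, 0.1))  # None

In the function above, the putative isotope is only removed when it is seen in fewer samples than the lighter feature.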
2697
2690
 
2698
2691
 
2699
2692
  def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2700
2693
  """
2701
2694
  Identify coeluting consensus features by characteristic mass shifts between adducts
2702
2695
  and update their adduct information accordingly.
2703
-
2696
+
2704
2697
  This function:
2705
2698
  1. Generates a catalogue of mass shifts between adducts using _get_adducts()
2706
2699
  2. Searches for pairs of consensus features with same RT (within strict RT tolerance)
2707
2700
  and matching m/z shifts (±0.005 Da)
2708
2701
  3. Updates adduct_* columns based on identified relationships
2709
-
2702
+
2710
2703
  Args:
2711
2704
  rt_tol: RT tolerance in seconds (strict tolerance for coelution detection)
2712
2705
  cached_adducts_df: Pre-computed adducts DataFrame for performance
2713
2706
  """
2714
2707
  import polars as pl
2715
-
2708
+
2716
2709
  # Check if consensus_df exists and has features
2717
2710
  if len(study.consensus_df) == 0:
2718
2711
  study.logger.debug("No consensus features for adduct identification by mass shift")
2719
2712
  return
2720
-
2713
+
2721
2714
  # Get adducts DataFrame if not provided
2722
2715
  if cached_adducts_df is None or cached_adducts_df.is_empty():
2723
2716
  try:
@@ -2726,145 +2719,148 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2726
2719
  except Exception as e:
2727
2720
  study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
2728
2721
  return
2729
-
2722
+
2730
2723
  if cached_adducts_df.is_empty():
2731
2724
  study.logger.debug("No adducts available for mass shift identification")
2732
2725
  return
2733
-
2726
+
2734
2727
  # Build catalogue of mass shifts between adducts
2735
2728
  mass_shift_catalog = {}
2736
2729
  adduct_info = {}
2737
-
2730
+
2738
2731
  # Extract adduct information
2739
2732
  adducts_data = cached_adducts_df.select(["name", "charge", "mass_shift"]).to_dicts()
2740
-
2733
+
2741
2734
  for adduct in adducts_data:
2742
2735
  name = adduct["name"]
2743
- charge = adduct["charge"]
2736
+ charge = adduct["charge"]
2744
2737
  mass_shift = adduct["mass_shift"]
2745
-
2746
- adduct_info[name] = {
2747
- "charge": charge,
2748
- "mass_shift": mass_shift
2749
- }
2750
-
2738
+
2739
+ adduct_info[name] = {"charge": charge, "mass_shift": mass_shift}
2740
+
2751
2741
  # Generate pairwise mass differences for catalog
2752
2742
  for adduct1 in adducts_data:
2753
2743
  for adduct2 in adducts_data:
2754
2744
  if adduct1["name"] == adduct2["name"]:
2755
2745
  continue
2756
-
2746
+
2757
2747
  name1, charge1, ms1 = adduct1["name"], adduct1["charge"], adduct1["mass_shift"]
2758
2748
  name2, charge2, ms2 = adduct2["name"], adduct2["charge"], adduct2["mass_shift"]
2759
-
2749
+
2760
2750
  # Only consider shifts between adducts that have the same charge (same ionization state)
2761
2751
  if charge1 != charge2:
2762
2752
  continue
2763
-
2753
+
2764
2754
  # Calculate expected m/z difference
2765
2755
  if charge1 != 0 and charge2 != 0:
2766
2756
  mz_diff = (ms1 - ms2) / abs(charge1)
2767
2757
  else:
2768
2758
  continue # Skip neutral adducts for this analysis
2769
-
2759
+
2770
2760
  # Store the mass shift relationship
2771
2761
  shift_key = round(mz_diff, 4) # Round to 4 decimal places for matching
2772
2762
  if shift_key not in mass_shift_catalog:
2773
2763
  mass_shift_catalog[shift_key] = []
2774
2764
  mass_shift_catalog[shift_key].append({
2775
2765
  "from_adduct": name1,
2776
- "to_adduct": name2,
2766
+ "to_adduct": name2,
2777
2767
  "mz_shift": mz_diff,
2778
2768
  "from_charge": charge1,
2779
- "to_charge": charge2
2769
+ "to_charge": charge2,
2780
2770
  })
2781
-
2771
+
2782
2772
  study.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
2783
-
2773
+
2784
2774
  # Get consensus features data
2785
2775
  consensus_data = []
2786
2776
  for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
2787
2777
  consensus_data.append({
2788
2778
  "index": i,
2789
2779
  "consensus_uid": row["consensus_uid"],
2790
- "rt": row["rt"],
2780
+ "rt": row["rt"],
2791
2781
  "mz": row["mz"],
2792
2782
  "adduct_top": row.get("adduct_top", "[M+?]1+"),
2793
2783
  "adduct_charge_top": row.get("adduct_charge_top", 1),
2794
2784
  "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
2795
2785
  "adduct_mass_shift_top": row.get("adduct_mass_shift_top"),
2796
- "inty_mean": row.get("inty_mean", 0)
2786
+ "inty_mean": row.get("inty_mean", 0),
2797
2787
  })
2798
-
2788
+
2799
2789
  # Sort by RT for efficient searching
2800
2790
  consensus_data.sort(key=lambda x: x["rt"])
2801
2791
  n_features = len(consensus_data)
2802
-
2792
+
2803
2793
  # Track updates to make
2804
2794
  adduct_updates = {} # consensus_uid -> new_adduct_info
2805
-
2795
+
2806
2796
  # Strict RT tolerance for coelution (rt_tol is in seconds; no unit conversion)
2807
2797
  rt_tol_strict = rt_tol * 0.5 # Use half the merge tolerance for strict coelution
2808
2798
  mz_tol_shift = 0.005 # ±5 mDa tolerance for mass shift matching
2809
-
2799
+
2810
2800
  # Search for coeluting pairs with characteristic mass shifts
2811
2801
  updated_count = 0
2812
-
2802
+
2813
2803
  for i in range(n_features):
2814
2804
  feature1 = consensus_data[i]
2815
2805
  rt1 = feature1["rt"]
2816
2806
  mz1 = feature1["mz"]
2817
2807
  adduct1 = feature1["adduct_top"]
2818
-
2808
+
2819
2809
  # Conservative approach: Don't skip features here - let algorithm find pairs first
2820
2810
  # We'll check for inappropriate assignments later in the pair processing logic
2821
-
2811
+
2822
2812
  # Search for coeluting features within strict RT tolerance
2823
2813
  for j in range(i + 1, n_features):
2824
2814
  feature2 = consensus_data[j]
2825
2815
  rt2 = feature2["rt"]
2826
-
2816
+
2827
2817
  # Break if RT difference exceeds tolerance (sorted by RT)
2828
2818
  if abs(rt2 - rt1) > rt_tol_strict:
2829
2819
  break
2830
-
2820
+
2831
2821
  mz2 = feature2["mz"]
2832
2822
  adduct2 = feature2["adduct_top"]
2833
-
2823
+
2834
2824
  # Conservative approach: Don't skip feature2 here either - process all potential pairs
2835
-
2825
+
2836
2826
  # Calculate observed m/z difference
2837
2827
  mz_diff = mz2 - mz1
2838
2828
  shift_key = round(mz_diff, 4)
2839
-
2829
+
2840
2830
  # Check if this mass shift matches any known adduct relationships
2841
2831
  for catalog_shift, relationships in mass_shift_catalog.items():
2842
2832
  if abs(shift_key - catalog_shift) <= mz_tol_shift:
2843
2833
  # Found a matching mass shift!
2844
-
2834
+
2845
2835
  # Choose the best relationship based on common adducts
2846
2836
  best_rel = None
2847
2837
  best_score = 0
2848
-
2838
+
2849
2839
  for rel in relationships:
2850
2840
  # Prioritize common adducts ([M+H]+, [M+Na]+, [M+NH4]+)
2851
2841
  score = 0
2852
- if "H]" in rel["from_adduct"]: score += 3
2853
- if "Na]" in rel["from_adduct"]: score += 2
2854
- if "NH4]" in rel["from_adduct"]: score += 2
2855
- if "H]" in rel["to_adduct"]: score += 3
2856
- if "Na]" in rel["to_adduct"]: score += 2
2857
- if "NH4]" in rel["to_adduct"]: score += 2
2858
-
2842
+ if "H]" in rel["from_adduct"]:
2843
+ score += 3
2844
+ if "Na]" in rel["from_adduct"]:
2845
+ score += 2
2846
+ if "NH4]" in rel["from_adduct"]:
2847
+ score += 2
2848
+ if "H]" in rel["to_adduct"]:
2849
+ score += 3
2850
+ if "Na]" in rel["to_adduct"]:
2851
+ score += 2
2852
+ if "NH4]" in rel["to_adduct"]:
2853
+ score += 2
2854
+
2859
2855
  if score > best_score:
2860
2856
  best_score = score
2861
2857
  best_rel = rel
2862
-
2858
+
2863
2859
  if best_rel:
2864
2860
  # Determine which feature gets which adduct based on intensity
2865
2861
  inty1 = feature1["inty_mean"]
2866
- inty2 = feature2["inty_mean"]
2867
-
2862
+ inty2 = feature2["inty_mean"]
2863
+
2868
2864
  # Assign higher intensity to [M+H]+ if possible
2869
2865
  if "H]" in best_rel["from_adduct"] and inty1 >= inty2:
2870
2866
  # Feature 1 = from_adduct, Feature 2 = to_adduct
@@ -2881,107 +2877,111 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2881
2877
  else:
2882
2878
  # Assignment based on mass shift direction
2883
2879
  # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
2884
- # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
2880
+ # If catalog_shift > 0: from_adduct has higher mass shift than to_adduct
2885
2881
  # If catalog_shift < 0: from_adduct has lower mass shift than to_adduct
2886
2882
  # observed mz_diff = mz2 - mz1 (always positive for mz2 > mz1)
2887
- #
2883
+ #
2888
2884
  # CRITICAL FIX: Correct assignment logic
2889
- # When mz_diff matches positive catalog_shift:
2885
+ # When mz_diff matches positive catalog_shift:
2890
2886
  # - from_adduct is the heavier adduct (higher mass shift)
2891
- # - to_adduct is the lighter adduct (lower mass shift)
2887
+ # - to_adduct is the lighter adduct (lower mass shift)
2892
2888
  # - Higher m/z feature should get the heavier adduct (from_adduct)
2893
2889
  # - Lower m/z feature should get the lighter adduct (to_adduct)
2894
-
2890
+
2895
2891
  if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
2896
2892
  # mz_diff matches catalog_shift direction
2897
2893
  if catalog_shift > 0:
2898
2894
  # from_adduct is heavier, to_adduct is lighter
2899
2895
  from_feature = feature2 # Higher m/z gets heavier adduct
2900
- to_feature = feature1 # Lower m/z gets lighter adduct
2896
+ to_feature = feature1 # Lower m/z gets lighter adduct
2901
2897
  from_adduct_name = best_rel["from_adduct"] # Heavier adduct
2902
- to_adduct_name = best_rel["to_adduct"] # Lighter adduct
2898
+ to_adduct_name = best_rel["to_adduct"] # Lighter adduct
2903
2899
  else:
2904
2900
  # from_adduct is lighter, to_adduct is heavier
2905
2901
  from_feature = feature1 # Lower m/z gets lighter adduct
2906
- to_feature = feature2 # Higher m/z gets heavier adduct
2907
- from_adduct_name = best_rel["from_adduct"] # Lighter adduct
2908
- to_adduct_name = best_rel["to_adduct"] # Heavier adduct
2902
+ to_feature = feature2 # Higher m/z gets heavier adduct
2903
+ from_adduct_name = best_rel["from_adduct"] # Lighter adduct
2904
+ to_adduct_name = best_rel["to_adduct"] # Heavier adduct
2909
2905
  else:
2910
2906
  # mz_diff matches reverse direction of catalog_shift
2911
2907
  if catalog_shift > 0:
2912
2908
  # Reverse: from_adduct becomes lighter, to_adduct becomes heavier
2913
2909
  from_feature = feature1 # Lower m/z gets lighter adduct
2914
- to_feature = feature2 # Higher m/z gets heavier adduct
2915
- from_adduct_name = best_rel["to_adduct"] # Now lighter adduct
2916
- to_adduct_name = best_rel["from_adduct"] # Now heavier adduct
2910
+ to_feature = feature2 # Higher m/z gets heavier adduct
2911
+ from_adduct_name = best_rel["to_adduct"] # Now lighter adduct
2912
+ to_adduct_name = best_rel["from_adduct"] # Now heavier adduct
2917
2913
  else:
2918
2914
  # Reverse: from_adduct becomes heavier, to_adduct becomes lighter
2919
2915
  from_feature = feature2 # Higher m/z gets heavier adduct
2920
- to_feature = feature1 # Lower m/z gets lighter adduct
2921
- from_adduct_name = best_rel["to_adduct"] # Now heavier adduct
2922
- to_adduct_name = best_rel["from_adduct"] # Now lighter adduct
2923
-
2916
+ to_feature = feature1 # Lower m/z gets lighter adduct
2917
+ from_adduct_name = best_rel["to_adduct"] # Now heavier adduct
2918
+ to_adduct_name = best_rel["from_adduct"] # Now lighter adduct
2919
+
2924
2920
  # Get adduct details from catalog
2925
2921
  from_adduct_info = adduct_info.get(from_adduct_name, {})
2926
2922
  to_adduct_info = adduct_info.get(to_adduct_name, {})
2927
-
2923
+
2928
2924
  # Calculate neutral masses
2929
2925
  from_charge = from_adduct_info.get("charge", 1)
2930
2926
  to_charge = to_adduct_info.get("charge", 1)
2931
2927
  from_mass_shift = from_adduct_info.get("mass_shift", 1.007825)
2932
2928
  to_mass_shift = to_adduct_info.get("mass_shift", 1.007825)
2933
-
2929
+
2934
2930
  from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
2935
2931
  to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
2936
-
2932
+
2937
2933
  # Smart conservative check: prevent inappropriate assignments to isolated features
2938
2934
  # Check if both features are isolated (single-member groups) with [M+?]1+ assignments
2939
2935
  def is_isolated_unknown_feature(feature):
2940
2936
  """Check if a feature is isolated with unknown adduct"""
2941
2937
  if not feature["adduct_top"] or "[M+?]" not in feature["adduct_top"]:
2942
2938
  return False # Not unknown, safe to process
2943
-
2939
+
2944
2940
  # Check group size
2945
2941
  try:
2946
- feature_row = study.consensus_df.filter(study.consensus_df["consensus_uid"] == feature["consensus_uid"])
2942
+ feature_row = study.consensus_df.filter(
2943
+ study.consensus_df["consensus_uid"] == feature["consensus_uid"]
2944
+ )
2947
2945
  if len(feature_row) > 0:
2948
2946
  adduct_group = feature_row["adduct_group"].iloc[0]
2949
2947
  if adduct_group > 0:
2950
- group_members = study.consensus_df.filter(study.consensus_df["adduct_group"] == adduct_group)
2948
+ group_members = study.consensus_df.filter(
2949
+ study.consensus_df["adduct_group"] == adduct_group
2950
+ )
2951
2951
  return len(group_members) <= 1 # Isolated if group size <= 1
2952
2952
  except Exception:
2953
2953
  pass
2954
2954
  return True # Default to isolated if can't determine
2955
-
2955
+
2956
2956
  from_isolated = is_isolated_unknown_feature(from_feature)
2957
2957
  to_isolated = is_isolated_unknown_feature(to_feature)
2958
-
2958
+
2959
2959
  # Only skip assignment if BOTH features are isolated AND would get the SAME adduct
2960
2960
  # (This prevents inappropriate duplicate assignments to isolated features)
2961
- skip_assignment = (from_isolated and to_isolated and from_adduct_name == to_adduct_name)
2962
-
2961
+ skip_assignment = from_isolated and to_isolated and from_adduct_name == to_adduct_name
2962
+
2963
2963
  if skip_assignment:
2964
2964
  study.logger.debug(
2965
2965
  f"Skipping inappropriate assignment: both isolated features would get {from_adduct_name} "
2966
2966
  f"(UIDs {from_feature['consensus_uid']}, {to_feature['consensus_uid']})"
2967
2967
  )
2968
2968
  continue # Skip this pair, continue to next relationship
2969
-
2969
+
2970
2970
  # Store updates (legitimate pair or at least one feature already has specific adduct)
2971
2971
  adduct_updates[from_feature["consensus_uid"]] = {
2972
2972
  "adduct_top": from_adduct_name,
2973
2973
  "adduct_charge_top": from_charge,
2974
2974
  "adduct_mass_neutral_top": from_neutral_mass,
2975
- "adduct_mass_shift_top": from_mass_shift
2975
+ "adduct_mass_shift_top": from_mass_shift,
2976
2976
  }
2977
-
2977
+
2978
2978
  adduct_updates[to_feature["consensus_uid"]] = {
2979
2979
  "adduct_top": to_adduct_name,
2980
2980
  "adduct_charge_top": to_charge,
2981
2981
  "adduct_mass_neutral_top": to_neutral_mass,
2982
- "adduct_mass_shift_top": to_mass_shift
2982
+ "adduct_mass_shift_top": to_mass_shift,
2983
2983
  }
2984
-
2984
+
2985
2985
  updated_count += 2
2986
2986
  study.logger.debug(
2987
2987
  f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
@@ -2989,17 +2989,17 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2989
2989
  f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
2990
2990
  )
2991
2991
  break # Found match, no need to check other relationships
2992
-
2992
+
2993
2993
  # Apply updates to consensus_df
2994
2994
  if adduct_updates:
2995
2995
  # Prepare update data
2996
2996
  consensus_uids = study.consensus_df["consensus_uid"].to_list()
2997
-
2997
+
2998
2998
  new_adduct_top = []
2999
2999
  new_adduct_charge_top = []
3000
3000
  new_adduct_mass_neutral_top = []
3001
3001
  new_adduct_mass_shift_top = []
3002
-
3002
+
3003
3003
  for uid in consensus_uids:
3004
3004
  if uid in adduct_updates:
3005
3005
  update = adduct_updates[uid]
@@ -3015,13 +3015,13 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
3015
3015
  new_adduct_charge_top.append(row.get("adduct_charge_top"))
3016
3016
  new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
3017
3017
  new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
3018
-
3018
+
3019
3019
  # Update the DataFrame
3020
3020
  study.consensus_df = study.consensus_df.with_columns([
3021
3021
  pl.Series("adduct_top", new_adduct_top),
3022
- pl.Series("adduct_charge_top", new_adduct_charge_top),
3022
+ pl.Series("adduct_charge_top", new_adduct_charge_top),
3023
3023
  pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
3024
- pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
3024
+ pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top),
3025
3025
  ])
3026
3026
  study.logger.info(f"Adduct information updated for {updated_count} consensus features.")
3027
3027
  else:
@@ -3031,12 +3031,12 @@ def __identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
3031
3031
  def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
3032
3032
  """Complete the merge process with final calculations and cleanup."""
3033
3033
  import polars as pl
3034
-
3034
+
3035
3035
  # Check if consensus_df is empty or missing required columns
3036
3036
  if len(study.consensus_df) == 0 or "number_samples" not in study.consensus_df.columns:
3037
3037
  study.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
3038
3038
  return
3039
-
3039
+
3040
3040
  # Validate min_samples parameter
3041
3041
  if min_samples is None:
3042
3042
  min_samples = 1
@@ -3059,7 +3059,7 @@ def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
3059
3059
  study.logger.debug(
3060
3060
  f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
3061
3061
  )
3062
-
3062
+
3063
3063
  # Filter out consensus mapping with less than min_samples features
3064
3064
  study.consensus_mapping_df = study.consensus_mapping_df.filter(
3065
3065
  pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
@@ -3068,15 +3068,11 @@ def __finalize_merge(study, link_ms2, extract_ms1, min_samples):
3068
3068
  # Calculate the completeness of the consensus map
3069
3069
  # Log completion with tight cluster metrics
3070
3070
  if len(study.consensus_df) > 0 and len(study.samples_df) > 0:
3071
- c = (
3072
- len(study.consensus_mapping_df)
3073
- / len(study.consensus_df)
3074
- / len(study.samples_df)
3075
- )
3076
-
3071
+ c = len(study.consensus_mapping_df) / len(study.consensus_df) / len(study.samples_df)
3072
+
3077
3073
  # Count tight clusters with specified thresholds
3078
- tight_clusters = _count_tight_clusters(study,mz_tol=0.04, rt_tol=0.3)
3079
-
3074
+ tight_clusters = _count_tight_clusters(study, mz_tol=0.04, rt_tol=0.3)
3075
+
3080
3076
  study.logger.success(
3081
3077
  f"Merging completed. Consensus features: {len(study.consensus_df)}. "
3082
3078
  f"Completeness: {c:.2f}. Tight clusters: {tight_clusters}.",
@@ -3100,27 +3096,42 @@ def __merge_feature_lookup(study_obj, features_df):
3100
3096
  """
3101
3097
  study_obj.logger.debug("Creating optimized feature lookup...")
3102
3098
  start_time = time.time()
3103
-
3099
+
3104
3100
  # Use Polars select for faster conversion
3105
3101
  feature_columns = [
3106
- "feature_uid", "sample_uid", "rt", "mz", "rt_start", "rt_end", "rt_delta",
3107
- "mz_start", "mz_end", "inty", "chrom_coherence", "chrom_prominence",
3108
- "chrom_prominence_scaled", "chrom_height_scaled", "iso", "charge",
3109
- "ms2_scans", "adduct", "adduct_mass"
3102
+ "feature_uid",
3103
+ "sample_uid",
3104
+ "rt",
3105
+ "mz",
3106
+ "rt_start",
3107
+ "rt_end",
3108
+ "rt_delta",
3109
+ "mz_start",
3110
+ "mz_end",
3111
+ "inty",
3112
+ "chrom_coherence",
3113
+ "chrom_prominence",
3114
+ "chrom_prominence_scaled",
3115
+ "chrom_height_scaled",
3116
+ "iso",
3117
+ "charge",
3118
+ "ms2_scans",
3119
+ "adduct",
3120
+ "adduct_mass",
3110
3121
  ]
3111
-
3122
+
3112
3123
  # Filter to only existing columns
3113
3124
  existing_columns = [col for col in feature_columns if col in features_df.columns]
3114
-
3125
+
3115
3126
  # Convert to dictionary more efficiently
3116
3127
  selected_df = features_df.select(existing_columns)
3117
-
3128
+
3118
3129
  features_lookup = {}
3119
3130
  for row in selected_df.iter_rows(named=True):
3120
3131
  feature_uid = row["feature_uid"]
3121
3132
  # Keep feature_uid in the dictionary for chunked merge compatibility
3122
3133
  features_lookup[feature_uid] = {k: v for k, v in row.items()}
3123
-
3134
+
3124
3135
  lookup_time = time.time() - start_time
3125
3136
  if len(features_lookup) > 50000:
3126
3137
  study_obj.logger.debug(f"Feature lookup created in {lookup_time:.2f}s for {len(features_lookup)} features")
@@ -3130,188 +3141,187 @@ def __merge_feature_lookup(study_obj, features_df):
3130
3141
  def _get_features_matrix(study, consensus_data, quant_col="inty"):
3131
3142
  """
3132
3143
  Create a local intensity matrix from features_df for correlation calculations.
3133
-
3144
+
3134
3145
  Args:
3135
3146
  study: Study object with features_df and samples_df
3136
3147
  consensus_data: List of consensus feature dictionaries
3137
3148
  quant_col: Column name to use for quantification (default: "inty")
3138
-
3149
+
3139
3150
  Returns:
3140
3151
  pandas.DataFrame: Matrix with consensus_uid as index, sample names as columns
3141
3152
  """
3142
3153
  import pandas as pd
3143
3154
  import numpy as np
3144
-
3155
+
3145
3156
  # Get all sample names
3146
3157
  sample_names = study.samples_df["sample_name"].to_list()
3147
3158
  consensus_uids = [int(f["consensus_uid"]) for f in consensus_data]
3148
-
3159
+
3149
3160
  # Initialize matrix with zeros
3150
3161
  matrix_data = pd.DataFrame(
3151
- index=pd.Index(consensus_uids, name="consensus_uid"),
3152
- columns=sample_names,
3153
- data=0.0,
3154
- dtype=float
3162
+ index=pd.Index(consensus_uids, name="consensus_uid"), columns=sample_names, data=0.0, dtype=float
3155
3163
  )
3156
-
3164
+
3157
3165
  study.logger.debug(f"Building local features matrix: {len(consensus_uids)} features x {len(sample_names)} samples")
3158
-
3166
+
3159
3167
  # Fill matrix with actual intensity values
3160
3168
  features_df_pandas = study.features_df.to_pandas()
3161
3169
  samples_df_pandas = study.samples_df.to_pandas()
3162
3170
  consensus_mapping_pandas = study.consensus_mapping_df.to_pandas()
3163
-
3171
+
3164
3172
  # Create sample_uid to sample_name mapping
3165
3173
  uid_to_name = dict(zip(samples_df_pandas["sample_uid"], samples_df_pandas["sample_name"]))
3166
-
3174
+
3167
3175
  # For each consensus feature, get intensities from all samples
3168
3176
  for consensus_uid in consensus_uids:
3169
3177
  # Get all feature_uids that map to this consensus_uid
3170
- feature_mappings = consensus_mapping_pandas[
3171
- consensus_mapping_pandas["consensus_uid"] == consensus_uid
3172
- ]
3173
-
3178
+ feature_mappings = consensus_mapping_pandas[consensus_mapping_pandas["consensus_uid"] == consensus_uid]
3179
+
3174
3180
  for _, mapping in feature_mappings.iterrows():
3175
3181
  feature_uid = mapping["feature_uid"]
3176
3182
  sample_uid = mapping["sample_uid"]
3177
3183
  sample_name = uid_to_name.get(sample_uid, f"sample_{sample_uid}")
3178
-
3184
+
3179
3185
  # Get intensity for this feature
3180
3186
  feature_row = features_df_pandas[
3181
- (features_df_pandas["feature_uid"] == feature_uid) &
3182
- (features_df_pandas["sample_uid"] == sample_uid)
3187
+ (features_df_pandas["feature_uid"] == feature_uid) & (features_df_pandas["sample_uid"] == sample_uid)
3183
3188
  ]
3184
-
3189
+
3185
3190
  if len(feature_row) > 0:
3186
3191
  intensity = feature_row[quant_col].iloc[0]
3187
3192
  if pd.notna(intensity):
3188
3193
  matrix_data.loc[consensus_uid, sample_name] = float(intensity)
3189
-
3194
+
3190
3195
  # Convert any remaining NaN to 0
3191
3196
  matrix_data = matrix_data.fillna(0.0)
3192
-
3197
+
3193
3198
  study.logger.debug(f"Local matrix built successfully with shape {matrix_data.shape}")
3194
-
3199
+
3195
3200
  return matrix_data
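The matrix returned above has one row per consensus feature and one column per sample, zero-filled wherever a feature was not detected. A small sketch of that layout with hypothetical UIDs and sample names:

    import pandas as pd

    # Sketch: consensus_uid x sample_name intensity matrix, zero-filled then populated.
    consensus_uids = [101, 102]
    sample_names = ["S1", "S2", "S3"]
    matrix = pd.DataFrame(0.0, index=pd.Index(consensus_uids, name="consensus_uid"), columns=sample_names)
    matrix.loc[101, "S1"] = 5.0e5
    matrix.loc[102, "S3"] = 1.2e5
    print(matrix.shape)  # (2, 3)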
3196
3201
 
3197
3202
 
3198
3203
  def _get_adduct_deltas_with_likelihood(study):
3199
3204
  """
3200
3205
  Extract all pairwise mass differences between adducts with joint likelihood scoring.
3201
-
3206
+
3202
3207
  Args:
3203
3208
  study: Study object with _get_adducts method
3204
-
3209
+
3205
3210
  Returns:
3206
3211
  List of tuples: (mass_delta, joint_likelihood, adduct1_name, adduct2_name)
3207
3212
  Sorted by joint_likelihood descending (most likely pairs first)
3208
3213
  """
3209
3214
  try:
3210
3215
  adducts_df = study._get_adducts()
3211
-
3216
+
3212
3217
  if adducts_df is None or adducts_df.is_empty():
3213
3218
  study.logger.warning("No adducts dataframe available for study")
3214
3219
  return []
3215
-
3220
+
3216
3221
  # Convert to pandas for easier manipulation
3217
3222
  adducts_pd = adducts_df.to_pandas()
3218
-
3223
+
3219
3224
  # Check if we have likelihood/probability information
3220
3225
  likelihood_col = None
3221
- for col in ['likelihood', 'probability', 'freq', 'frequency', 'score']:
3226
+ for col in ["likelihood", "probability", "freq", "frequency", "score"]:
3222
3227
  if col in adducts_pd.columns:
3223
3228
  likelihood_col = col
3224
3229
  break
3225
-
3230
+
3226
3231
  # If no likelihood column, estimate based on adduct type
3227
3232
  if likelihood_col is None:
3228
- adducts_pd['estimated_likelihood'] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
3229
- likelihood_col = 'estimated_likelihood'
3230
-
3233
+ adducts_pd["estimated_likelihood"] = adducts_pd.apply(_estimate_adduct_likelihood, axis=1)
3234
+ likelihood_col = "estimated_likelihood"
3235
+
3231
3236
  # Get mass column (try different possible column names)
3232
3237
  mass_col = None
3233
- for col_name in ['mass_shift', 'mass', 'mass_shift_da', 'mass_da']:
3238
+ for col_name in ["mass_shift", "mass", "mass_shift_da", "mass_da"]:
3234
3239
  if col_name in adducts_pd.columns:
3235
3240
  mass_col = col_name
3236
3241
  break
3237
-
3242
+
3238
3243
  if mass_col is None:
3239
- study.logger.warning(f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}")
3244
+ study.logger.warning(
3245
+ f"No mass column found in adducts dataframe. Available columns: {list(adducts_pd.columns)}"
3246
+ )
3240
3247
  return []
3241
-
3248
+
3242
3249
  # Calculate all pairwise differences with joint likelihoods
3243
3250
  adduct_pairs = []
3244
3251
  for i in range(len(adducts_pd)):
3245
3252
  for j in range(i + 1, len(adducts_pd)):
3246
3253
  row_i = adducts_pd.iloc[i]
3247
3254
  row_j = adducts_pd.iloc[j]
3248
-
3255
+
3249
3256
  # Skip if masses are NaN or invalid
3250
- if (hasattr(row_i[mass_col], '__iter__') and not isinstance(row_i[mass_col], str)) or \
3251
- (hasattr(row_j[mass_col], '__iter__') and not isinstance(row_j[mass_col], str)):
3257
+ if (hasattr(row_i[mass_col], "__iter__") and not isinstance(row_i[mass_col], str)) or (
3258
+ hasattr(row_j[mass_col], "__iter__") and not isinstance(row_j[mass_col], str)
3259
+ ):
3252
3260
  continue
3253
-
3261
+
3254
3262
  mass_i = float(row_i[mass_col])
3255
3263
  mass_j = float(row_j[mass_col])
3256
3264
  delta = abs(mass_i - mass_j)
3257
-
3265
+
3258
3266
  if delta > 0.1: # Only meaningful mass differences
3259
3267
  # Joint likelihood is sum of individual likelihoods
3260
3268
  joint_likelihood = float(row_i[likelihood_col]) + float(row_j[likelihood_col])
3261
-
3262
- adduct1_name = row_i.get('adduct', row_i.get('name', f'adduct_{i}'))
3263
- adduct2_name = row_j.get('adduct', row_j.get('name', f'adduct_{j}'))
3264
-
3269
+
3270
+ adduct1_name = row_i.get("adduct", row_i.get("name", f"adduct_{i}"))
3271
+ adduct2_name = row_j.get("adduct", row_j.get("name", f"adduct_{j}"))
3272
+
3265
3273
  # CRITICAL FIX: Order adducts consistently from lower mass to higher mass
3266
3274
  # This ensures consistent assignment: lower mass adduct = from_adduct, higher mass adduct = to_adduct
3267
3275
  if mass_i <= mass_j:
3268
3276
  # row_i has lower or equal mass shift -> from_adduct
3269
- # row_j has higher mass shift -> to_adduct
3277
+ # row_j has higher mass shift -> to_adduct
3270
3278
  adduct_pairs.append((round(delta, 4), joint_likelihood, adduct1_name, adduct2_name))
3271
3279
  else:
3272
3280
  # row_j has lower mass shift -> from_adduct
3273
3281
  # row_i has higher mass shift -> to_adduct
3274
3282
  adduct_pairs.append((round(delta, 4), joint_likelihood, adduct2_name, adduct1_name))
3275
-
3283
+
3276
3284
  # Sort by joint likelihood descending (most likely pairs first)
3277
3285
  adduct_pairs.sort(key=lambda x: x[1], reverse=True)
3278
-
3286
+
3279
3287
  study.logger.debug(f"Extracted {len(adduct_pairs)} adduct pairs with likelihood scoring")
3280
3288
  return adduct_pairs
3281
-
3289
+
3282
3290
  except Exception as e:
3283
- study.logger.warning(f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list.")
3291
+ study.logger.warning(
3292
+ f"Could not extract adduct deltas with likelihood: {e}. No adducts defined - returning empty list."
3293
+ )
3284
3294
  return []
3285
3295
 
3286
3296
 
3287
3297
  def _estimate_adduct_likelihood(adduct_row):
3288
3298
  """
3289
3299
  Estimate likelihood of an adduct based on common knowledge.
3290
-
3300
+
3291
3301
  Args:
3292
3302
  adduct_row: pandas Series with adduct information
3293
-
3303
+
3294
3304
  Returns:
3295
3305
  float: Estimated likelihood (0.0 to 1.0)
3296
3306
  """
3297
- adduct_name = str(adduct_row.get('adduct', adduct_row.get('name', ''))).lower()
3298
-
3307
+ adduct_name = str(adduct_row.get("adduct", adduct_row.get("name", ""))).lower()
3308
+
3299
3309
  # Common likelihood estimates based on adduct frequency in positive mode
3300
3310
  likelihood_map = {
3301
- '[m+h]': 0.9, # Most common
3302
- '[m+na]': 0.7, # Very common
3303
- '[m+nh4]': 0.6, # Common
3304
- '[m+k]': 0.3, # Less common
3305
- '[m+2h]': 0.2, # Doubly charged, less frequent
3306
- '[m+3h]': 0.1, # Triply charged, rare
3307
- '[m+h-h2o]': 0.4, # Loss adducts, moderately common
3311
+ "[m+h]": 0.9, # Most common
3312
+ "[m+na]": 0.7, # Very common
3313
+ "[m+nh4]": 0.6, # Common
3314
+ "[m+k]": 0.3, # Less common
3315
+ "[m+2h]": 0.2, # Doubly charged, less frequent
3316
+ "[m+3h]": 0.1, # Triply charged, rare
3317
+ "[m+h-h2o]": 0.4, # Loss adducts, moderately common
3308
3318
  }
3309
-
3319
+
3310
3320
  # Find best match
3311
3321
  for pattern, likelihood in likelihood_map.items():
3312
3322
  if pattern in adduct_name:
3313
3323
  return likelihood
3314
-
3324
+
3315
3325
  # Default for unknown adducts
3316
3326
  return 0.2
3317
3327
 
@@ -3319,10 +3329,10 @@ def _estimate_adduct_likelihood(adduct_row):
3319
3329
  def _get_adduct_deltas(study):
3320
3330
  """
3321
3331
  Extract all pairwise mass differences between adducts from study adducts data.
3322
-
3332
+
3323
3333
  Args:
3324
3334
  study: Study object with _get_adducts method
3325
-
3335
+
3326
3336
  Returns:
3327
3337
  List of mass differences (deltas) for adduct filtering
3328
3338
  """
@@ -3338,15 +3348,15 @@ def _fast_correlation(vec1, vec2):
3338
3348
  """
3339
3349
  if len(vec1) != len(vec2):
3340
3350
  return 0.0
3341
-
3351
+
3342
3352
  # Remove NaN values and corresponding positions
3343
3353
  mask = ~(np.isnan(vec1) | np.isnan(vec2))
3344
3354
  if np.sum(mask) < 2: # Need at least 2 valid points
3345
3355
  return 0.0
3346
-
3356
+
3347
3357
  v1 = vec1[mask]
3348
3358
  v2 = vec2[mask]
3349
-
3359
+
3350
3360
  # Fast correlation using numpy built-in
3351
3361
  try:
3352
3362
  corr_matrix = np.corrcoef(v1, v2)
@@ -3365,45 +3375,47 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
3365
3375
  4. Hierarchical boss structure (prevent transitivity)
3366
3376
  5. Correlation-based confirmation
3367
3377
  6. Intensity-based ranking for final selection
3368
-
3378
+
3369
3379
  Args:
3370
3380
  study: Study object
3371
3381
  consensus_data: List of consensus feature dictionaries
3372
- rt_tol: Retention time tolerance (seconds)
3382
+ rt_tol: Retention time tolerance (seconds)
3373
3383
  mz_tol: M/z tolerance (Da)
3374
-
3384
+
3375
3385
  Returns:
3376
3386
  Tuple of (adduct_group_list, adduct_of_list)
3377
3387
  """
3378
-
3388
+
3379
3389
  if not consensus_data:
3380
3390
  return [], []
3381
-
3391
+
3382
3392
  n_features = len(consensus_data)
3383
3393
  study.logger.info(f"Starting adduct grouping for {n_features} features")
3384
-
3394
+
3385
3395
  # Step 1: Build local intensity matrix ONCE
3386
3396
  try:
3387
3397
  intensity_matrix_pd = _get_features_matrix(study, consensus_data, quant_col="inty")
3388
-
3398
+
3389
3399
  if intensity_matrix_pd is None or len(intensity_matrix_pd) == 0:
3390
3400
  study.logger.warning("Could not build local intensity matrix - creating single-feature groups")
3391
3401
  adduct_group_list = list(range(1, len(consensus_data) + 1))
3392
3402
  adduct_of_list = [0] * len(consensus_data)
3393
3403
  return adduct_group_list, adduct_of_list
3394
-
3395
- study.logger.debug(f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples")
3396
-
3404
+
3405
+ study.logger.debug(
3406
+ f"Built local intensity matrix: {len(intensity_matrix_pd)} features x {len(intensity_matrix_pd.columns)} samples"
3407
+ )
3408
+
3397
3409
  except Exception as e:
3398
3410
  study.logger.warning(f"Could not build local intensity matrix: {e}. Creating single-feature groups.")
3399
3411
  adduct_group_list = list(range(1, len(consensus_data) + 1))
3400
3412
  adduct_of_list = [0] * len(consensus_data)
3401
3413
  return adduct_group_list, adduct_of_list
3402
-
3403
- # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
3414
+
3415
+ # Step 2: Get adduct pairs with likelihood information and build hash map for fast lookup
3404
3416
  adduct_pairs_with_likelihood = _get_adduct_deltas_with_likelihood(study)
3405
3417
  study.logger.debug(f"Using {len(adduct_pairs_with_likelihood)} adduct pairs with likelihood scoring")
3406
-
3418
+
3407
3419
  # Build hash map for O(1) mass shift lookup
3408
3420
  mass_shift_map = {} # rounded_delta -> [(likelihood, adduct1, adduct2), ...]
3409
3421
  for mass_delta, joint_likelihood, adduct1, adduct2 in adduct_pairs_with_likelihood:
@@ -3411,11 +3423,11 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
3411
3423
  if key not in mass_shift_map:
3412
3424
  mass_shift_map[key] = []
3413
3425
  mass_shift_map[key].append((joint_likelihood, adduct1, adduct2))
3414
-
3426
+
3415
3427
  # Sort each mass shift group by likelihood (highest first)
3416
3428
  for key in mass_shift_map:
3417
3429
  mass_shift_map[key].sort(key=lambda x: x[0], reverse=True)
3418
-
3430
+
3419
3431
  # Step 3: Pre-compute feature properties and sort by RT for spatial filtering
3420
3432
  feature_props = []
3421
3433
  for i, feature in enumerate(consensus_data):
@@ -3423,222 +3435,224 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
3423
3435
  rt = feature["rt"]
3424
3436
  mz = feature["mz"]
3425
3437
  intensity = feature.get("inty_mean", 0)
3426
-
3438
+
3427
3439
  # Get matrix vector once
3428
3440
  matrix_vector = intensity_matrix_pd.loc[uid].values if uid in intensity_matrix_pd.index else None
3429
-
3441
+
3430
3442
  feature_props.append({
3431
- 'index': i,
3432
- 'uid': uid,
3433
- 'rt': rt,
3434
- 'mz': mz,
3435
- 'intensity': intensity,
3436
- 'vector': matrix_vector,
3437
- 'feature': feature
3443
+ "index": i,
3444
+ "uid": uid,
3445
+ "rt": rt,
3446
+ "mz": mz,
3447
+ "intensity": intensity,
3448
+ "vector": matrix_vector,
3449
+ "feature": feature,
3438
3450
  })
3439
-
3451
+
3440
3452
  # Sort by RT for efficient spatial filtering
3441
- feature_props.sort(key=lambda x: x['rt'])
3442
-
3453
+ feature_props.sort(key=lambda x: x["rt"])
3454
+
3443
3455
  # Initialize grouping structures
3444
3456
  uid_to_boss = {} # Hierarchical structure: uid -> boss_uid
3445
3457
  boss_to_members = {} # boss_uid -> [member_uids]
3446
3458
  processed_uids = set()
3447
-
3459
+
3448
3460
  # Step 4: Process features with optimized RT filtering
3449
3461
  for i, boss_prop in enumerate(feature_props):
3450
- boss_uid = boss_prop['uid']
3451
-
3462
+ boss_uid = boss_prop["uid"]
3463
+
3452
3464
  if boss_uid in processed_uids:
3453
3465
  continue
3454
-
3455
- if boss_prop['vector'] is None:
3466
+
3467
+ if boss_prop["vector"] is None:
3456
3468
  processed_uids.add(boss_uid)
3457
3469
  continue
3458
-
3470
+
3459
3471
  # Initialize as boss
3460
3472
  if boss_uid not in uid_to_boss:
3461
3473
  uid_to_boss[boss_uid] = boss_uid
3462
3474
  boss_to_members[boss_uid] = []
3463
-
3464
- boss_rt = boss_prop['rt']
3465
- boss_mz = boss_prop['mz']
3466
- boss_vector = boss_prop['vector']
3467
-
3475
+
3476
+ boss_rt = boss_prop["rt"]
3477
+ boss_mz = boss_prop["mz"]
3478
+ boss_vector = boss_prop["vector"]
3479
+
3468
3480
  # Step 5: Efficient RT coelution filtering using sorted array
3469
3481
  candidate_pairs = []
3470
-
3482
+
3471
3483
  # Search backwards from current position
3472
3484
  j = i - 1
3473
- while j >= 0 and (boss_rt - feature_props[j]['rt']) <= rt_tol:
3485
+ while j >= 0 and (boss_rt - feature_props[j]["rt"]) <= rt_tol:
3474
3486
  candidate = feature_props[j]
3475
- if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
3476
- if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
3487
+ if candidate["uid"] not in processed_uids and candidate["vector"] is not None:
3488
+ if candidate["uid"] not in uid_to_boss or uid_to_boss[candidate["uid"]] == candidate["uid"]:
3477
3489
  # Calculate mz difference and check mass shift
3478
- mz_diff = abs(boss_mz - candidate['mz'])
3490
+ mz_diff = abs(boss_mz - candidate["mz"])
3479
3491
  mass_shift_key = round(mz_diff / mz_tol) * mz_tol
3480
-
3492
+
3481
3493
  if mass_shift_key in mass_shift_map:
3482
3494
  likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0] # Best likelihood
3483
3495
  candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
3484
3496
  j -= 1
3485
-
3497
+
3486
3498
  # Search forwards from current position
3487
3499
  j = i + 1
3488
- while j < len(feature_props) and (feature_props[j]['rt'] - boss_rt) <= rt_tol:
3500
+ while j < len(feature_props) and (feature_props[j]["rt"] - boss_rt) <= rt_tol:
3489
3501
  candidate = feature_props[j]
3490
- if candidate['uid'] not in processed_uids and candidate['vector'] is not None:
3491
- if candidate['uid'] not in uid_to_boss or uid_to_boss[candidate['uid']] == candidate['uid']:
3502
+ if candidate["uid"] not in processed_uids and candidate["vector"] is not None:
3503
+ if candidate["uid"] not in uid_to_boss or uid_to_boss[candidate["uid"]] == candidate["uid"]:
3492
3504
  # Calculate mz difference and check mass shift
3493
- mz_diff = abs(boss_mz - candidate['mz'])
3505
+ mz_diff = abs(boss_mz - candidate["mz"])
3494
3506
  mass_shift_key = round(mz_diff / mz_tol) * mz_tol
3495
-
3507
+
3496
3508
  if mass_shift_key in mass_shift_map:
3497
3509
  likelihood, adduct1, adduct2 = mass_shift_map[mass_shift_key][0] # Best likelihood
3498
3510
  candidate_pairs.append((candidate, likelihood, (adduct1, adduct2)))
3499
3511
  j += 1
3500
-
3512
+
3501
3513
  # Sort candidates by likelihood (descending) to prioritize chemically meaningful pairs
3502
3514
  candidate_pairs.sort(key=lambda x: x[1], reverse=True)
3503
-
3515
+
3504
3516
  # Step 6: Process candidates in likelihood priority order
3505
3517
  for candidate_prop, likelihood, adduct_info in candidate_pairs:
3506
- candidate_uid = candidate_prop['uid']
3507
- candidate_vector = candidate_prop['vector']
3508
-
3518
+ candidate_uid = candidate_prop["uid"]
3519
+ candidate_vector = candidate_prop["vector"]
3520
+
3509
3521
  # Correlation confirmation with optimized threshold
3510
3522
  try:
3511
3523
  correlation = _fast_correlation(boss_vector, candidate_vector)
3512
-
3524
+
3513
3525
  if correlation < 0.5: # More permissive for legitimate adduct relationships
3514
3526
  continue
3515
-
3527
+
3516
3528
  except Exception:
3517
3529
  continue
3518
-
3530
+
3519
3531
  # Step 7: Hierarchical assignment (merge groups if needed)
3520
3532
  if candidate_uid in boss_to_members:
3521
3533
  old_members = boss_to_members[candidate_uid].copy()
3522
3534
  del boss_to_members[candidate_uid]
3523
-
3535
+
3524
3536
  # Reassign old members to new boss
3525
3537
  for member in old_members:
3526
3538
  uid_to_boss[member] = boss_uid
3527
3539
  boss_to_members[boss_uid].append(member)
3528
-
3540
+
3529
3541
  # Assign candidate to current boss
3530
3542
  uid_to_boss[candidate_uid] = boss_uid
3531
3543
  boss_to_members[boss_uid].append(candidate_uid)
3532
3544
  processed_uids.add(candidate_uid)
3533
-
3545
+
3534
3546
  processed_uids.add(boss_uid)
3535
-
3547
+
3536
3548
  # Step 8: Intensity-based ranking within groups (optimized)
3537
3549
  for boss_uid in list(boss_to_members.keys()):
3538
3550
  members = boss_to_members[boss_uid]
3539
3551
  if len(members) == 0:
3540
3552
  continue
3541
-
3553
+
3542
3554
  all_group_members = [boss_uid] + members
3543
-
3555
+
3544
3556
  # Find member with highest intensity efficiently
3545
3557
  max_intensity = -1
3546
3558
  new_boss = boss_uid
3547
-
3559
+
3548
3560
  for member_uid in all_group_members:
3549
3561
  # Find member_uid in feature_props
3550
- member_intensity = next((fp['intensity'] for fp in feature_props if fp['uid'] == member_uid), 0)
3562
+ member_intensity = next((fp["intensity"] for fp in feature_props if fp["uid"] == member_uid), 0)
3551
3563
  if member_intensity > max_intensity:
3552
3564
  max_intensity = member_intensity
3553
3565
  new_boss = member_uid
3554
-
3566
+
3555
3567
  # Update boss if needed
3556
3568
  if new_boss != boss_uid:
3557
3569
  boss_to_members[new_boss] = [m for m in all_group_members if m != new_boss]
3558
3570
  del boss_to_members[boss_uid]
3559
-
3571
+
3560
3572
  # Update all member references
3561
3573
  for member in all_group_members:
3562
3574
  uid_to_boss[member] = new_boss
3563
-
3575
+
3564
3576
  # Count and log results
3565
3577
  total_groups = len(boss_to_members)
3566
3578
  multi_member_groups = sum(1 for members in boss_to_members.values() if len(members) > 0)
3567
3579
  total_grouped_features = sum(len(members) + 1 for members in boss_to_members.values())
3568
-
3569
- study.logger.info(f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)")
3570
-
3580
+
3581
+ study.logger.info(
3582
+ f"Grouping results: {total_groups} groups ({multi_member_groups} multi-member, {total_grouped_features} features)"
3583
+ )
3584
+
3571
3585
  # Step 9: Convert to return format (optimized)
3572
- uid_to_index = {fp['uid']: fp['index'] for fp in feature_props}
3586
+ uid_to_index = {fp["uid"]: fp["index"] for fp in feature_props}
3573
3587
  adduct_group_list = [0] * n_features
3574
3588
  adduct_of_list = [0] * n_features
3575
-
3589
+
3576
3590
  group_counter = 1
3577
3591
  for boss_uid, members in boss_to_members.items():
3578
3592
  # Assign boss
3579
3593
  boss_idx = uid_to_index[boss_uid]
3580
3594
  adduct_group_list[boss_idx] = group_counter
3581
3595
  adduct_of_list[boss_idx] = 0
3582
-
3596
+
3583
3597
  # Assign members
3584
3598
  for member_uid in members:
3585
3599
  member_idx = uid_to_index[member_uid]
3586
3600
  adduct_group_list[member_idx] = group_counter
3587
3601
  adduct_of_list[member_idx] = boss_uid
3588
-
3602
+
3589
3603
  group_counter += 1
3590
-
3604
+
3591
3605
  # Handle ungrouped features
3592
3606
  for i in range(n_features):
3593
3607
  if adduct_group_list[i] == 0:
3594
3608
  adduct_group_list[i] = group_counter
3595
3609
  adduct_of_list[i] = 0
3596
3610
  group_counter += 1
3597
-
3611
+
3598
3612
  return adduct_group_list, adduct_of_list
3599
3613
 
3600
3614
 
3601
3615
  def _fast_correlation(x, y):
3602
3616
  """
3603
3617
  Fast correlation coefficient calculation for consensus matrix data.
3604
-
3618
+
3605
3619
  In the consensus matrix:
3606
- - Negative values (typically -1.0) indicate missing features
3620
+ - Negative values (typically -1.0) indicate missing features
3607
3621
  - Zero and positive values are actual intensities
3608
3622
  - Only consider intensities >= 1000 for meaningful correlation
3609
-
3623
+
3610
3624
  Args:
3611
3625
  x, y: numpy arrays of the same length
3612
-
3626
+
3613
3627
  Returns:
3614
3628
  Correlation coefficient (float), 0 if cannot be calculated
3615
3629
  """
3616
3630
  import numpy as np
3617
-
3631
+
3618
3632
  # For consensus matrix: exclude negative values (missing features) and very low intensities
3619
- # Use a very low threshold since processed matrix values are often scaled/normalized
3633
+ # Use a very low threshold since processed matrix values are often scaled/normalized
3620
3634
  valid = ~(np.isnan(x) | np.isnan(y) | (x < 0) | (y < 0) | (x < 0.1) | (y < 0.1))
3621
-
3635
+
3622
3636
  if np.sum(valid) < 3: # Need at least 3 valid pairs
3623
3637
  return 0.0
3624
-
3638
+
3625
3639
  x_valid = x[valid]
3626
3640
  y_valid = y[valid]
3627
-
3641
+
3628
3642
  # If all values are the same (e.g., all zeros), correlation is undefined
3629
3643
  if np.var(x_valid) == 0 or np.var(y_valid) == 0:
3630
3644
  return 0.0
3631
-
3645
+
3632
3646
  # Fast correlation using numpy
3633
3647
  try:
3634
3648
  correlation_matrix = np.corrcoef(x_valid, y_valid)
3635
3649
  correlation = correlation_matrix[0, 1]
3636
-
3650
+
3637
3651
  # Handle NaN result
3638
3652
  if np.isnan(correlation):
3639
3653
  return 0.0
3640
-
3654
+
3641
3655
  return correlation
3642
-
3656
+
3643
3657
  except Exception:
3644
3658
  return 0.0