masster 0.4.18__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/merge.py CHANGED
@@ -10,9 +10,270 @@ from datetime import datetime
10
10
  from tqdm import tqdm
11
11
  import pyopenms as oms
12
12
  import polars as pl
13
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
14
+ from concurrent.futures.process import BrokenProcessPool
13
15
  from masster.study.defaults import merge_defaults
14
16
 
15
17
 
18
+ def _process_kd_chunk_parallel(chunk_data):
19
+ """
20
+ Process a single KD chunk in a worker process by reconstructing per-sample FeatureMaps from a features_df slice.
21
+
22
+ Args:
23
+ chunk_data: dict with keys 'chunk_start_idx', 'chunk_features_data', 'chunk_samples_data' and 'params'
24
+
25
+ Returns:
26
+ Tuple of (chunk_start_idx, serialized_consensus_features)
27
+ """
28
+ import pyopenms as oms
29
+
30
+ chunk_start_idx = chunk_data['chunk_start_idx']
31
+ chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
32
+ chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
33
+ params_dict = chunk_data['params']
34
+
35
+ # Reconstruct FeatureMaps from features data for each sample in the chunk
36
+ chunk_maps = []
37
+
38
+ for sample_data in chunk_samples_data:
39
+ sample_uid = sample_data['sample_uid']
40
+
41
+ # Filter features for this specific sample
42
+ sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
43
+
44
+ # Create FeatureMap for this sample
45
+ feature_map = oms.FeatureMap()
46
+
47
+ # Add each feature to the map
48
+ for feature_dict in sample_features:
49
+ feature = oms.Feature()
50
+ feature.setRT(float(feature_dict['rt']))
51
+ feature.setMZ(float(feature_dict['mz']))
52
+ feature.setIntensity(float(feature_dict['inty']))
53
+ feature.setCharge(int(feature_dict.get('charge', 0)))
54
+
55
+ # Set unique ID using feature_id for mapping back
56
+ feature.setUniqueId(int(feature_dict['feature_id']))
57
+
58
+ feature_map.push_back(feature)
59
+
60
+ chunk_maps.append(feature_map)
61
+
62
+ # Create the chunk consensus map
63
+ chunk_consensus_map = oms.ConsensusMap()
64
+
65
+ # Set up file descriptions for chunk
66
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
67
+ for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
68
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
69
+ file_description.filename = sample_data['sample_name']
70
+ file_description.size = feature_map.size()
71
+ file_description.unique_id = feature_map.getUniqueId()
72
+ file_descriptions[j] = file_description
73
+
74
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
75
+
76
+ # Use KD algorithm for chunk
77
+ grouper = oms.FeatureGroupingAlgorithmKD()
78
+ chunk_params = grouper.getParameters()
79
+ chunk_params.setValue("mz_unit", "Da")
80
+ chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
81
+ chunk_params.setValue("warp:enabled", "true")
82
+ chunk_params.setValue("warp:rt_tol", params_dict['rt_tol'])
83
+ chunk_params.setValue("warp:mz_tol", params_dict['mz_tol'])
84
+ chunk_params.setValue("link:rt_tol", params_dict['rt_tol'])
85
+ chunk_params.setValue("link:mz_tol", params_dict['mz_tol'])
86
+ chunk_params.setValue("link:min_rel_cc_size", params_dict['min_rel_cc_size'])
87
+ chunk_params.setValue("link:max_pairwise_log_fc", params_dict['max_pairwise_log_fc'])
88
+ chunk_params.setValue("link:max_nr_conflicts", params_dict['max_nr_conflicts'])
89
+
90
+ grouper.setParameters(chunk_params)
91
+ grouper.group(chunk_maps, chunk_consensus_map)
92
+
93
+ # Serialize the consensus map result for cross-process communication
94
+ consensus_features = []
95
+ for consensus_feature in chunk_consensus_map:
96
+ feature_data = {
97
+ 'rt': consensus_feature.getRT(),
98
+ 'mz': consensus_feature.getMZ(),
99
+ 'intensity': consensus_feature.getIntensity(),
100
+ 'quality': consensus_feature.getQuality(),
101
+ 'unique_id': str(consensus_feature.getUniqueId()),
102
+ 'features': []
103
+ }
104
+
105
+ # Get constituent features
106
+ for feature_handle in consensus_feature.getFeatureList():
107
+ feature_handle_data = {
108
+ 'unique_id': str(feature_handle.getUniqueId()),
109
+ 'map_index': feature_handle.getMapIndex()
110
+ }
111
+ feature_data['features'].append(feature_handle_data)
112
+
113
+ consensus_features.append(feature_data)
114
+
115
+ return chunk_start_idx, consensus_features
116
+
117
+
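For readers skimming the diff, here is a minimal sketch of the payload this worker expects. The field names mirror the keys read by the function above; the numeric values are placeholders, not masster defaults.

    # Illustrative chunk_data payload for _process_kd_chunk_parallel (toy values).
    example_chunk_data = {
        'chunk_start_idx': 0,                      # offset of the chunk within samples_df
        'chunk_features_data': [                   # one dict per feature in the chunk
            {'sample_uid': 'S1', 'rt': 120.5, 'mz': 301.1410,
             'inty': 1.2e5, 'charge': 1, 'feature_id': 1001},
        ],
        'chunk_samples_data': [                    # one dict per sample in the chunk
            {'sample_uid': 'S1', 'sample_name': 'sample_01'},
        ],
        'params': {                                # grouping tolerances, see merge() docstring
            'nr_partitions': 500, 'rt_tol': 5.0, 'mz_tol': 0.01,
            'min_rel_cc_size': 0.3, 'max_pairwise_log_fc': 0.5,
            'max_nr_conflicts': 0,
        },
    }
    # Submitted by _merge_kd_chunked below as:
    #   executor.submit(_process_kd_chunk_parallel, example_chunk_data)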
118
+ def _deserialize_consensus_features(consensus_features):
119
+ """
120
+ Deserialize consensus features back into an OpenMS ConsensusMap.
121
+
122
+ Args:
123
+ consensus_features: List of serialized consensus feature dictionaries
124
+
125
+ Returns:
126
+ OpenMS ConsensusMap object
127
+ """
128
+ import pyopenms as oms
129
+
130
+ consensus_map = oms.ConsensusMap()
131
+
132
+ for feature_data in consensus_features:
133
+ consensus_feature = oms.ConsensusFeature()
134
+ consensus_feature.setRT(float(feature_data['rt']))
135
+ consensus_feature.setMZ(float(feature_data['mz']))
136
+ consensus_feature.setIntensity(float(feature_data['intensity']))
137
+ consensus_feature.setQuality(float(feature_data['quality']))
138
+ consensus_feature.setUniqueId(int(feature_data['unique_id']))
139
+
140
+ # Reconstruct feature handles (simplified approach)
141
+ feature_handles = []
142
+ for handle_data in feature_data['features']:
143
+ feature_handle = oms.FeatureHandle()
144
+ feature_handle.setUniqueId(int(handle_data['unique_id']))
145
+ feature_handle.setMapIndex(int(handle_data['map_index']))
146
+ feature_handles.append(feature_handle)
147
+
148
+ # Set the feature list - properly add feature handles back to consensus feature
149
+ if feature_handles:
150
+ # Add each feature handle to the consensus feature using the correct OpenMS API
151
+ for feature_handle in feature_handles:
152
+ consensus_feature.getFeatureList().append(feature_handle)
153
+
154
+ consensus_map.push_back(consensus_feature)
155
+
156
+ return consensus_map
157
+
158
+
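The round trip between the serializing workers and the helper above exists because worker output has to cross process boundaries via pickle, and pyopenms wrapper objects are generally not picklable, so only plain dicts travel. A small self-contained sketch of the serialized shape (toy values):

    import pickle

    # Shape produced by the chunk workers and consumed by _deserialize_consensus_features.
    serialized = [{
        'rt': 120.5, 'mz': 301.1410, 'intensity': 2.4e5, 'quality': 0.9,
        'unique_id': '1001',
        'features': [{'unique_id': '1001', 'map_index': 0}],
    }]
    # Plain Python data survives pickling, which is what ProcessPoolExecutor relies on.
    assert pickle.loads(pickle.dumps(serialized)) == serialized
    # consensus_map = _deserialize_consensus_features(serialized)  # rebuilds an oms.ConsensusMap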
159
+ def _process_qt_chunk_parallel(chunk_data):
160
+ """
161
+ Process a single QT chunk in a worker process by reconstructing per-sample FeatureMaps from a features_df slice.
162
+
163
+ Args:
164
+ chunk_data: dict with keys 'chunk_start_idx', 'chunk_features_data', 'chunk_samples_data' and 'params'
165
+
166
+ Returns:
167
+ Tuple of (chunk_start_idx, serialized_consensus_features)
168
+ """
169
+ import pyopenms as oms
170
+
171
+ chunk_start_idx = chunk_data['chunk_start_idx']
172
+ chunk_features_data = chunk_data['chunk_features_data'] # List of feature dicts
173
+ chunk_samples_data = chunk_data['chunk_samples_data'] # List of sample dicts
174
+ params_dict = chunk_data['params']
175
+
176
+ # Reconstruct FeatureMaps from features data for each sample in the chunk
177
+ chunk_maps = []
178
+
179
+ for sample_data in chunk_samples_data:
180
+ sample_uid = sample_data['sample_uid']
181
+
182
+ # Filter features for this specific sample
183
+ sample_features = [f for f in chunk_features_data if f['sample_uid'] == sample_uid]
184
+
185
+ # Create FeatureMap for this sample
186
+ feature_map = oms.FeatureMap()
187
+
188
+ # Add each feature to the map
189
+ for feature_dict in sample_features:
190
+ feature = oms.Feature()
191
+ feature.setRT(float(feature_dict['rt']))
192
+ feature.setMZ(float(feature_dict['mz']))
193
+ feature.setIntensity(float(feature_dict['inty']))
194
+ feature.setCharge(int(feature_dict.get('charge', 0)))
195
+
196
+ # Set unique ID using feature_id for mapping back
197
+ feature.setUniqueId(int(feature_dict['feature_id']))
198
+
199
+ feature_map.push_back(feature)
200
+
201
+ chunk_maps.append(feature_map)
202
+
203
+ # Create the chunk consensus map
204
+ chunk_consensus_map = oms.ConsensusMap()
205
+
206
+ # Set up file descriptions for chunk
207
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
208
+ for j, (feature_map, sample_data) in enumerate(zip(chunk_maps, chunk_samples_data)):
209
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
210
+ file_description.filename = sample_data['sample_name']
211
+ file_description.size = feature_map.size()
212
+ file_description.unique_id = feature_map.getUniqueId()
213
+ file_descriptions[j] = file_description
214
+
215
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
216
+
217
+ # Use QT algorithm for chunk
218
+ grouper = oms.FeatureGroupingAlgorithmQT()
219
+ chunk_params = grouper.getParameters()
220
+ chunk_params.setValue("distance_RT:max_difference", params_dict['rt_tol'])
221
+ chunk_params.setValue("distance_MZ:max_difference", params_dict['mz_tol'])
222
+ chunk_params.setValue("distance_MZ:unit", "Da")
223
+ chunk_params.setValue("ignore_charge", "true")
224
+ chunk_params.setValue("nr_partitions", params_dict['nr_partitions'])
225
+
226
+ grouper.setParameters(chunk_params)
227
+ grouper.group(chunk_maps, chunk_consensus_map)
228
+
229
+ # Serialize the consensus map result for cross-process communication
230
+ consensus_features = []
231
+ for consensus_feature in chunk_consensus_map:
232
+ feature_data = {
233
+ 'rt': consensus_feature.getRT(),
234
+ 'mz': consensus_feature.getMZ(),
235
+ 'intensity': consensus_feature.getIntensity(),
236
+ 'quality': consensus_feature.getQuality(),
237
+ 'unique_id': str(consensus_feature.getUniqueId()),
238
+ 'features': []
239
+ }
240
+
241
+ # Get constituent features
242
+ for feature_handle in consensus_feature.getFeatureList():
243
+ feature_handle_data = {
244
+ 'unique_id': str(feature_handle.getUniqueId()),
245
+ 'map_index': feature_handle.getMapIndex()
246
+ }
247
+ feature_data['features'].append(feature_handle_data)
248
+
249
+ consensus_features.append(feature_data)
250
+
251
+ return chunk_start_idx, consensus_features
252
+
253
+
254
+ def _serialize_feature_map(feature_map):
255
+ """
256
+ Serialize a FeatureMap to a list of dictionaries for multiprocessing.
257
+
258
+ Args:
259
+ feature_map: OpenMS FeatureMap object
260
+
261
+ Returns:
262
+ List of feature dictionaries
263
+ """
264
+ features_data = []
265
+ for feature in feature_map:
266
+ feature_data = {
267
+ 'rt': feature.getRT(),
268
+ 'mz': feature.getMZ(),
269
+ 'intensity': feature.getIntensity(),
270
+ 'charge': feature.getCharge(),
271
+ 'unique_id': feature.getUniqueId()
272
+ }
273
+ features_data.append(feature_data)
274
+ return features_data
275
+
276
+
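A short usage sketch for the helper above, using only pyopenms calls that already appear in this module (requires pyopenms; the values are arbitrary):

    import pyopenms as oms

    fm = oms.FeatureMap()
    f = oms.Feature()
    f.setRT(120.5)
    f.setMZ(301.1410)
    f.setIntensity(1.2e5)
    f.setCharge(1)
    f.setUniqueId(1001)
    fm.push_back(f)

    # -> e.g. [{'rt': 120.5, 'mz': 301.141, 'intensity': 120000.0, 'charge': 1, 'unique_id': 1001}]
    print(_serialize_feature_map(fm))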
16
277
  def merge(self, **kwargs) -> None:
17
278
  """
18
279
  Group features across samples into consensus features using various algorithms.
@@ -34,6 +295,8 @@ def merge(self, **kwargs) -> None:
34
295
  m/z tolerance in Da (Daltons) for all methods
35
296
  - chunk_size : int, default 500
36
297
  Chunk size for 'chunked' method
298
+ - threads : int, default 1
299
+ Number of parallel processes for chunked methods (kd_chunked, qt_chunked)
37
300
  - nr_partitions : int, default 500
38
301
  Number of partitions in m/z dimension for KD algorithms
39
302
  - min_rel_cc_size : float, default 0.3
@@ -54,9 +317,19 @@ def merge(self, **kwargs) -> None:
54
317
  - NoWarp: Memory efficient KD without RT warping for large datasets
55
318
  - KD-Chunked: Memory-optimized KD algorithm for very large datasets (>5000 samples)
56
319
  Uses optimized partitioning for better memory management while maintaining
57
- full cross-sample consensus feature detection.
320
+ full cross-sample consensus feature detection. Supports parallel processing.
58
321
  - QT-Chunked: Memory-optimized QT algorithm for very large datasets (>5000 samples)
59
322
  Uses QT clustering in first stage with optimized cross-chunk consensus building.
323
+ Supports parallel processing.
324
+
325
+ Parallel Processing
326
+ -------------------
327
+ For kd_chunked and qt_chunked methods, use threads > 1 to enable parallel processing
328
+ of chunk alignments. This can significantly reduce processing time for large datasets
329
+ by processing multiple chunks simultaneously in separate processes.
330
+
331
+ Example:
332
+ study.merge(method='kd_chunked', threads=4, chunk_size=200)
60
333
  """
61
334
  start_time = time.time()
62
335
 
@@ -95,6 +368,17 @@ def merge(self, **kwargs) -> None:
95
368
  if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
96
369
  raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
97
370
 
371
+ # Check if chunked method is advisable for large datasets
372
+ num_samples = len(self.samples_df) if hasattr(self, 'samples_df') and self.samples_df is not None else 0
373
+ if num_samples > 500:
374
+ chunked_methods = {'kd_chunked', 'qt_chunked'}
375
+ if params.method not in chunked_methods:
376
+ self.logger.warning(
377
+ f"Large dataset detected ({num_samples} samples > 500). "
378
+ f"For better performance and memory efficiency, consider using a chunked method: "
379
+ f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
380
+ )
381
+
98
382
  # Persist last used params for diagnostics
99
383
  try:
100
384
  self._merge_params_last = params.to_dict()
@@ -113,7 +397,7 @@ def merge(self, **kwargs) -> None:
113
397
  # Ensure feature maps are available for merging (regenerate if needed)
114
398
  if len(self.features_maps) < len(self.samples_df):
115
399
  self.features_maps = []
116
- self.load_features()
400
+ # Feature maps will be generated on-demand within each merge method
117
401
 
118
402
  self.logger.info(
119
403
  f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da, min_rel_cc_size={params.min_rel_cc_size}, max_pairwise_log_fc={params.max_pairwise_log_fc}, max_nr_conflicts={params.max_nr_conflicts}"
@@ -161,9 +445,16 @@ def merge(self, **kwargs) -> None:
161
445
  consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
162
446
  # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
163
447
 
448
+ # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
449
+ if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked']:
450
+ self._consensus_cleanup(params.rt_tol, params.mz_tol)
451
+
164
452
  # Perform adduct grouping
165
453
  self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
166
454
 
455
+ # Identify coeluting consensus features by mass shifts and update adduct information
456
+ self._identify_adduct_by_mass_shift(params.rt_tol, cached_adducts_df)
457
+
167
458
  # Link MS2 if requested
168
459
  if params.link_ms2:
169
460
  self._finalize_merge(params.link_ms2, params.min_samples)
@@ -176,10 +467,13 @@ def merge(self, **kwargs) -> None:
176
467
  def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
177
468
  """KD-tree based merge (fast, recommended)"""
178
469
 
470
+ # Generate temporary feature maps on-demand from features_df
471
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
472
+
179
473
  consensus_map = oms.ConsensusMap()
180
474
  file_descriptions = consensus_map.getColumnHeaders()
181
475
 
182
- for i, feature_map in enumerate(self.features_maps):
476
+ for i, feature_map in enumerate(temp_feature_maps):
183
477
  file_description = file_descriptions.get(i, oms.ColumnHeader())
184
478
  file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
185
479
  file_description.size = feature_map.size()
@@ -205,22 +499,145 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
205
499
  #params_oms.setValue("link:charge_merging", "With_charge_zero") THIS LEADS TO A CRASH
206
500
 
207
501
  grouper.setParameters(params_oms)
208
- grouper.group(self.features_maps, consensus_map)
502
+ grouper.group(temp_feature_maps, consensus_map)
209
503
 
210
504
  return consensus_map
211
505
 
212
506
 
507
+ def _generate_feature_maps_on_demand(study):
508
+ """
509
+ Generate feature maps on-demand from study.features_df for merge operations.
510
+ Returns temporary feature maps that are not cached in the study.
511
+
512
+ Args:
513
+ study: Study object containing features_df and samples_df
514
+
515
+ Returns:
516
+ list: List of temporary FeatureMap objects
517
+ """
518
+ import polars as pl
519
+ import pyopenms as oms
520
+ import numpy as np
521
+
522
+ if study.features_df is None or len(study.features_df) == 0:
523
+ study.logger.error("No features_df available for generating feature maps")
524
+ return []
525
+
526
+ temp_feature_maps = []
527
+ n_samples = len(study.samples_df)
528
+ n_features = len(study.features_df)
529
+
530
+ # Performance optimization: use efficient polars groupby for large datasets
531
+ use_groupby_optimization = n_features > 5000
532
+ if use_groupby_optimization:
533
+ study.logger.debug(f"Using polars groupby optimization for {n_features} features across {n_samples} samples")
534
+
535
+ # Pre-group features by sample_uid - this is much more efficient than repeated filtering
536
+ features_by_sample = study.features_df.group_by("sample_uid").agg([
537
+ pl.col("feature_id"),
538
+ pl.col("mz"),
539
+ pl.col("rt"),
540
+ pl.col("inty"),
541
+ pl.col("quality").fill_null(1.0),
542
+ pl.col("charge").fill_null(0)
543
+ ])
544
+
545
+ # Convert to dictionary for fast lookups
546
+ sample_feature_dict = {}
547
+ for row in features_by_sample.iter_rows(named=True):
548
+ sample_uid = row["sample_uid"]
549
+ # Convert lists to numpy arrays for vectorized operations
550
+ sample_feature_dict[sample_uid] = {
551
+ "feature_id": np.array(row["feature_id"]),
552
+ "mz": np.array(row["mz"]),
553
+ "rt": np.array(row["rt"]),
554
+ "inty": np.array(row["inty"]),
555
+ "quality": np.array(row["quality"]),
556
+ "charge": np.array(row["charge"])
557
+ }
558
+
559
+ # Process each sample in order
560
+ for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
561
+ sample_uid = row_dict["sample_uid"]
562
+
563
+ if use_groupby_optimization:
564
+ # Use pre-grouped data with vectorized operations
565
+ if sample_uid not in sample_feature_dict:
566
+ feature_map = oms.FeatureMap()
567
+ temp_feature_maps.append(feature_map)
568
+ continue
569
+
570
+ sample_data = sample_feature_dict[sample_uid]
571
+ n_sample_features = len(sample_data["feature_id"])
572
+
573
+ if n_sample_features == 0:
574
+ feature_map = oms.FeatureMap()
575
+ temp_feature_maps.append(feature_map)
576
+ continue
577
+
578
+ # Create new FeatureMap
579
+ feature_map = oms.FeatureMap()
580
+
581
+ # Use vectorized data directly (no conversion needed)
582
+ for i in range(n_sample_features):
583
+ try:
584
+ feature = oms.Feature()
585
+ feature.setUniqueId(int(sample_data["feature_id"][i]))
586
+ feature.setMZ(float(sample_data["mz"][i]))
587
+ feature.setRT(float(sample_data["rt"][i]))
588
+ feature.setIntensity(float(sample_data["inty"][i]))
589
+ feature.setOverallQuality(float(sample_data["quality"][i]))
590
+ feature.setCharge(int(sample_data["charge"][i]))
591
+ feature_map.push_back(feature)
592
+ except (ValueError, TypeError) as e:
593
+ study.logger.warning(f"Skipping feature due to conversion error: {e}")
594
+ continue
595
+ else:
596
+ # Use original polars-based approach for smaller datasets
597
+ sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
598
+
599
+ # Create new FeatureMap
600
+ feature_map = oms.FeatureMap()
601
+
602
+ # Convert DataFrame features to OpenMS Features
603
+ for feature_row in sample_features.iter_rows(named=True):
604
+ feature = oms.Feature()
605
+
606
+ # Set properties from DataFrame (handle missing values gracefully)
607
+ try:
608
+ feature.setUniqueId(int(feature_row["feature_id"]))
609
+ feature.setMZ(float(feature_row["mz"]))
610
+ feature.setRT(float(feature_row["rt"]))
611
+ feature.setIntensity(float(feature_row["inty"]))
612
+ feature.setOverallQuality(float(feature_row["quality"]))
613
+ feature.setCharge(int(feature_row["charge"]))
614
+
615
+ # Add to feature map
616
+ feature_map.push_back(feature)
617
+ except (ValueError, TypeError) as e:
618
+ study.logger.warning(f"Skipping feature due to conversion error: {e}")
619
+ continue
620
+
621
+ temp_feature_maps.append(feature_map)
622
+
623
+ study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df")
624
+ return temp_feature_maps
625
+
626
+
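The group_by path above replaces one `filter()` per sample with a single aggregation pass; a self-contained sketch of the same polars pattern on toy data:

    import polars as pl

    features_df = pl.DataFrame({
        "sample_uid": ["S1", "S1", "S2"],
        "feature_id": [1, 2, 3],
        "mz": [301.141, 455.200, 301.143],
        "rt": [120.5, 240.0, 121.0],
        "inty": [1.2e5, 8.0e4, 9.5e4],
    })

    # One pass over the table instead of len(samples_df) filter() calls.
    grouped = features_df.group_by("sample_uid").agg([
        pl.col("feature_id"), pl.col("mz"), pl.col("rt"), pl.col("inty"),
    ])
    per_sample = {row["sample_uid"]: row for row in grouped.iter_rows(named=True)}
    print(per_sample["S1"]["mz"])   # -> [301.141, 455.2]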
213
627
  def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
214
628
  """QT (Quality Threshold) based merge"""
215
629
 
216
- n_samples = len(self.features_maps)
630
+ # Generate temporary feature maps on-demand from features_df
631
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
632
+
633
+ n_samples = len(temp_feature_maps)
217
634
  if n_samples > 1000:
218
635
  self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
219
636
 
220
637
  consensus_map = oms.ConsensusMap()
221
638
  file_descriptions = consensus_map.getColumnHeaders()
222
639
 
223
- for i, feature_map in enumerate(self.features_maps):
640
+ for i, feature_map in enumerate(temp_feature_maps):
224
641
  file_description = file_descriptions.get(i, oms.ColumnHeader())
225
642
  file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
226
643
  file_description.size = feature_map.size()
@@ -243,7 +660,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
243
660
  params_oms.setValue("nr_partitions", params.nr_partitions)
244
661
 
245
662
  grouper.setParameters(params_oms)
246
- grouper.group(self.features_maps, consensus_map)
663
+ grouper.group(temp_feature_maps, consensus_map)
247
664
 
248
665
  return consensus_map
249
666
 
@@ -741,10 +1158,13 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
741
1158
  def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
742
1159
  """KD-tree based merge without RT warping"""
743
1160
 
1161
+ # Generate temporary feature maps on-demand from features_df
1162
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
1163
+
744
1164
  consensus_map = oms.ConsensusMap()
745
1165
  file_descriptions = consensus_map.getColumnHeaders()
746
1166
 
747
- for i, feature_map in enumerate(self.features_maps):
1167
+ for i, feature_map in enumerate(temp_feature_maps):
748
1168
  file_description = file_descriptions.get(i, oms.ColumnHeader())
749
1169
  file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
750
1170
  file_description.size = feature_map.size()
@@ -768,15 +1188,18 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
768
1188
  #params_oms.setValue("link:charge_merging", "Any")
769
1189
 
770
1190
  grouper.setParameters(params_oms)
771
- grouper.group(self.features_maps, consensus_map)
1191
+ grouper.group(temp_feature_maps, consensus_map)
772
1192
 
773
1193
  return consensus_map
774
1194
 
775
1195
 
776
1196
  def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
777
- """KD-based chunked merge with proper cross-chunk consensus building"""
1197
+ """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
778
1198
 
779
- n_samples = len(self.features_maps)
1199
+ # Generate temporary feature maps on-demand from features_df
1200
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
1201
+
1202
+ n_samples = len(temp_feature_maps)
780
1203
  if n_samples <= params.chunk_size:
781
1204
  self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
782
1205
  consensus_map = _merge_kd(self, params)
@@ -788,58 +1211,175 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
788
1211
  chunks = []
789
1212
  for i in range(0, n_samples, params.chunk_size):
790
1213
  chunk_end = min(i + params.chunk_size, n_samples)
791
- chunks.append((i, self.features_maps[i:chunk_end]))
1214
+ chunks.append((i, temp_feature_maps[i:chunk_end]))
792
1215
 
793
- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
1216
+ self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
794
1217
 
795
1218
  # Process each chunk to create chunk consensus maps
796
1219
  chunk_consensus_maps = []
797
1220
 
798
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
799
- chunk_consensus_map = oms.ConsensusMap()
800
-
801
- # Set up file descriptions for chunk
802
- file_descriptions = chunk_consensus_map.getColumnHeaders()
803
- for j, feature_map in enumerate(chunk_maps):
804
- file_description = file_descriptions.get(j, oms.ColumnHeader())
805
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
806
- file_description.size = feature_map.size()
807
- file_description.unique_id = feature_map.getUniqueId()
808
- file_descriptions[j] = file_description
809
-
810
- chunk_consensus_map.setColumnHeaders(file_descriptions)
811
-
812
- # Use KD algorithm for chunk
813
- grouper = oms.FeatureGroupingAlgorithmKD()
814
- chunk_params = grouper.getParameters()
815
- chunk_params.setValue("mz_unit", "Da")
816
- chunk_params.setValue("nr_partitions", params.nr_partitions)
817
- chunk_params.setValue("warp:enabled", "true")
818
- chunk_params.setValue("warp:rt_tol", params.rt_tol)
819
- chunk_params.setValue("warp:mz_tol", params.mz_tol)
820
- chunk_params.setValue("link:rt_tol", params.rt_tol)
821
- chunk_params.setValue("link:mz_tol", params.mz_tol)
822
- chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
823
- chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
824
- chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
825
-
826
- grouper.setParameters(chunk_params)
827
- grouper.group(chunk_maps, chunk_consensus_map)
828
-
829
- chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
830
-
831
- # Merge chunk results with proper cross-chunk consensus building
1221
+ if params.threads is None:
1222
+ # Sequential processing (original behavior)
1223
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
1224
+ chunk_consensus_map = oms.ConsensusMap()
1225
+
1226
+ # Set up file descriptions for chunk
1227
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
1228
+ for j, feature_map in enumerate(chunk_maps):
1229
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
1230
+ file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1231
+ file_description.size = feature_map.size()
1232
+ file_description.unique_id = feature_map.getUniqueId()
1233
+ file_descriptions[j] = file_description
1234
+
1235
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
1236
+
1237
+ # Use KD algorithm for chunk
1238
+ grouper = oms.FeatureGroupingAlgorithmKD()
1239
+ chunk_params = grouper.getParameters()
1240
+ chunk_params.setValue("mz_unit", "Da")
1241
+ chunk_params.setValue("nr_partitions", params.nr_partitions)
1242
+ chunk_params.setValue("warp:enabled", "true")
1243
+ chunk_params.setValue("warp:rt_tol", params.rt_tol)
1244
+ chunk_params.setValue("warp:mz_tol", params.mz_tol)
1245
+ chunk_params.setValue("link:rt_tol", params.rt_tol)
1246
+ chunk_params.setValue("link:mz_tol", params.mz_tol)
1247
+ chunk_params.setValue("link:min_rel_cc_size", params.min_rel_cc_size)
1248
+ chunk_params.setValue("link:max_pairwise_log_fc", params.max_pairwise_log_fc)
1249
+ chunk_params.setValue("link:max_nr_conflicts", params.max_nr_conflicts)
1250
+
1251
+ grouper.setParameters(chunk_params)
1252
+ grouper.group(chunk_maps, chunk_consensus_map)
1253
+
1254
+ chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
1255
+
1256
+ else:
1257
+ # Parallel processing
1258
+ self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1259
+
1260
+ # Prepare chunk data for parallel processing using features_df slices
1261
+ chunk_data_list = []
1262
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
1263
+ # Get the sample UIDs for this chunk
1264
+ chunk_sample_uids = []
1265
+ chunk_samples_df_rows = []
1266
+ for j in range(len(chunk_maps)):
1267
+ sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
1268
+ chunk_sample_uids.append(sample_row['sample_uid'])
1269
+ chunk_samples_df_rows.append(sample_row)
1270
+
1271
+ # Create a DataFrame for this chunk's samples
1272
+ chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
1273
+
1274
+ # Filter features_df for this chunk's samples and select only necessary columns
1275
+ chunk_features_df = self.features_df.filter(
1276
+ pl.col('sample_uid').is_in(chunk_sample_uids)
1277
+ ).select([
1278
+ 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
1279
+ ])
1280
+
1281
+ # Convert DataFrames to serializable format (lists of dicts)
1282
+ chunk_features_data = chunk_features_df.to_dicts()
1283
+ chunk_samples_data = chunk_samples_df.to_dicts()
1284
+
1285
+ chunk_data = {
1286
+ 'chunk_start_idx': chunk_start_idx,
1287
+ 'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
1288
+ 'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
1289
+ 'params': {
1290
+ 'nr_partitions': params.nr_partitions,
1291
+ 'rt_tol': params.rt_tol,
1292
+ 'mz_tol': params.mz_tol,
1293
+ 'min_rel_cc_size': params.min_rel_cc_size,
1294
+ 'max_pairwise_log_fc': params.max_pairwise_log_fc,
1295
+ 'max_nr_conflicts': params.max_nr_conflicts
1296
+ }
1297
+ }
1298
+ chunk_data_list.append(chunk_data)
1299
+
1300
+ # Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
1301
+ try:
1302
+ with ProcessPoolExecutor(max_workers=params.threads) as executor:
1303
+ # Submit all chunk processing tasks
1304
+ future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
1305
+ for i, chunk_data in enumerate(chunk_data_list)}
1306
+
1307
+ # Collect results with progress tracking
1308
+ completed_chunks = 0
1309
+ total_chunks = len(chunk_data_list)
1310
+ serialized_chunk_results = []
1311
+
1312
+ for future in as_completed(future_to_chunk):
1313
+ chunk_idx = future_to_chunk[future]
1314
+ try:
1315
+ chunk_start_idx, consensus_features = future.result()
1316
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1317
+ completed_chunks += 1
1318
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1319
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1320
+ except Exception as exc:
1321
+ # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1322
+ if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1323
+ # Convert to RuntimeError so outer except block can catch it for fallback
1324
+ raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1325
+ else:
1326
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1327
+ raise exc
1328
+
1329
+ except (RuntimeError, OSError, BrokenProcessPool) as e:
1330
+ # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1331
+ if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1332
+ "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1333
+ self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1334
+ self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1335
+
1336
+ with ThreadPoolExecutor(max_workers=params.threads) as executor:
1337
+ # Submit all chunk processing tasks
1338
+ future_to_chunk = {executor.submit(_process_kd_chunk_parallel, chunk_data): i
1339
+ for i, chunk_data in enumerate(chunk_data_list)}
1340
+
1341
+ # Collect results with progress tracking
1342
+ completed_chunks = 0
1343
+ total_chunks = len(chunk_data_list)
1344
+ serialized_chunk_results = []
1345
+
1346
+ for future in as_completed(future_to_chunk):
1347
+ chunk_idx = future_to_chunk[future]
1348
+ try:
1349
+ chunk_start_idx, consensus_features = future.result()
1350
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1351
+ completed_chunks += 1
1352
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1353
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1354
+ except Exception as exc:
1355
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1356
+ raise exc
1357
+ else:
1358
+ # Re-raise other exceptions
1359
+ raise
1360
+
1361
+ # Store serialized results for _merge_chunk_results to handle directly
1362
+ chunk_consensus_maps = []
1363
+ for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
1364
+ # Store serialized data directly for _merge_chunk_results to handle
1365
+ chunk_consensus_maps.append((chunk_start_idx, consensus_features))
1366
+
1367
+ # Merge chunk results with proper cross-chunk consensus building
1368
+ # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
832
1369
  _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
833
1370
 
834
- # Create a dummy consensus map for compatibility (since other functions expect it)
1371
+ # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
835
1372
  consensus_map = oms.ConsensusMap()
836
1373
  return consensus_map
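The try/except structure above (and its twin in the QT variant below) boils down to "try a process pool, fall back to a thread pool if the platform cannot spawn workers". A stripped-down, self-contained sketch of that pattern; the worker here is a stand-in, not a masster function:

    from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
    from concurrent.futures.process import BrokenProcessPool

    def _square(x):          # stand-in for a chunk worker
        return x * x

    def run_chunks(items, workers=2):
        def _run(executor_cls):
            results = {}
            with executor_cls(max_workers=workers) as pool:
                futures = {pool.submit(_square, item): item for item in items}
                for fut in as_completed(futures):
                    results[futures[fut]] = fut.result()
            return results

        try:
            return _run(ProcessPoolExecutor)
        except (BrokenProcessPool, OSError, RuntimeError):
            # e.g. frozen Windows builds without freeze_support(); threads still work
            return _run(ThreadPoolExecutor)

    if __name__ == "__main__":   # guard required for spawn-based process pools
        print(run_chunks([1, 2, 3]))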
837
1374
 
838
1375
 
839
1376
  def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
840
- """QT-based chunked merge with proper cross-chunk consensus building"""
1377
+ """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
841
1378
 
842
- n_samples = len(self.features_maps)
1379
+ # Generate temporary feature maps on-demand from features_df
1380
+ temp_feature_maps = _generate_feature_maps_on_demand(self)
1381
+
1382
+ n_samples = len(temp_feature_maps)
843
1383
  if n_samples <= params.chunk_size:
844
1384
  self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
845
1385
  consensus_map = _merge_qt(self, params)
@@ -851,45 +1391,159 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
851
1391
  chunks = []
852
1392
  for i in range(0, n_samples, params.chunk_size):
853
1393
  chunk_end = min(i + params.chunk_size, n_samples)
854
- chunks.append((i, self.features_maps[i:chunk_end]))
1394
+ chunks.append((i, temp_feature_maps[i:chunk_end]))
855
1395
 
856
- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples")
1396
+ self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
857
1397
 
858
1398
  # Process each chunk to create chunk consensus maps
859
1399
  chunk_consensus_maps = []
860
1400
 
861
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc="Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
862
- chunk_consensus_map = oms.ConsensusMap()
863
-
864
- # Set up file descriptions for chunk
865
- file_descriptions = chunk_consensus_map.getColumnHeaders()
866
- for j, feature_map in enumerate(chunk_maps):
867
- file_description = file_descriptions.get(j, oms.ColumnHeader())
868
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
869
- file_description.size = feature_map.size()
870
- file_description.unique_id = feature_map.getUniqueId()
871
- file_descriptions[j] = file_description
1401
+ if params.threads is None:
1402
+ # Sequential processing (original behavior)
1403
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
1404
+ chunk_consensus_map = oms.ConsensusMap()
1405
+
1406
+ # Set up file descriptions for chunk
1407
+ file_descriptions = chunk_consensus_map.getColumnHeaders()
1408
+ for j, feature_map in enumerate(chunk_maps):
1409
+ file_description = file_descriptions.get(j, oms.ColumnHeader())
1410
+ file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1411
+ file_description.size = feature_map.size()
1412
+ file_description.unique_id = feature_map.getUniqueId()
1413
+ file_descriptions[j] = file_description
1414
+
1415
+ chunk_consensus_map.setColumnHeaders(file_descriptions)
1416
+
1417
+ # Use QT algorithm for chunk (main difference from KD chunked)
1418
+ grouper = oms.FeatureGroupingAlgorithmQT()
1419
+ chunk_params = grouper.getParameters()
1420
+ chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
1421
+ chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
1422
+ chunk_params.setValue("distance_MZ:unit", "Da")
1423
+ chunk_params.setValue("ignore_charge", "true")
1424
+ chunk_params.setValue("nr_partitions", params.nr_partitions)
1425
+
1426
+ grouper.setParameters(chunk_params)
1427
+ grouper.group(chunk_maps, chunk_consensus_map)
1428
+
1429
+ chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
1430
+
1431
+ else:
1432
+ # Parallel processing
1433
+ self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
872
1434
 
873
- chunk_consensus_map.setColumnHeaders(file_descriptions)
1435
+ # Prepare chunk data for parallel processing using features_df slices
1436
+ chunk_data_list = []
1437
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(chunks):
1438
+ # Get the sample UIDs for this chunk
1439
+ chunk_sample_uids = []
1440
+ chunk_samples_df_rows = []
1441
+ for j in range(len(chunk_maps)):
1442
+ sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
1443
+ chunk_sample_uids.append(sample_row['sample_uid'])
1444
+ chunk_samples_df_rows.append(sample_row)
1445
+
1446
+ # Create a DataFrame for this chunk's samples
1447
+ chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
1448
+
1449
+ # Filter features_df for this chunk's samples and select only necessary columns
1450
+ chunk_features_df = self.features_df.filter(
1451
+ pl.col('sample_uid').is_in(chunk_sample_uids)
1452
+ ).select([
1453
+ 'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
1454
+ ])
1455
+
1456
+ # Convert DataFrames to serializable format (lists of dicts)
1457
+ chunk_features_data = chunk_features_df.to_dicts()
1458
+ chunk_samples_data = chunk_samples_df.to_dicts()
1459
+
1460
+ chunk_data = {
1461
+ 'chunk_start_idx': chunk_start_idx,
1462
+ 'chunk_features_data': chunk_features_data, # List of dicts instead of DataFrame
1463
+ 'chunk_samples_data': chunk_samples_data, # List of dicts instead of DataFrame
1464
+ 'params': {
1465
+ 'nr_partitions': params.nr_partitions,
1466
+ 'rt_tol': params.rt_tol,
1467
+ 'mz_tol': params.mz_tol,
1468
+ }
1469
+ }
1470
+ chunk_data_list.append(chunk_data)
874
1471
 
875
- # Use QT algorithm for chunk (main difference from KD chunked)
876
- grouper = oms.FeatureGroupingAlgorithmQT()
877
- chunk_params = grouper.getParameters()
878
- chunk_params.setValue("distance_RT:max_difference", params.rt_tol)
879
- chunk_params.setValue("distance_MZ:max_difference", params.mz_tol)
880
- chunk_params.setValue("distance_MZ:unit", "Da")
881
- chunk_params.setValue("ignore_charge", "true")
882
- chunk_params.setValue("nr_partitions", params.nr_partitions)
1472
+ # Process chunks in parallel - try ProcessPoolExecutor first, fallback to ThreadPoolExecutor on Windows
1473
+ executor_class = ProcessPoolExecutor
1474
+ executor_name = "processes"
883
1475
 
884
- grouper.setParameters(chunk_params)
885
- grouper.group(chunk_maps, chunk_consensus_map)
1476
+ try:
1477
+ with ProcessPoolExecutor(max_workers=params.threads) as executor:
1478
+ # Submit all chunk processing tasks
1479
+ future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
1480
+ for i, chunk_data in enumerate(chunk_data_list)}
1481
+
1482
+ # Collect results with progress tracking
1483
+ completed_chunks = 0
1484
+ total_chunks = len(chunk_data_list)
1485
+ serialized_chunk_results = []
1486
+
1487
+ for future in as_completed(future_to_chunk):
1488
+ chunk_idx = future_to_chunk[future]
1489
+ try:
1490
+ chunk_start_idx, consensus_features = future.result()
1491
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1492
+ completed_chunks += 1
1493
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1494
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1495
+ except Exception as exc:
1496
+ # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1497
+ if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1498
+ # Convert to RuntimeError so outer except block can catch it for fallback
1499
+ raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1500
+ else:
1501
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1502
+ raise exc
1503
+
1504
+ except (RuntimeError, OSError, BrokenProcessPool) as e:
1505
+ # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1506
+ if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1507
+ "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1508
+ self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1509
+ self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1510
+
1511
+ with ThreadPoolExecutor(max_workers=params.threads) as executor:
1512
+ # Submit all chunk processing tasks
1513
+ future_to_chunk = {executor.submit(_process_qt_chunk_parallel, chunk_data): i
1514
+ for i, chunk_data in enumerate(chunk_data_list)}
1515
+
1516
+ # Collect results with progress tracking
1517
+ completed_chunks = 0
1518
+ total_chunks = len(chunk_data_list)
1519
+ serialized_chunk_results = []
1520
+
1521
+ for future in as_completed(future_to_chunk):
1522
+ chunk_idx = future_to_chunk[future]
1523
+ try:
1524
+ chunk_start_idx, consensus_features = future.result()
1525
+ serialized_chunk_results.append((chunk_start_idx, consensus_features))
1526
+ completed_chunks += 1
1527
+ n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1528
+ self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1529
+ except Exception as exc:
1530
+ self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1531
+ raise exc
1532
+ else:
1533
+ # Re-raise other exceptions
1534
+ raise
886
1535
 
887
- chunk_consensus_maps.append((chunk_start_idx, chunk_consensus_map))
888
-
889
- # Merge chunk results with proper cross-chunk consensus building
1536
+ # Store serialized results for _merge_chunk_results to handle directly
1537
+ chunk_consensus_maps = []
1538
+ for chunk_start_idx, consensus_features in sorted(serialized_chunk_results):
1539
+ # Store serialized data directly for _merge_chunk_results to handle
1540
+ chunk_consensus_maps.append((chunk_start_idx, consensus_features))
1541
+
1542
+ # Merge chunk results with proper cross-chunk consensus building
1543
+ # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
890
1544
  _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
891
1545
 
892
- # Create a dummy consensus map for compatibility (since other functions expect it)
1546
+ # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
893
1547
  consensus_map = oms.ConsensusMap()
894
1548
  return consensus_map
895
1549
 
@@ -927,61 +1581,128 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
927
1581
  all_chunk_consensus = []
928
1582
  consensus_id_counter = 0
929
1583
 
930
- for chunk_idx, (chunk_start_idx, chunk_consensus_map) in enumerate(chunk_consensus_maps):
931
- for consensus_feature in chunk_consensus_map:
1584
+ for chunk_idx, (chunk_start_idx, chunk_data) in enumerate(chunk_consensus_maps):
1585
+ # Handle both ConsensusMap objects (sequential) and serialized data (parallel)
1586
+ if isinstance(chunk_data, list):
1587
+ # Parallel processing: chunk_data is a list of serialized consensus feature dictionaries
1588
+ consensus_features_data = chunk_data
1589
+ else:
1590
+ # Sequential processing: chunk_data is a ConsensusMap object
1591
+ chunk_consensus_map = chunk_data
1592
+ consensus_features_data = []
1593
+
1594
+ # Extract data from ConsensusMap and convert to serialized format
1595
+ for consensus_feature in chunk_consensus_map:
1596
+ # Extract feature_uids from this consensus feature
1597
+ feature_uids = []
1598
+ feature_data_list = []
1599
+ sample_uids = []
1600
+
1601
+ for feature_handle in consensus_feature.getFeatureList():
1602
+ fuid = str(feature_handle.getUniqueId())
1603
+ if fuid not in feature_uid_map:
1604
+ continue
1605
+
1606
+ feature_uid = feature_uid_map[fuid]
1607
+ feature_data = features_lookup.get(feature_uid)
1608
+ if feature_data:
1609
+ feature_uids.append(feature_uid)
1610
+ feature_data_list.append(feature_data)
1611
+ sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
1612
+
1613
+ if not feature_data_list:
1614
+ # No retrievable feature metadata (possible stale map reference) -> skip
1615
+ continue
1616
+
1617
+ # Convert ConsensusFeature to serialized format
1618
+ consensus_feature_data = {
1619
+ 'rt': consensus_feature.getRT(),
1620
+ 'mz': consensus_feature.getMZ(),
1621
+ 'intensity': consensus_feature.getIntensity(),
1622
+ 'quality': consensus_feature.getQuality(),
1623
+ 'feature_uids': feature_uids,
1624
+ 'feature_data_list': feature_data_list,
1625
+ 'sample_uids': sample_uids
1626
+ }
1627
+ consensus_features_data.append(consensus_feature_data)
1628
+
1629
+ # Process the consensus features (now all in serialized format)
1630
+ for consensus_feature_data in consensus_features_data:
932
1631
  # ACCEPT ALL consensus features (size >=1) here.
933
1632
  # Reason: A feature that is globally present in many samples can still
934
1633
  # appear only once inside a given sample chunk. Early filtering at
935
1634
  # size>=2 causes irreversible loss and underestimates the final
936
1635
  # consensus count (observed ~296 vs 950 for KD). We defer filtering
937
1636
  # strictly to the final global min_samples.
938
-
939
- # Extract feature_uids from this consensus feature
940
- feature_uids = []
941
- feature_data_list = []
942
- sample_uids = []
943
1637
 
944
- for feature_handle in consensus_feature.getFeatureList():
945
- fuid = str(feature_handle.getUniqueId())
946
- if fuid not in feature_uid_map:
1638
+ # For parallel processing, feature data is already extracted
1639
+ if isinstance(chunk_data, list):
1640
+ # Extract feature_uids and data from serialized format for parallel processing
1641
+ feature_uids = []
1642
+ feature_data_list = []
1643
+ sample_uids = []
1644
+
1645
+ for handle_data in consensus_feature_data['features']:
1646
+ fuid = str(handle_data['unique_id'])
1647
+ if fuid not in feature_uid_map:
1648
+ continue
1649
+
1650
+ feature_uid = feature_uid_map[fuid]
1651
+ feature_data = features_lookup.get(feature_uid)
1652
+ if feature_data:
1653
+ feature_uids.append(feature_uid)
1654
+ feature_data_list.append(feature_data)
1655
+ sample_uids.append(chunk_start_idx + handle_data['map_index'] + 1)
1656
+
1657
+ if not feature_data_list:
947
1658
  continue
948
1659
 
949
- feature_uid = feature_uid_map[fuid]
950
- feature_data = features_lookup.get(feature_uid)
951
- if feature_data:
952
- feature_uids.append(feature_uid)
953
- feature_data_list.append(feature_data)
954
- sample_uids.append(chunk_start_idx + feature_handle.getMapIndex() + 1)
1660
+ # Get RT/MZ from consensus feature data
1661
+ consensus_rt = consensus_feature_data['rt']
1662
+ consensus_mz = consensus_feature_data['mz']
1663
+ consensus_intensity = consensus_feature_data['intensity']
1664
+ consensus_quality = consensus_feature_data['quality']
1665
+ else:
1666
+ # Sequential processing: data is already extracted above
1667
+ feature_uids = consensus_feature_data['feature_uids']
1668
+ feature_data_list = consensus_feature_data['feature_data_list']
1669
+ sample_uids = consensus_feature_data['sample_uids']
1670
+ consensus_rt = consensus_feature_data['rt']
1671
+ consensus_mz = consensus_feature_data['mz']
1672
+ consensus_intensity = consensus_feature_data['intensity']
1673
+ consensus_quality = consensus_feature_data['quality']
955
1674
 
956
1675
  if not feature_data_list:
957
1676
  # No retrievable feature metadata (possible stale map reference) -> skip
958
- continue # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
1677
+ continue
1678
+
1679
+ # Derive RT / m/z ranges from underlying features (used for robust cross-chunk stitching)
959
1680
  rt_vals_local = [fd.get("rt") for fd in feature_data_list if fd.get("rt") is not None]
960
1681
  mz_vals_local = [fd.get("mz") for fd in feature_data_list if fd.get("mz") is not None]
961
1682
  if rt_vals_local:
962
1683
  rt_min_local = min(rt_vals_local)
963
1684
  rt_max_local = max(rt_vals_local)
964
1685
  else:
965
- rt_min_local = rt_max_local = consensus_feature.getRT()
1686
+ rt_min_local = rt_max_local = consensus_rt
966
1687
  if mz_vals_local:
967
1688
  mz_min_local = min(mz_vals_local)
968
1689
  mz_max_local = max(mz_vals_local)
969
1690
  else:
970
- mz_min_local = mz_max_local = consensus_feature.getMZ()
1691
+ mz_min_local = mz_max_local = consensus_mz
971
1692
 
972
1693
  # Store chunk consensus with feature tracking
973
1694
  chunk_consensus_data = {
974
1695
  'consensus_id': consensus_id_counter,
975
1696
  'chunk_idx': chunk_idx,
976
1697
  'chunk_start_idx': chunk_start_idx,
977
- 'mz': consensus_feature.getMZ(),
978
- 'rt': consensus_feature.getRT(),
1698
+ 'mz': consensus_mz,
1699
+ 'rt': consensus_rt,
979
1700
  'mz_min': mz_min_local,
980
1701
  'mz_max': mz_max_local,
981
1702
  'rt_min': rt_min_local,
982
1703
  'rt_max': rt_max_local,
983
- 'intensity': consensus_feature.getIntensity(),
984
- 'quality': consensus_feature.getQuality(),
1704
+ 'intensity': consensus_intensity,
1705
+ 'quality': consensus_quality,
985
1706
  'feature_uids': feature_uids,
986
1707
  'feature_data_list': feature_data_list,
987
1708
  'sample_uids': sample_uids,
@@ -1479,9 +2200,6 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
1479
2200
  return list(groups_by_root.values())
1480
2201
 
1481
2202
 
1482
- # Note: Restored proper chunked implementation with cross-chunk consensus clustering
1483
-
1484
-
1485
2203
  def _reset_consensus_data(self):
1486
2204
  """Reset consensus-related DataFrames at the start of merge."""
1487
2205
  self.consensus_df = pl.DataFrame()
@@ -1960,6 +2678,595 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
1960
2678
  )
1961
2679
 
1962
2680
 
2681
+ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2682
+ """
2683
+ Count consensus features grouped in tight clusters.
2684
+
2685
+ Args:
2686
+ mz_tol: m/z tolerance in Daltons for cluster detection
2687
+ rt_tol: RT tolerance in seconds for cluster detection
2688
+
2689
+ Returns:
2690
+ Number of tight clusters found
2691
+ """
2692
+ if len(self.consensus_df) < 2:
2693
+ return 0
2694
+
2695
+ # Extract consensus feature data
2696
+ consensus_data = []
2697
+ for row in self.consensus_df.iter_rows(named=True):
2698
+ consensus_data.append({
2699
+ 'consensus_uid': row['consensus_uid'],
2700
+ 'mz': row['mz'],
2701
+ 'rt': row['rt']
2702
+ })
2703
+
2704
+ # Build spatial index using bins
2705
+ rt_bin_size = rt_tol / 2
2706
+ mz_bin_size = mz_tol / 2
2707
+
2708
+ bins = defaultdict(list)
2709
+ for feature in consensus_data:
2710
+ rt_bin = int(feature['rt'] / rt_bin_size)
2711
+ mz_bin = int(feature['mz'] / mz_bin_size)
2712
+ bins[(rt_bin, mz_bin)].append(feature)
2713
+
2714
+ processed_features = set()
2715
+ tight_clusters_count = 0
2716
+
2717
+ for bin_key, bin_features in bins.items():
2718
+ if len(bin_features) < 2:
2719
+ continue
2720
+
2721
+ # Check neighboring bins for additional features
2722
+ rt_bin, mz_bin = bin_key
2723
+ all_nearby_features = list(bin_features)
2724
+
2725
+ # Check 8 neighboring bins
2726
+ for drt in [-1, 0, 1]:
2727
+ for dmz in [-1, 0, 1]:
2728
+ if drt == 0 and dmz == 0:
2729
+ continue
2730
+ neighbor_key = (rt_bin + drt, mz_bin + dmz)
2731
+ if neighbor_key in bins:
2732
+ all_nearby_features.extend(bins[neighbor_key])
2733
+
2734
+ # Filter to features within actual tolerances and not yet processed
2735
+ valid_cluster_features = []
2736
+ for feature in all_nearby_features:
2737
+ if feature['consensus_uid'] in processed_features:
2738
+ continue
2739
+
2740
+ # Check if this feature is within tolerances of any bin feature
2741
+ for bin_feature in bin_features:
2742
+ rt_diff = abs(feature['rt'] - bin_feature['rt'])
2743
+ mz_diff = abs(feature['mz'] - bin_feature['mz'])
2744
+
2745
+ if rt_diff <= rt_tol and mz_diff <= mz_tol:
2746
+ valid_cluster_features.append(feature)
2747
+ break
2748
+
2749
+ # Count as tight cluster if we have multiple features
2750
+ if len(valid_cluster_features) >= 2:
2751
+ tight_clusters_count += 1
2752
+ for feature in valid_cluster_features:
2753
+ processed_features.add(feature['consensus_uid'])
2754
+
2755
+ return tight_clusters_count
2756
+
2757
+
2758
+ def _consensus_cleanup(self, rt_tol, mz_tol):
2759
+ """
2760
+ Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
2761
+
2762
+ This function:
2763
+ 1. Identifies and merges consensus features that are likely over-segmented
2764
+ (too many features in very tight m/z and RT windows)
2765
+ 2. Performs deisotoping to remove +1 and +2 isotopic features
2766
+ """
2767
+ if len(self.consensus_df) == 0:
2768
+ return
2769
+
2770
+ initial_count = len(self.consensus_df)
2771
+
2772
+ # Only perform enhanced post-clustering if there are many features
2773
+ if initial_count < 50:
2774
+ return
2775
+
2776
+ self.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2777
+
2778
+ # Find tight clusters using spatial binning
2779
+ consensus_data = []
2780
+ for row in self.consensus_df.iter_rows(named=True):
2781
+ consensus_data.append({
2782
+ 'consensus_uid': row['consensus_uid'],
2783
+ 'mz': row['mz'],
2784
+ 'rt': row['rt'],
2785
+ 'inty_mean': row.get('inty_mean', 0),
2786
+ 'number_samples': row.get('number_samples', 0)
2787
+ })
2788
+
2789
+ # Parameters for tight clustering detection - more lenient for effective merging
2790
+ tight_rt_tol = min(0.5, rt_tol * 0.5) # More lenient RT tolerance (max 0.5s)
2791
+ tight_mz_tol = min(0.05, max(0.03, mz_tol * 2.0)) # More lenient m/z tolerance (min 30 mDa, max 50 mDa)
2792
+
2793
+ # Build spatial index using smaller RT and m/z bins for better coverage
2794
+ rt_bin_size = tight_rt_tol / 4 # Smaller bins to ensure nearby features are captured
2795
+ mz_bin_size = tight_mz_tol / 4 # Smaller bins to ensure nearby features are captured
2796
+
2797
+ bins = defaultdict(list)
2798
+ for feature in consensus_data:
2799
+ rt_bin = int(feature['rt'] / rt_bin_size)
2800
+ mz_bin = int(feature['mz'] / mz_bin_size)
2801
+ bins[(rt_bin, mz_bin)].append(feature)
2802
+
2803
+ # Find clusters that need merging
2804
+ merge_groups = []
2805
+ processed_uids = set()
2806
+
2807
+ for bin_key, bin_features in bins.items():
2808
+ # Check current bin and extended neighboring bins for complete cluster
2809
+ rt_bin, mz_bin = bin_key
2810
+ cluster_features = list(bin_features)
2811
+
2812
+ # Check a larger neighborhood (±2 bins) so that nearby features falling into adjacent cells are also considered
2813
+ for dr in [-2, -1, 0, 1, 2]:
2814
+ for dm in [-2, -1, 0, 1, 2]:
2815
+ if dr == 0 and dm == 0:
2816
+ continue
2817
+ neighbor_key = (rt_bin + dr, mz_bin + dm)
2818
+ if neighbor_key in bins:
2819
+ cluster_features.extend(bins[neighbor_key])
2820
+
2821
+ # Remove duplicates
2822
+ seen_uids = set()
2823
+ unique_features = []
2824
+ for f in cluster_features:
2825
+ if f['consensus_uid'] not in seen_uids:
2826
+ unique_features.append(f)
2827
+ seen_uids.add(f['consensus_uid'])
2828
+
2829
+ # Only proceed if we have at least 2 features after including neighbors
2830
+ if len(unique_features) < 2:
2831
+ continue
2832
+
2833
+ # Calculate cluster bounds
2834
+ mzs = [f['mz'] for f in unique_features]
2835
+ rts = [f['rt'] for f in unique_features]
2836
+
2837
+ mz_spread = max(mzs) - min(mzs)
2838
+ rt_spread = max(rts) - min(rts)
2839
+
2840
+ # Only merge if features are tightly clustered
2841
+ if mz_spread <= tight_mz_tol and rt_spread <= tight_rt_tol:
2842
+ # Filter out features that were already processed
2843
+ uids_in_cluster = {f['consensus_uid'] for f in unique_features}
2844
+ unprocessed_features = [f for f in unique_features if f['consensus_uid'] not in processed_uids]
2845
+
2846
+ # Only proceed if we have at least 2 unprocessed features that still form a tight cluster
2847
+ if len(unprocessed_features) >= 2:
2848
+ # Recalculate bounds for unprocessed features only
2849
+ unprocessed_mzs = [f['mz'] for f in unprocessed_features]
2850
+ unprocessed_rts = [f['rt'] for f in unprocessed_features]
2851
+
2852
+ unprocessed_mz_spread = max(unprocessed_mzs) - min(unprocessed_mzs)
2853
+ unprocessed_rt_spread = max(unprocessed_rts) - min(unprocessed_rts)
2854
+
2855
+ # Check if unprocessed features still meet tight clustering criteria
2856
+ if unprocessed_mz_spread <= tight_mz_tol and unprocessed_rt_spread <= tight_rt_tol:
2857
+ merge_groups.append(unprocessed_features)
2858
+ processed_uids.update({f['consensus_uid'] for f in unprocessed_features})
2859
+
2860
+ if not merge_groups:
2861
+ return
2862
+
2863
+ self.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2864
+
2865
+ # Merge clusters by keeping the most representative feature
2866
+ uids_to_remove = set()
2867
+
2868
+ for group in merge_groups:
2869
+ if len(group) < 2:
2870
+ continue
2871
+
2872
+ # Find the most representative feature (highest intensity and sample count)
2873
+ best_feature = max(group, key=lambda x: (x['number_samples'], x['inty_mean']))
2874
+
2875
+ # Mark other features for removal
2876
+ for f in group:
2877
+ if f['consensus_uid'] != best_feature['consensus_uid']:
2878
+ uids_to_remove.add(f['consensus_uid'])
2879
+
2880
+ if uids_to_remove:
2881
+ # Remove merged features from consensus_df
2882
+ self.consensus_df = self.consensus_df.filter(
2883
+ ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2884
+ )
2885
+
2886
+ # Also update consensus_mapping_df if it exists
2887
+ if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2888
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
2889
+ ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2890
+ )
2891
+
2892
+ final_count = len(self.consensus_df)
2893
+ reduction = initial_count - final_count
2894
+ reduction_pct = (reduction / initial_count) * 100
2895
+
2896
+ if reduction > 0:
2897
+ self.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
2898
+
2899
+ # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
2900
+ pre_deisotoping_count = len(self.consensus_df)
2901
+ isotope_uids_to_remove = set()
2902
+
2903
+ # Use strict tolerances for deisotoping (same as declustering)
2904
+ deisotope_rt_tol = min(0.3, rt_tol * 0.3) # Strict RT tolerance for isotope detection
2905
+ deisotope_mz_tol = min(0.01, mz_tol * 0.5) # Strict m/z tolerance for isotope detection
2906
+
2907
+ # Get current consensus data for isotope detection
2908
+ current_consensus_data = []
2909
+ for row in self.consensus_df.iter_rows(named=True):
2910
+ current_consensus_data.append({
2911
+ 'consensus_uid': row['consensus_uid'],
2912
+ 'mz': row['mz'],
2913
+ 'rt': row['rt'],
2914
+ 'number_samples': row.get('number_samples', 0)
2915
+ })
2916
+
2917
+ # Sort by m/z for efficient searching
2918
+ current_consensus_data.sort(key=lambda x: x['mz'])
2919
+ n_current = len(current_consensus_data)
2920
+
2921
+ for i in range(n_current):
2922
+ feature_i = current_consensus_data[i]
2923
+
2924
+ # Skip if already marked for removal
2925
+ if feature_i['consensus_uid'] in isotope_uids_to_remove:
2926
+ continue
2927
+
2928
+ # Look for potential +1 and +2 isotopes (higher m/z)
2929
+ for j in range(i + 1, n_current):
2930
+ feature_j = current_consensus_data[j]
2931
+
2932
+ # Skip if already marked for removal
2933
+ if feature_j['consensus_uid'] in isotope_uids_to_remove:
2934
+ continue
2935
+
2936
+ mz_diff = feature_j['mz'] - feature_i['mz']
2937
+
2938
+ # Break if m/z difference is too large (features are sorted by m/z)
2939
+ if mz_diff > 2.1: # Beyond +2 isotope range
2940
+ break
2941
+
2942
+ rt_diff = abs(feature_j['rt'] - feature_i['rt'])
2943
+
2944
+ # Check for +1 isotope (C13 mass difference ≈ 1.003354 Da)
2945
+ if (0.995 <= mz_diff <= 1.011) and rt_diff <= deisotope_rt_tol:
2946
+ # Potential +1 isotope - should have fewer samples than main feature
2947
+ if feature_j['number_samples'] < feature_i['number_samples']:
2948
+ isotope_uids_to_remove.add(feature_j['consensus_uid'])
2949
+ continue
2950
+
2951
+ # Check for +2 isotope (2 * C13 mass difference ≈ 2.006708 Da)
2952
+ if (1.995 <= mz_diff <= 2.018) and rt_diff <= deisotope_rt_tol:
2953
+ # Potential +2 isotope - should have fewer samples than main feature
2954
+ if feature_j['number_samples'] < feature_i['number_samples']:
2955
+ isotope_uids_to_remove.add(feature_j['consensus_uid'])
2956
+ continue
2957
+
2958
+ # Remove isotopic features
2959
+ if isotope_uids_to_remove:
2960
+ self.consensus_df = self.consensus_df.filter(
2961
+ ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2962
+ )
2963
+
2964
+ # Also update consensus_mapping_df if it exists
2965
+ if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2966
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
2967
+ ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2968
+ )
2969
+
2970
+ post_deisotoping_count = len(self.consensus_df)
2971
+ isotope_reduction = pre_deisotoping_count - post_deisotoping_count
2972
+
2973
+ if isotope_reduction > 0:
2974
+ self.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
2975
+
2976
+ # Final summary
2977
+ final_count = len(self.consensus_df)
2978
+ total_reduction = initial_count - final_count
2979
+ if total_reduction > 0:
2980
+ total_reduction_pct = (total_reduction / initial_count) * 100
2981
+ self.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
2982
+
2983
+
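The deisotoping pass above reduces to a pairwise test on coeluting consensus features: a heavier feature is discarded as a 13C isotope peak when its m/z lies roughly +1.00336 Da or +2.00671 Da above a lighter feature (the 0.995-1.011 and 1.995-2.018 Da windows used above, which assume singly charged ions) and it is observed in fewer samples. A stand-alone sketch of that predicate, reusing the same dictionary fields as the code above but otherwise hypothetical:

    def is_isotope_of(light, heavy, rt_tol=0.3):
        # True if `heavy` looks like the +1 or +2 13C isotope peak of `light`
        # (windows assume charge 1; 13C - 12C is ~1.003355 Da).
        mz_diff = heavy["mz"] - light["mz"]
        coeluting = abs(heavy["rt"] - light["rt"]) <= rt_tol
        fewer_samples = heavy["number_samples"] < light["number_samples"]
        plus1 = 0.995 <= mz_diff <= 1.011
        plus2 = 1.995 <= mz_diff <= 2.018
        return coeluting and fewer_samples and (plus1 or plus2)

    mono = {"mz": 301.1408, "rt": 125.4, "number_samples": 42}
    iso1 = {"mz": 302.1441, "rt": 125.5, "number_samples": 17}
    print(is_isotope_of(mono, iso1))  # True: delta m/z ~1.0033, coeluting, fewer samples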
2984
+ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
2985
+ """
2986
+ Identify coeluting consensus features by characteristic mass shifts between adducts
2987
+ and update their adduct information accordingly.
2988
+
2989
+ This function:
2990
+ 1. Generates a catalogue of mass shifts between adducts using _get_adducts()
2991
+ 2. Searches for pairs of consensus features with same RT (within strict RT tolerance)
2992
+ and matching m/z shifts (±0.005 Da)
2993
+ 3. Updates adduct_* columns based on identified relationships
2994
+
2995
+ Args:
2996
+ rt_tol: RT tolerance in seconds (strict tolerance for coelution detection)
2997
+ cached_adducts_df: Pre-computed adducts DataFrame for performance
2998
+ """
2999
+ import polars as pl
3000
+ import numpy as np
3001
+ from collections import defaultdict
3002
+
3003
+ # Check if consensus_df exists and has features
3004
+ if len(self.consensus_df) == 0:
3005
+ self.logger.debug("No consensus features for adduct identification by mass shift")
3006
+ return
3007
+
3008
+ self.logger.info(f"Identifying coeluting adducts by mass shifts in {len(self.consensus_df)} consensus features...")
3009
+
3010
+ # Get adducts DataFrame if not provided
3011
+ if cached_adducts_df is None or cached_adducts_df.is_empty():
3012
+ try:
3013
+ # Use lower min_probability for better adduct coverage in mass shift identification
3014
+ cached_adducts_df = self._get_adducts(min_probability=0.01)
3015
+ except Exception as e:
3016
+ self.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
3017
+ return
3018
+
3019
+ if cached_adducts_df.is_empty():
3020
+ self.logger.debug("No adducts available for mass shift identification")
3021
+ return
3022
+
3023
+ # Build catalogue of mass shifts between adducts
3024
+ mass_shift_catalog = {}
3025
+ adduct_info = {}
3026
+
3027
+ # Extract adduct information
3028
+ adducts_data = cached_adducts_df.select(["name", "charge", "mass_shift"]).to_dicts()
3029
+
3030
+ for adduct in adducts_data:
3031
+ name = adduct["name"]
3032
+ charge = adduct["charge"]
3033
+ mass_shift = adduct["mass_shift"]
3034
+
3035
+ adduct_info[name] = {
3036
+ "charge": charge,
3037
+ "mass_shift": mass_shift
3038
+ }
3039
+
3040
+ # Generate pairwise mass differences for catalog
3041
+ for adduct1 in adducts_data:
3042
+ for adduct2 in adducts_data:
3043
+ if adduct1["name"] == adduct2["name"]:
3044
+ continue
3045
+
3046
+ name1, charge1, ms1 = adduct1["name"], adduct1["charge"], adduct1["mass_shift"]
3047
+ name2, charge2, ms2 = adduct2["name"], adduct2["charge"], adduct2["mass_shift"]
3048
+
3049
+ # Only consider shifts between adducts that have the same charge (same ionization state)
3050
+ if charge1 != charge2:
3051
+ continue
3052
+
3053
+ # Calculate expected m/z difference
3054
+ if charge1 != 0 and charge2 != 0:
3055
+ mz_diff = (ms1 - ms2) / abs(charge1)
3056
+ else:
3057
+ continue # Skip neutral adducts for this analysis
3058
+
3059
+ # Store the mass shift relationship
3060
+ shift_key = round(mz_diff, 4) # Round to 4 decimal places for matching
3061
+ if shift_key not in mass_shift_catalog:
3062
+ mass_shift_catalog[shift_key] = []
3063
+ mass_shift_catalog[shift_key].append({
3064
+ "from_adduct": name1,
3065
+ "to_adduct": name2,
3066
+ "mz_shift": mz_diff,
3067
+ "from_charge": charge1,
3068
+ "to_charge": charge2
3069
+ })
3070
+
3071
+ self.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
3072
+
3073
+ # Get consensus features data
3074
+ consensus_data = []
3075
+ for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
3076
+ consensus_data.append({
3077
+ "index": i,
3078
+ "consensus_uid": row["consensus_uid"],
3079
+ "rt": row["rt"],
3080
+ "mz": row["mz"],
3081
+ "adduct_top": row.get("adduct_top", "[M+?]1+"),
3082
+ "adduct_charge_top": row.get("adduct_charge_top", 1),
3083
+ "adduct_mass_neutral_top": row.get("adduct_mass_neutral_top"),
3084
+ "adduct_mass_shift_top": row.get("adduct_mass_shift_top"),
3085
+ "inty_mean": row.get("inty_mean", 0)
3086
+ })
3087
+
3088
+ # Sort by RT for efficient searching
3089
+ consensus_data.sort(key=lambda x: x["rt"])
3090
+ n_features = len(consensus_data)
3091
+
3092
+ # Track updates to make
3093
+ adduct_updates = {} # consensus_uid -> new_adduct_info
3094
+
3095
+ # Strict RT tolerance for coelution (rt_tol is given in seconds)
3096
+ rt_tol_strict = rt_tol * 0.5 # Use half the merge tolerance for strict coelution
3097
+ mz_tol_shift = 0.005 # ±5 mDa tolerance for mass shift matching
3098
+
3099
+ # Search for coeluting pairs with characteristic mass shifts
3100
+ updated_count = 0
3101
+
3102
+ for i in range(n_features):
3103
+ feature1 = consensus_data[i]
3104
+ rt1 = feature1["rt"]
3105
+ mz1 = feature1["mz"]
3106
+ adduct1 = feature1["adduct_top"]
3107
+
3108
+ # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
3109
+ # if adduct1 and "?" not in adduct1:
3110
+ # continue
3111
+
3112
+ # Search for coeluting features within strict RT tolerance
3113
+ for j in range(i + 1, n_features):
3114
+ feature2 = consensus_data[j]
3115
+ rt2 = feature2["rt"]
3116
+
3117
+ # Break if RT difference exceeds tolerance (sorted by RT)
3118
+ if abs(rt2 - rt1) > rt_tol_strict:
3119
+ break
3120
+
3121
+ mz2 = feature2["mz"]
3122
+ adduct2 = feature2["adduct_top"]
3123
+
3124
+ # Skip if already has identified adduct (not [M+?]) - DISABLED to allow re-evaluation
3125
+ # if adduct2 and "?" not in adduct2:
3126
+ # continue
3127
+
3128
+ # Calculate observed m/z difference
3129
+ mz_diff = mz2 - mz1
3130
+ shift_key = round(mz_diff, 4)
3131
+
3132
+ # Check if this mass shift matches any known adduct relationships
3133
+ for catalog_shift, relationships in mass_shift_catalog.items():
3134
+ if abs(shift_key - catalog_shift) <= mz_tol_shift:
3135
+ # Found a matching mass shift!
3136
+
3137
+ # Choose the best relationship based on common adducts
3138
+ best_rel = None
3139
+ best_score = 0
3140
+
3141
+ for rel in relationships:
3142
+ # Prioritize common adducts ([M+H]+, [M+Na]+, [M+NH4]+)
3143
+ score = 0
3144
+ if "H]" in rel["from_adduct"]: score += 3
3145
+ if "Na]" in rel["from_adduct"]: score += 2
3146
+ if "NH4]" in rel["from_adduct"]: score += 2
3147
+ if "H]" in rel["to_adduct"]: score += 3
3148
+ if "Na]" in rel["to_adduct"]: score += 2
3149
+ if "NH4]" in rel["to_adduct"]: score += 2
3150
+
3151
+ if score > best_score:
3152
+ best_score = score
3153
+ best_rel = rel
3154
+
3155
+ if best_rel:
3156
+ # Determine which feature gets which adduct based on intensity
3157
+ inty1 = feature1["inty_mean"]
3158
+ inty2 = feature2["inty_mean"]
3159
+
3160
+ # Assign higher intensity to [M+H]+ if possible
3161
+ if "H]" in best_rel["from_adduct"] and inty1 >= inty2:
3162
+ # Feature 1 = from_adduct, Feature 2 = to_adduct
3163
+ from_feature = feature1
3164
+ to_feature = feature2
3165
+ from_adduct_name = best_rel["from_adduct"]
3166
+ to_adduct_name = best_rel["to_adduct"]
3167
+ elif "H]" in best_rel["to_adduct"] and inty2 >= inty1:
3168
+ # Feature 2 = to_adduct (reverse), Feature 1 = from_adduct
3169
+ from_feature = feature2
3170
+ to_feature = feature1
3171
+ from_adduct_name = best_rel["to_adduct"]
3172
+ to_adduct_name = best_rel["from_adduct"]
3173
+ else:
3174
+ # Assignment based on mass shift direction
3175
+ # catalog_shift = (ms1 - ms2) / abs(charge1) where ms1 = from_adduct mass shift, ms2 = to_adduct mass shift
3176
+ # If catalog_shift > 0: from_adduct has higher m/z than to_adduct
3177
+ # If catalog_shift < 0: from_adduct has lower m/z than to_adduct
3178
+ # observed mz_diff = mz2 - mz1
3179
+ # If mz_diff matches catalog_shift: feature2 should get to_adduct, feature1 should get from_adduct
3180
+ # If mz_diff matches -catalog_shift: assignments are swapped
3181
+
3182
+ if abs(mz_diff - catalog_shift) <= abs(mz_diff - (-catalog_shift)):
3183
+ # mz_diff matches catalog_shift direction
3184
+ from_feature = feature1
3185
+ to_feature = feature2
3186
+ from_adduct_name = best_rel["from_adduct"]
3187
+ to_adduct_name = best_rel["to_adduct"]
3188
+ else:
3189
+ # mz_diff matches reverse direction of catalog_shift
3190
+ from_feature = feature2
3191
+ to_feature = feature1
3192
+ from_adduct_name = best_rel["to_adduct"]
3193
+ to_adduct_name = best_rel["from_adduct"]
3194
+
3195
+ # Get adduct details from catalog
3196
+ from_adduct_info = adduct_info.get(from_adduct_name, {})
3197
+ to_adduct_info = adduct_info.get(to_adduct_name, {})
3198
+
3199
+ # Calculate neutral masses
3200
+ from_charge = from_adduct_info.get("charge", 1)
3201
+ to_charge = to_adduct_info.get("charge", 1)
3202
+ from_mass_shift = from_adduct_info.get("mass_shift", 1.007825)
3203
+ to_mass_shift = to_adduct_info.get("mass_shift", 1.007825)
3204
+
3205
+ from_neutral_mass = from_feature["mz"] * abs(from_charge) - from_mass_shift
3206
+ to_neutral_mass = to_feature["mz"] * abs(to_charge) - to_mass_shift
3207
+
3208
+ # Store updates
3209
+ adduct_updates[from_feature["consensus_uid"]] = {
3210
+ "adduct_top": from_adduct_name,
3211
+ "adduct_charge_top": from_charge,
3212
+ "adduct_mass_neutral_top": from_neutral_mass,
3213
+ "adduct_mass_shift_top": from_mass_shift
3214
+ }
3215
+
3216
+ adduct_updates[to_feature["consensus_uid"]] = {
3217
+ "adduct_top": to_adduct_name,
3218
+ "adduct_charge_top": to_charge,
3219
+ "adduct_mass_neutral_top": to_neutral_mass,
3220
+ "adduct_mass_shift_top": to_mass_shift
3221
+ }
3222
+
3223
+ updated_count += 2
3224
+ self.logger.debug(
3225
+ f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
3226
+ f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
3227
+ f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
3228
+ )
3229
+ break # Found match, no need to check other relationships
3230
+
3231
+ # Apply updates to consensus_df
3232
+ if adduct_updates:
3233
+ # Prepare update data
3234
+ consensus_uids = self.consensus_df["consensus_uid"].to_list()
3235
+
3236
+ new_adduct_top = []
3237
+ new_adduct_charge_top = []
3238
+ new_adduct_mass_neutral_top = []
3239
+ new_adduct_mass_shift_top = []
3240
+
3241
+ for row_idx, uid in enumerate(consensus_uids):
3242
+ if uid in adduct_updates:
3243
+ update = adduct_updates[uid]
3244
+ new_adduct_top.append(update["adduct_top"])
3245
+ new_adduct_charge_top.append(update["adduct_charge_top"])
3246
+ new_adduct_mass_neutral_top.append(update["adduct_mass_neutral_top"])
3247
+ new_adduct_mass_shift_top.append(update["adduct_mass_shift_top"])
3248
+ else:
3249
+ # Keep existing values (row_idx from enumerate() replaces the O(n) .index() lookup)
3251
+ row = self.consensus_df.row(row_idx, named=True)
3252
+ new_adduct_top.append(row.get("adduct_top"))
3253
+ new_adduct_charge_top.append(row.get("adduct_charge_top"))
3254
+ new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
3255
+ new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
3256
+
3257
+ # Update the DataFrame
3258
+ self.consensus_df = self.consensus_df.with_columns([
3259
+ pl.Series("adduct_top", new_adduct_top),
3260
+ pl.Series("adduct_charge_top", new_adduct_charge_top),
3261
+ pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
3262
+ pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
3263
+ ])
3264
+
3265
+ self.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
3266
+ else:
3267
+ self.logger.debug("No consensus features updated based on mass shift analysis")
3268
+
3269
+
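As a compact illustration of the catalogue-and-match logic above, the following stand-alone sketch builds the pairwise shift table from a hand-written adduct list (an assumption standing in for the table that _get_adducts() provides) and resolves which adduct the higher-m/z member of a coeluting pair should carry; all helper names are hypothetical:

    # Hand-written (name, charge, mass shift in Da) rows; an assumption standing
    # in for the adduct DataFrame that _get_adducts() returns in masster.
    ADDUCTS = [
        ("[M+H]1+",   1,  1.007276),
        ("[M+Na]1+",  1, 22.989218),
        ("[M+NH4]1+", 1, 18.033823),
    ]

    def build_shift_catalog(adducts):
        # Pairwise expected m/z differences between same-charge adducts,
        # keyed by the shift rounded to 4 decimals.
        catalog = {}
        for n1, z1, m1 in adducts:
            for n2, z2, m2 in adducts:
                if n1 == n2 or z1 != z2 or z1 == 0:
                    continue
                shift = round((m1 - m2) / abs(z1), 4)
                catalog.setdefault(shift, []).append((n1, n2))
        return catalog

    def match_pair(mz_low, mz_high, catalog, tol=0.005):
        # Return (adduct of the higher-m/z feature, adduct of the lower-m/z feature)
        # when the observed gap matches a catalogued shift, else None.
        gap = mz_high - mz_low
        for shift, pairs in catalog.items():
            if shift > 0 and abs(gap - shift) <= tol:
                # shift = mz(first adduct) - mz(second adduct) for one neutral mass,
                # so the higher-m/z feature carries the first adduct of the pair.
                return pairs[0]
        return None

    catalog = build_shift_catalog(ADDUCTS)
    # Coeluting pair derived from a neutral mass of 150.0681 Da:
    print(match_pair(151.0754, 173.0573, catalog))  # ('[M+Na]1+', '[M+H]1+')

The real implementation additionally weighs common adducts and feature intensities before committing an assignment, and then writes the result back into the adduct_* columns of consensus_df.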
1963
3270
  def _finalize_merge(self, link_ms2, min_samples):
1964
3271
  """Complete the merge process with final calculations and cleanup."""
1965
3272
  import polars as pl
@@ -1998,14 +3305,20 @@ def _finalize_merge(self, link_ms2, min_samples):
1998
3305
  )
1999
3306
 
2000
3307
  # Calculate the completeness of the consensus map
3308
+ # Log completion with tight cluster metrics
2001
3309
  if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
2002
3310
  c = (
2003
3311
  len(self.consensus_mapping_df)
2004
3312
  / len(self.consensus_df)
2005
3313
  / len(self.samples_df)
2006
3314
  )
3315
+
3316
+ # Count tight clusters with specified thresholds
3317
+ tight_clusters = _count_tight_clusters(self, mz_tol=0.04, rt_tol=0.3)
3318
+
2007
3319
  self.logger.info(
2008
- f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
3320
+ f"Merging completed. Consensus features: {len(self.consensus_df)}. "
3321
+ f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
2009
3322
  )
2010
3323
  else:
2011
3324
  self.logger.warning(