masster-0.2.4-py3-none-any.whl → masster-0.3.0-py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.



Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -719
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.4.dist-info/RECORD +0 -50
  54. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
--- masster/study/processing.py (0.2.4)
+++ masster/study/processing.py (0.3.0)
@@ -1,1046 +1,1057 @@
- from __future__ import annotations
-
- from datetime import datetime
-
- import numpy as np
- import polars as pl
- import pyopenms as oms
-
- from tqdm import tqdm
-
- from masster.study.defaults import (
-     align_defaults,
-     find_consensus_defaults,
-     find_ms2_defaults,
-     integrate_chrom_defaults,
- )
-
-
- def align(self, **kwargs):
-     """
-     Aligns feature maps using pose clustering and updates retention times in the features DataFrame.
-
-     Parameters:
-         **kwargs: Keyword arguments for alignment parameters. Can include:
-             - An align_defaults instance to set all parameters at once
-             - Individual parameter names and values (see align_defaults for details)
-
-     Key Parameters:
-         rt_max_diff (float): Maximum RT difference for alignment (default: 60.0).
-         mz_max_diff (float): Maximum m/z difference for alignment (default: 0.01).
-         rt_pair_distance_frac (float): RT pair distance fraction for superimposer (default: 0.2).
-         mz_pair_max_distance (float): Maximum m/z pair distance for superimposer (default: 0.01).
-         num_used_points (int): Number of points used for superimposer (default: 1000).
-         save_features (bool): Whether to save features after alignment (default: True).
-         skip_blanks (bool): Whether to skip blank samples during alignment (default: True).
-     """
-     # parameters initialization
-     params = align_defaults()
-     for key, value in kwargs.items():
-         if isinstance(value, align_defaults):
-             params = value
-             self.logger.debug("Using provided align_defaults parameters")
-         else:
-             if hasattr(params, key):
-                 if params.set(key, value, validate=True):
-                     self.logger.debug(f"Updated parameter {key} = {value}")
-                 else:
-                     self.logger.warning(
-                         f"Failed to set parameter {key} = {value} (validation failed)",
-                     )
-             else:
-                 self.logger.debug(f"Unknown parameter {key} ignored")
-     # end of parameter initialization
-
-     # Store parameters in the Study object
-     self.store_history(["align"], params.to_dict())
-     self.logger.debug("Parameters stored to align")
-
-     if len(self.features_maps) < len(self.samples_df):
-         self.features_maps = []
-         self.load_features()
-
-     self.logger.info("Starting alignment.")
-
-     fmaps = self.features_maps
-     # set ref_index to feature map index with largest number of features
-     ref_index = [
-         i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-     ][-1]
-
-     self.logger.info(
-         f"Reference feature map is {self.samples_df.row(ref_index, named=True)['sample_name']} with {fmaps[ref_index].size()} features.",
-     )
-
-     aligner = oms.MapAlignmentAlgorithmPoseClustering()
-
-     params_oms = oms.Param()
-     params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
-     params_oms.setValue("pairfinder:ignore_charge", "true")
-     params_oms.setValue("max_num_peaks_considered", 1000)
-     params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
-     params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
-     params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
-     params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
-     params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
-     params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
-     params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
-     aligner.setParameters(params_oms)
-     """
-     {b'max_num_peaks_considered': 1000,
-     b'superimposer:mz_pair_max_distance': 0.5,
-     b'superimposer:rt_pair_distance_fraction': 0.1,
-     b'superimposer:num_used_points': 2000,
-     b'superimposer:scaling_bucket_size': 0.005,
-     b'superimposer:shift_bucket_size': 3.0,
-     b'superimposer:max_shift': 1000.0,
-     b'superimposer:max_scaling': 2.0,
-     b'superimposer:dump_buckets': '',
-     b'superimposer:dump_pairs': '',
-     b'pairfinder:second_nearest_gap': 2.0,
-     b'pairfinder:use_identifications': 'false',
-     b'pairfinder:ignore_charge': 'false',
-     b'pairfinder:ignore_adduct': 'true',
-     b'pairfinder:distance_RT:max_difference': 100.0,
-     b'pairfinder:distance_RT:exponent': 1.0,
-     b'pairfinder:distance_RT:weight': 1.0,
-     b'pairfinder:distance_MZ:max_difference': 0.3,
-     b'pairfinder:distance_MZ:unit': 'Da',
-     b'pairfinder:distance_MZ:exponent': 2.0,
-     b'pairfinder:distance_MZ:weight': 1.0,
-     b'pairfinder:distance_intensity:exponent': 1.0,
-     b'pairfinder:distance_intensity:weight': 0.0,
-     b'pairfinder:distance_intensity:log_transform': 'disabled'} """
-
-     aligner.setReference(fmaps[ref_index])
-
-     self.logger.debug(f"Parameters for alignment: {params}")
-
-     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-     # perform alignment and transformation of feature maps to the reference map (exclude reference map)
-     for index, fm in tqdm(
-         list(enumerate(fmaps)),
-         total=len(fmaps),
-         desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
-         disable=tdqm_disable,
-     ):
-         if index == ref_index:
-             continue
-         if (
-             params.get("skip_blanks")
-             and self.samples_df.row(index, named=True)["sample_type"] == "blank"
-         ):
-             continue
-         trafo = oms.TransformationDescription()
-         aligner.align(fm, trafo)
-         transformer = oms.MapAlignmentTransformer()
-         transformer.transformRetentionTimes(fm, trafo, True)
-
-     self.alignment_ref_index = ref_index
-
-     # check if rt_original exists in features_df, if not, add it after rt
-     if "rt_original" not in self.features_df.columns:
-         # add column 'rt_original' after 'rt'
-         rt_index = self.features_df.columns.get_loc("rt") + 1
-         self.features_df.insert(rt_index, "rt_original", 0)
-         self.features_df["rt_original"] = self.features_df["rt"]
-
-     # iterate through all feature_maps and add the transformed retention times to the features_df
-
-     # Build a fast lookup for (sample_uid, feature_uid) to index in features_df
-     feats = self.features_df
-
-     # Pre-build sample_uid lookup for faster access
-     self.logger.debug("Build sample_uid lookup for fast access...")
-     sample_uid_lookup = {
-         idx: row_dict["sample_uid"]
-         for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
-     }
-
-     # Build the main lookup using feature_uid (not feature_id)
-     if "feature_id" in feats.columns:
-         # Create lookup mapping (sample_uid, feature_uid) to DataFrame index using Polars
-         # Since we need a pandas-style index lookup, we'll create a simple dict
-         sample_uids = feats.get_column("sample_uid").to_list()
-
-         # Handle feature_id column - it might be Object type due to conversion
-         feature_id_col = feats.get_column("feature_id")
-         if feature_id_col.dtype == pl.Object:
-             # If it's Object type, convert to list and let Python handle the conversion
-             feature_ids = feature_id_col.to_list()
-             # Convert to strings if they're not already
-             feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
-         else:
-             # Safe to cast normally
-             feature_ids = feature_id_col.cast(pl.Utf8).to_list()
-
-         lookup = {
-             (sample_uid, feature_id): idx
-             for idx, (sample_uid, feature_id) in enumerate(
-                 zip(sample_uids, feature_ids, strict=True),
-             )
-         }
-     else:
-         # fallback: skip if feature_uid column missing
-         lookup = {}
-         self.logger.warning("feature_id column not found in features_df")
-
-     # Pre-allocate update lists for better performance
-     all_update_idx = []
-     all_update_rt = []
-     all_update_rt_original = []
-
-     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
-     for index, fm in tqdm(
-         list(enumerate(fmaps)),
-         total=len(fmaps),
-         desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract RTs",
-         disable=tdqm_disable,
-     ):
-         sample_uid = sample_uid_lookup.get(index)
-         if sample_uid is None:
-             continue
-
-         # Collect all updates for this feature map
-         for f in fm:
-             feature_uid = str(f.getUniqueId())
-             idx = lookup.get((sample_uid, feature_uid))
-             if idx is not None:
-                 rt = round(f.getRT(), 3)
-                 # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
-                 all_update_idx.append(idx)
-                 all_update_rt.append(rt)
-                 # all_update_rt_original.append(rt_or)
-
-     # Single batch update for all features at once
-     if all_update_idx:
-         # Update "rt" column for specified indices using Polars
-         self.features_df = self.features_df.with_columns(
-             pl.when(pl.int_range(0, self.features_df.height).is_in(all_update_idx))
-             .then(pl.Series("rt", all_update_rt))
-             .otherwise(pl.col("rt"))
-             .alias("rt"),
-         )
-         # self.features_df.loc[all_update_idx, "rt_original"] = all_update_rt_original
-
-     self.logger.debug("Alignment completed successfully.")
-
-     if params.get("save_features"):
-         self.save_samples()
-
-
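For orientation, a minimal usage sketch of the alignment API documented above (the Study construction and sample loading are assumed to have happened already; parameter names and defaults come from align_defaults):

    study.align(
        rt_max_diff=30.0,     # tighten the RT window from the 60.0 default
        mz_max_diff=0.005,    # Da
        skip_blanks=True,     # leave blank samples untransformed
        save_features=False,  # skip the save step while experimenting
    )

align() reloads the feature maps itself when fewer maps than samples are present, so no explicit load_features() call is needed first.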
- def find_consensus(self, **kwargs):
-     """
-     Groups features across samples into consensus features using the specified algorithm.
-
-     Parameters:
-         **kwargs: Keyword arguments for consensus parameters. Can include:
-             - A find_consensus_defaults instance to set all parameters at once
-             - Individual parameter names and values (see find_consensus_defaults for details)
-
-     Key Parameters:
-         algorithm (str): Feature grouping algorithm ('kd', 'unlabeled', 'sequential', or default 'qt').
-         min_samples (int): Minimum number of samples for a consensus feature.
-         link_ms2 (bool): Whether to link MS2 spectra to consensus features.
-         mz_tol (float): m/z tolerance for grouping (default: 0.01).
-         rt_tol (float): RT tolerance for grouping (default: 1.0).
-     """
-     # parameters initialization
-     params = find_consensus_defaults()
-     for key, value in kwargs.items():
-         if isinstance(value, find_consensus_defaults):
-             params = value
-             self.logger.debug("Using provided find_consensus_defaults parameters")
-         else:
-             if hasattr(params, key):
-                 if params.set(key, value, validate=True):
-                     self.logger.debug(f"Updated parameter {key} = {value}")
-                 else:
-                     self.logger.warning(
-                         f"Failed to set parameter {key} = {value} (validation failed)",
-                     )
-             else:
-                 self.logger.debug(f"Unknown parameter {key} ignored")
-     # end of parameter initialization
-
-     # Store parameters in the Study object
-     self.store_history(["find_consensus"], params.to_dict())
-     self.logger.debug("Parameters stored to find_consensus")
-
-     # Get parameter values for use in the method
-     algorithm = params.get("algorithm")
-     min_samples = params.get("min_samples")
-     link_ms2 = params.get("link_ms2")
-     mz_tol = kwargs.get("mz_tol", 0.01)  # Default values for parameters not in defaults class
-     rt_tol = kwargs.get("rt_tol", 1.0)
-
-     if len(self.samples_df) > 200 and algorithm == 'qt':
-         self.logger.warning("Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].")
-
-     # check that features_maps is not empty
-     if not self.features_maps or len(self.features_maps) == 0:
-         self.load_features()
-     params_oms = oms.Param()
-     ## TODO expose these
-
-     feature_grouper: object  # Use generic type for different OpenMS algorithms
-     match algorithm.lower():
-         case "kd":
-             feature_grouper = oms.FeatureGroupingAlgorithmKD()
-             self.logger.info("Grouping features with KDTree...")
-             params_oms.setValue("mz_unit", "Da")
-             params_oms.setValue("nr_partitions", len(self.samples_df))
-
-             params_oms.setValue("warp:enabled", "true")
-             params_oms.setValue("warp:rt_tol", rt_tol)
-             params_oms.setValue("warp:mz_tol", mz_tol)
-
-             params_oms.setValue("link:rt_tol", rt_tol)
-             params_oms.setValue("link:mz_tol", mz_tol)
-         case "unlabeled":
-             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
-             self.logger.info("Grouping features with Unlabelled algorithm...")
-             params_oms.setValue("second_nearest_gap", 2.0)
-             params_oms.setValue("ignore_charge", "true")
-             params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
-             params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
-             params_oms.setValue("distance_MZ:unit", "Da")
-         case "sequential":
-             self.logger.info(
-                 "Grouping features sequentially with Unlabelled algorithm...",
-             )
-             params_oms.setValue("second_nearest_gap", 2.0)
-             params_oms.setValue("ignore_charge", "true")
-             params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
-             params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
-             params_oms.setValue("distance_MZ:unit", "Da")
-         case "qt":
-             feature_grouper = oms.FeatureGroupingAlgorithmQT()
-             self.logger.info("Grouping features with QT...")
-             params_oms.setValue("nr_partitions", len(self.samples_df))
-             params_oms.setValue("ignore_charge", "true")
-             params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
-             params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
-             params_oms.setValue("distance_MZ:unit", "Da")
-     self.logger.debug(f"Parameters for feature grouping: {params_oms}")
-     consensus_map = oms.ConsensusMap()
-     file_descriptions = consensus_map.getColumnHeaders()  # type: ignore
-     feature_maps = self.features_maps
-     for i, feature_map in enumerate(feature_maps):
-         file_description = file_descriptions.get(i, oms.ColumnHeader())
-         file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
-         file_description.size = feature_map.size()
-         file_description.unique_id = feature_map.getUniqueId()
-         file_descriptions[i] = file_description
-
-     consensus_map.setColumnHeaders(file_descriptions)  # type: ignore
-
-     # create a copy of the feature maps to store the original feature map information
-     match algorithm.lower():
-         case "sequential":
-             # set the reference map to self.alignment_ref_index
-             if self.alignment_ref_index is None:
-                 # pick the feature map with the most features as reference
-                 self.alignment_ref_index = max(
-                     range(len(self.features_maps)),
-                     key=lambda i: self.features_maps[i].size(),
-                 )
-             feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
-             feature_grouper.setParameters(params_oms)
-             feature_grouper.setReference(
-                 self.alignment_ref_index,
-                 self.features_maps[self.alignment_ref_index],
-             )
-             self.logger.info(
-                 f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
-             )
-
-             tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-             for i, feature_map in tqdm(
-                 enumerate(self.features_maps),
-                 total=len(self.features_maps),
-                 desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
-                 disable=tdqm_disable,
-             ):
-                 if i == self.alignment_ref_index:
-                     continue
-                 feature_grouper.addToGroup(i, feature_map)
-             self.logger.debug("Grouping features.")
-             consensus_map = feature_grouper.getResultMap()
-             if hasattr(consensus_map, "setUniqueIds"):
-                 consensus_map.setUniqueIds()
-         case _:
-             feature_grouper.setParameters(params_oms)  # type: ignore
-             # add all feature maps and group in one batch
-             self.logger.debug("Grouping features in one batch...")
-             feature_grouper.group(feature_maps, consensus_map)  # type: ignore
-             if hasattr(consensus_map, "setUniqueIds"):
-                 consensus_map.setUniqueIds()
-
-     # create a dict to map uid to feature_uid using self.features_df
-     feature_uid_map = {
-         row["feature_id"]: row["feature_uid"]
-         for row in self.features_df.iter_rows(named=True)
-     }
-     imax = consensus_map.size()
-
-     # Pre-build fast lookup tables for features_df data
-     features_lookup = {}
-     feature_columns = [
-         "rt",
-         "mz",
-         "rt_start",
-         "rt_end",
-         "rt_delta",
-         "mz_start",
-         "mz_end",
-         "inty",
-         "chrom_coherence",
-         "chrom_prominence",
-         "chrom_prominence_scaled",
-         "chrom_height_scaled",
-         "iso",
-         "charge",
-         "ms2_scans",
-     ]
-
-     for row in self.features_df.iter_rows(named=True):
-         feature_uid = row["feature_uid"]
-         features_lookup[feature_uid] = {
-             col: row[col] for col in feature_columns if col in self.features_df.columns
-         }
-
-     # create a list to store the consensus mapping
-     consensus_mapping = []
-     metadata_list = []
-
-     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
-     for i, feature in enumerate(
-         tqdm(
-             consensus_map,
-             total=imax,
-             disable=tqdm_disable,
-             desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
-         ),
-     ):
-         # get all features in the feature map with the same unique id as the consensus feature
-         features_list = feature.getFeatureList()
-         uids = []
-         feature_data_list = []
-
-         for _j, f in enumerate(features_list):
-             fuid = str(f.getUniqueId())
-             if fuid not in feature_uid_map:
-                 # this is a feature that was removed but is still in the feature maps
-                 continue
-             fuid = feature_uid_map[fuid]
-             consensus_mapping.append({
-                 "consensus_uid": i,
-                 "sample_uid": f.getMapIndex() + 1,
-                 "feature_uid": fuid,
-             })
-             uids.append(fuid)
-
-             # Get feature data from lookup instead of DataFrame filtering
-             feature_data = features_lookup.get(fuid)
-             if feature_data:
-                 feature_data_list.append(feature_data)
-
-         if not feature_data_list:
-             # Skip this consensus feature if no valid features found
-             continue
-
-         # Compute statistics using vectorized operations on collected data
-         # Convert to numpy arrays for faster computation
-         rt_values = np.array([
-             fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None
-         ])
-         mz_values = np.array([
-             fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None
-         ])
-         rt_start_values = np.array([
-             fd.get("rt_start", 0)
-             for fd in feature_data_list
-             if fd.get("rt_start") is not None
-         ])
-         rt_end_values = np.array([
-             fd.get("rt_end", 0)
-             for fd in feature_data_list
-             if fd.get("rt_end") is not None
-         ])
-         rt_delta_values = np.array([
-             fd.get("rt_delta", 0)
-             for fd in feature_data_list
-             if fd.get("rt_delta") is not None
-         ])
-         mz_start_values = np.array([
-             fd.get("mz_start", 0)
-             for fd in feature_data_list
-             if fd.get("mz_start") is not None
-         ])
-         mz_end_values = np.array([
-             fd.get("mz_end", 0)
-             for fd in feature_data_list
-             if fd.get("mz_end") is not None
-         ])
-         inty_values = np.array([
-             fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None
-         ])
-         coherence_values = np.array([
-             fd.get("chrom_coherence", 0)
-             for fd in feature_data_list
-             if fd.get("chrom_coherence") is not None
-         ])
-         prominence_values = np.array([
-             fd.get("chrom_prominence", 0)
-             for fd in feature_data_list
-             if fd.get("chrom_prominence") is not None
-         ])
-         prominence_scaled_values = np.array([
-             fd.get("chrom_prominence_scaled", 0)
-             for fd in feature_data_list
-             if fd.get("chrom_prominence_scaled") is not None
-         ])
-         height_scaled_values = np.array([
-             fd.get("chrom_height_scaled", 0)
-             for fd in feature_data_list
-             if fd.get("chrom_height_scaled") is not None
-         ])
-         iso_values = np.array([
-             fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None
-         ])
-         charge_values = np.array([
-             fd.get("charge", 0)
-             for fd in feature_data_list
-             if fd.get("charge") is not None
-         ])
-
-         # Calculate number of MS2 spectra
-         ms2_count = 0
-         for fd in feature_data_list:
-             ms2_scans = fd.get("ms2_scans")
-             if ms2_scans is not None:
-                 ms2_count += len(ms2_scans)
-
-         metadata_list.append({
-             "consensus_uid": int(i),  # "consensus_id": i,
-             "consensus_id": str(feature.getUniqueId()),
-             "quality": round(float(feature.getQuality()), 3),
-             "number_samples": len(feature_data_list),
-             # "number_ext": int(len(features_list)),
-             "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
-             "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
-             "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
-             "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
-             "rt_mean": round(float(np.mean(rt_values)), 3)
-             if len(rt_values) > 0
-             else 0.0,
-             "rt_start_mean": round(float(np.mean(rt_start_values)), 3)
-             if len(rt_start_values) > 0
-             else 0.0,
-             "rt_end_mean": round(float(np.mean(rt_end_values)), 3)
-             if len(rt_end_values) > 0
-             else 0.0,
-             "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3)
-             if len(rt_delta_values) > 0
-             else 0.0,
-             "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
-             "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
-             "mz_mean": round(float(np.mean(mz_values)), 4)
-             if len(mz_values) > 0
-             else 0.0,
-             "mz_start_mean": round(float(np.mean(mz_start_values)), 4)
-             if len(mz_start_values) > 0
-             else 0.0,
-             "mz_end_mean": round(float(np.mean(mz_end_values)), 4)
-             if len(mz_end_values) > 0
-             else 0.0,
-             "inty_mean": round(float(np.mean(inty_values)), 0)
-             if len(inty_values) > 0
-             else 0.0,
-             "bl": -1.0,
-             "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3)
-             if len(coherence_values) > 0
-             else 0.0,
-             "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0)
-             if len(prominence_values) > 0
-             else 0.0,
-             "chrom_prominence_scaled_mean": round(
-                 float(np.mean(prominence_scaled_values)),
-                 3,
-             )
-             if len(prominence_scaled_values) > 0
-             else 0.0,
-             "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
-             if len(height_scaled_values) > 0
-             else 0.0,
-             "iso_mean": round(float(np.mean(iso_values)), 2)
-             if len(iso_values) > 0
-             else 0.0,
-             "charge_mean": round(float(np.mean(charge_values)), 2)
-             if len(charge_values) > 0
-             else 0.0,
-             "number_ms2": int(ms2_count),
-         })
-
-     consensus_mapping_df = pl.DataFrame(consensus_mapping)
-     # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
-     l1 = len(consensus_mapping_df)
-     consensus_mapping_df = consensus_mapping_df.filter(
-         pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
-     )
-     self.logger.debug(
-         f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
-     )
-     self.consensus_mapping_df = consensus_mapping_df
-     self.consensus_df = pl.DataFrame(metadata_list)
-
-     if min_samples is None:
-         min_samples = 1
-     if min_samples < 1:
-         min_samples = int(min_samples * len(self.samples_df))
-     # filter out consensus features with less than min_samples features
-     l1 = len(self.consensus_df)
-     self.consensus_df = self.consensus_df.filter(
-         pl.col("number_samples") >= min_samples,
-     )
-     self.logger.debug(
-         f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
-     )
-     # filter out consensus mapping with less than min_samples features
-     self.consensus_mapping_df = self.consensus_mapping_df.filter(
-         pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
-     )
-
-     self.consensus_map = consensus_map
-     # calculate the completeness of the consensus map
-     c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
-     self.logger.info(
-         f"Grouping completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
-     )
-     if link_ms2:
-         self.find_ms2()
-
-
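A sketch of how grouping might be called under the 0.2.4 API shown above (in 0.3.0 this method is renamed to merge(), with merge_defaults replacing find_consensus_defaults; the tolerance values here are illustrative):

    study.find_consensus(
        algorithm="kd",  # KD-tree grouping, O(n log n), preferred for many samples
        mz_tol=0.005,    # Da; read from kwargs directly, not from the defaults class
        rt_tol=2.0,
        min_samples=3,   # drop consensus features seen in fewer than 3 samples
        link_ms2=True,   # chain into find_ms2() when grouping finishes
    )

Note that min_samples values below 1 are interpreted as a fraction of the number of samples.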
- def find_ms2(self, **kwargs):
-     """
-     Links MS2 spectra to consensus features and stores the result in self.consensus_ms2.
-
-     Parameters:
-         **kwargs: Keyword arguments for MS2 linking parameters. Can include:
-             - A find_ms2_defaults instance to set all parameters at once
-             - Individual parameter names and values (see find_ms2_defaults for details)
-     """
-     # parameters initialization
-     params = find_ms2_defaults()
-     for key, value in kwargs.items():
-         if isinstance(value, find_ms2_defaults):
-             params = value
-             self.logger.debug("Using provided find_ms2_defaults parameters")
-         else:
-             if hasattr(params, key):
-                 if params.set(key, value, validate=True):
-                     self.logger.debug(f"Updated parameter {key} = {value}")
-                 else:
-                     self.logger.warning(
-                         f"Failed to set parameter {key} = {value} (validation failed)",
-                     )
-             else:
-                 self.logger.debug(f"Unknown parameter {key} ignored")
-     # end of parameter initialization
-
-     # Store parameters in the Study object
-     self.store_history(["find_ms2"], params.to_dict())
-     self.logger.debug("Parameters stored to find_ms2")
-
-     data = []
-     if self.consensus_mapping_df.is_empty():
-         self.logger.error(
-             "No consensus mapping found. Please run find_consensus() first.",
-         )
-         return
-     self.logger.info("Linking MS2 spectra to consensus features...")
-
-     # Build fast lookup for feature_uid to features_df row data
-     feats = self.features_df
-     feature_lookup = {}
-     relevant_cols = [
-         "ms2_specs",
-         "ms2_scans",
-         "inty",
-         "chrom_coherence",
-         "chrom_prominence_scaled",
-     ]
-     for row in feats.iter_rows(named=True):
-         feature_uid = row["feature_uid"]
-         feature_lookup[feature_uid] = {
-             col: row[col] for col in relevant_cols if col in feats.columns
-         }
-     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
-     # Process consensus mapping in batch
-     for mapping_row in tqdm(
-         self.consensus_mapping_df.iter_rows(named=True),
-         total=self.consensus_mapping_df.shape[0],
-         desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}MS2 spectra",
-         disable=tdqm_disable,
-     ):
-         feature_uid = mapping_row["feature_uid"]
-         feature_data = feature_lookup.get(feature_uid)
-         if feature_data is None or feature_data.get("ms2_specs") is None:
-             continue
-         ms2_specs = feature_data["ms2_specs"]
-         ms2_scans = feature_data["ms2_scans"]
-         inty = feature_data.get("inty")
-         chrom_coherence = feature_data.get("chrom_coherence")
-         chrom_prominence_scaled = feature_data.get("chrom_prominence_scaled")
-         for j in range(len(ms2_specs)):
-             spec = ms2_specs[j]
-             scanid = ms2_scans[j]
-             data.append({
-                 "consensus_uid": int(mapping_row["consensus_uid"]),
-                 "feature_uid": int(mapping_row["feature_uid"]),
-                 "sample_uid": int(mapping_row["sample_uid"]),
-                 "scan_id": int(scanid),
-                 "energy": round(spec.energy, 1)
-                 if hasattr(spec, "energy") and spec.energy is not None
-                 else None,
-                 "prec_inty": round(inty, 0) if inty is not None else None,
-                 "prec_coherence": round(chrom_coherence, 3)
-                 if chrom_coherence is not None
-                 else None,
-                 "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
-                 if chrom_prominence_scaled is not None
-                 else None,
-                 "number_frags": len(spec.mz),
-                 "spec": spec,
-             })
-     self.consensus_ms2 = pl.DataFrame(data)
-     unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
-     self.logger.info(
-         f"Linking completed. {len(self.consensus_ms2)} MS2 spectra associated to {unique_consensus_features} consensus features.",
-     )
-
-
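Once find_ms2() has populated self.consensus_ms2, the linked spectra can be summarized with ordinary Polars expressions, for example counting spectra per consensus feature (a sketch; column names are those created above, and pl.len()/group_by assume a recent Polars release):

    import polars as pl

    ms2_summary = (
        study.consensus_ms2
        .group_by("consensus_uid")
        .agg(
            pl.len().alias("n_spectra"),                    # linked MS2 spectra
            pl.col("number_frags").max().alias("max_frags"),
        )
        .sort("n_spectra", descending=True)
    )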
- ## TODO these are not modelled the same way as other ranges, harmonize for tuples
- def filter_consensus(
-     self,
-     inplace=True,
-     number_samples=None,
-     quality=None,
-     coherence=None,
- ):
-     if self.consensus_df is None:
-         self.logger.error("No consensus found.")
-         return
-     cons = self.consensus_df if inplace else self.consensus_df.copy()
-     l = len(cons)
-     self.logger.info(f"Filtering consensus features with {l} entries...")
-     if coherence is not None:
-         if "chrom_coherence" not in cons.columns:
-             self.logger.warning("No coherence data found in features.")
-         else:
-             if isinstance(coherence, tuple) and len(coherence) == 2:
-                 min_coherence, max_coherence = coherence
-                 cons = cons[
-                     (cons["chrom_coherence"] >= min_coherence)
-                     & (cons["chrom_coherence"] <= max_coherence)
-                 ]
-             else:
-                 cons = cons[cons["chrom_coherence"] >= coherence]
-             l2 = len(cons)
-             self.logger.info(
-                 f"Filtered {l - l2} entries based on coherence. Remaining {l2} entries.",
-             )
-
-     if quality is not None:
-         if isinstance(quality, tuple) and len(quality) == 2:
-             min_quality, max_quality = quality
-             cons = cons[
-                 (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
-             ]
-         else:
-             cons = cons[cons["quality"] >= quality]
-         l3 = len(cons)
-         self.logger.info(
-             f"Filtered {l2 - l3} entries based on quality. Remaining {l3} entries.",
-         )
-
-     if number_samples is not None:
-         if isinstance(number_samples, tuple) and len(number_samples) == 2:
-             min_number, max_number = number_samples
-             cons = cons[
-                 (cons["number_samples"] >= min_number)
-                 & (cons["number_samples"] <= max_number)
-             ]
-         else:
-             cons = cons[cons["number_samples"] >= number_samples]
-         l4 = len(cons)
-         self.logger.info(
-             f"Filtered {l3 - l4} entries based on number of samples. Remaining {l4} entries.",
-         )
-
-     self.logger.info(f"Filtering completed. {len(cons)} entries remaining.")
-
-     if inplace:
-         self.consensus_df = cons
-     else:
-         return cons
-
-
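Each criterion of filter_consensus() accepts either a scalar (interpreted as a minimum) or a (min, max) tuple, as the branches above show. Illustrative calls:

    study.filter_consensus(number_samples=5)    # keep features seen in >= 5 samples
    study.filter_consensus(quality=(0.3, 1.0))  # bounded quality window
    kept = study.filter_consensus(inplace=False, coherence=0.8)  # return a filtered copy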
- def filter_features(self):
-     # remove rows from features_df where feature_uid is not in consensus_mapping_df['feature_uid']
-     feature_uids_to_keep = self.consensus_mapping_df["feature_uid"].to_list()
-     self.features_df = self.features_df.filter(
-         pl.col("uid").is_in(feature_uids_to_keep),
-     )
-
-
- ## TODO is uid supposed to be a list? rt_tol 0?
- def integrate_chrom(self, **kwargs):
-     """
-     Given a consensus_id, integrate the intensity of all features in the consensus map.
-
-     Parameters:
-         **kwargs: Keyword arguments for integration parameters. Can include:
-             - An integrate_chrom_defaults instance to set all parameters at once
-             - Individual parameter names and values (see integrate_chrom_defaults for details)
-
-     Key Parameters:
-         uids: List of consensus UIDs to integrate (default: all consensus features).
-         rt_tol: RT tolerance for integration boundaries.
-     """
-     # parameters initialization
-     params = integrate_chrom_defaults()
-     for key, value in kwargs.items():
-         if isinstance(value, integrate_chrom_defaults):
-             params = value
-             self.logger.debug("Using provided integrate_chrom_defaults parameters")
-         else:
-             if hasattr(params, key):
-                 if params.set(key, value, validate=True):
-                     self.logger.debug(f"Updated parameter {key} = {value}")
-                 else:
-                     self.logger.warning(
-                         f"Failed to set parameter {key} = {value} (validation failed)",
-                     )
-             else:
-                 self.logger.debug(f"Unknown parameter {key} ignored")
-     # end of parameter initialization
-
-     # Store parameters in the Study object
-     self.store_history(["integrate_chrom"], params.to_dict())
-     self.logger.debug("Parameters stored to integrate_chrom")
-
-     # Get parameter values for use in the method
-     uids = params.get("uids")
-     rt_tol = params.get("rt_tol")
-
-     if self.consensus_map is None:
-         self.logger.error("No consensus map found.")
-         return
-     if uids is None:
-         # get all consensus_id from consensus_df
-         ids = self.consensus_df["consensus_uid"].to_list()
-     else:
-         # keep only id that are in consensus_df
-         ids = [i for i in uids if i in self.consensus_df["consensus_uid"].to_list()]
-
-     # Ensure chrom_area column is Float64 to avoid dtype conflicts
-     if "chrom_area" in self.features_df.columns:
-         self.features_df = self.features_df.with_columns(
-             pl.col("chrom_area").cast(pl.Float64, strict=False),
-         )
-
-     # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
-     # Use Polars join operation instead of pandas merge
-     consensus_subset = self.consensus_df.select([
-         "consensus_uid",
-         "rt_start_mean",
-         "rt_end_mean",
-     ])
-     df1 = self.consensus_mapping_df.join(
-         consensus_subset,
-         on="consensus_uid",
-         how="left",
-     )
-     df1 = df1.filter(pl.col("consensus_uid").is_in(ids))
-
-     # Build a fast lookup for feature_uid to row index in features_df
-     # Since Polars doesn't have index-based access like pandas, we'll use row position
-     feature_uid_to_row = {}
-     for i, row_dict in enumerate(self.features_df.iter_rows(named=True)):
-         if "feature_uid" in row_dict:
-             feature_uid_to_row[row_dict["feature_uid"]] = i
-         elif "uid" in row_dict:  # fallback column name
-             feature_uid_to_row[row_dict["uid"]] = i
-
-     # Prepare lists for batch update
-     update_rows = []
-     chroms: list = []
-     rt_starts: list[float] = []
-     rt_ends: list[float] = []
-     rt_deltas: list[float] = []
-     chrom_areas = []
-
-     self.logger.debug(f"Integrating {df1.shape[0]} features using consensus...")
-     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-     for row in tqdm(
-         df1.iter_rows(named=True),
-         total=df1.shape[0],
-         desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Integrate EICs by consensus",
-         disable=tdqm_disable,
-     ):
-         feature_uid = row["feature_uid"]
-         row_idx = feature_uid_to_row.get(feature_uid)
-         if row_idx is None:
-             continue
-
-         # Get the feature row from Polars DataFrame
-         feature_row = self.features_df.row(row_idx, named=True)
-         # get chromatogram for the feature
-         chrom = feature_row["chrom"]
-         if chrom is None or len(chrom) == 0:
-             update_rows.append(row_idx)
-             chroms.append(None)
-             rt_starts.append(None)
-             rt_ends.append(None)
-             rt_deltas.append(None)
-             chrom_areas.append(-1.0)
-             continue
-         ## TODO expose parameters
-         rt_start = _find_closest_valley(
-             chrom,
-             row["rt_start_mean"] - rt_tol,
-             dir="left",
-             threshold=0.9,
-         )
-         rt_end = _find_closest_valley(
-             chrom,
-             row["rt_end_mean"] + rt_tol,
-             dir="right",
-             threshold=0.9,
-         )
-         chrom.feature_start = rt_start
-         chrom.feature_end = rt_end
-         chrom.integrate()
-         update_rows.append(row_idx)
-         chroms.append(chrom)
-         rt_starts.append(rt_start)
-         rt_ends.append(rt_end)
-         rt_deltas.append(rt_end - rt_start)
-         chrom_areas.append(float(chrom.feature_area))
-
-     # Batch update DataFrame - Polars style
-     if update_rows:
-         # Create mapping from row index to new values
-         row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
-         row_to_rt_start = {
-             update_rows[i]: rt_starts[i] for i in range(len(update_rows))
-         }
-         row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
-         row_to_rt_delta = {
-             update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
-         }
-         row_to_chrom_area = {
-             update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
-             for i in range(len(update_rows))
-         }
-
-         # Use with_row_index to create a temporary row index column
-         df_with_index = self.features_df.with_row_index("__row_idx")
-
-         # Create update masks and values
-         update_mask = pl.col("__row_idx").is_in(update_rows)
-
-         # Update columns conditionally
-         try:
-             self.features_df = df_with_index.with_columns([
-                 # Update chrom column - use when() to update only specific rows
-                 pl.when(update_mask)
-                 .then(
-                     pl.col("__row_idx").map_elements(
-                         lambda x: row_to_chrom.get(x, None),
-                         return_dtype=pl.Object,
-                     ),
-                 )
-                 .otherwise(pl.col("chrom"))
-                 .alias("chrom"),
-                 # Update rt_start column
-                 pl.when(update_mask)
-                 .then(
-                     pl.col("__row_idx").map_elements(
-                         lambda x: row_to_rt_start.get(x, None),
-                         return_dtype=pl.Float64,
-                     ),
-                 )
-                 .otherwise(pl.col("rt_start"))
-                 .alias("rt_start"),
-                 # Update rt_end column
-                 pl.when(update_mask)
-                 .then(
-                     pl.col("__row_idx").map_elements(
-                         lambda x: row_to_rt_end.get(x, None),
-                         return_dtype=pl.Float64,
-                     ),
-                 )
-                 .otherwise(pl.col("rt_end"))
-                 .alias("rt_end"),
-                 # Update rt_delta column
-                 pl.when(update_mask)
-                 .then(
-                     pl.col("__row_idx").map_elements(
-                         lambda x: row_to_rt_delta.get(x, None),
-                         return_dtype=pl.Float64,
-                     ),
-                 )
-                 .otherwise(pl.col("rt_delta"))
-                 .alias("rt_delta"),
-                 # Update chrom_area column
-                 pl.when(update_mask)
-                 .then(
-                     pl.col("__row_idx").map_elements(
-                         lambda x: row_to_chrom_area.get(x, 0),
-                         return_dtype=pl.Float64,
-                     ),
-                 )
-                 .otherwise(pl.col("chrom_area"))
-                 .alias("chrom_area"),
-             ]).drop("__row_idx")  # Remove the temporary row index column
-
-             self.logger.debug(
-                 f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
-             )
-         except Exception as e:
-             self.logger.error(f"Failed to update features DataFrame: {e}")
-     else:
-         self.logger.debug("No features were updated during integration.")
-
-
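Integration can be restricted to a subset of consensus features via uids, and rt_tol widens the boundary search window handed to _find_closest_valley() below. A sketch with illustrative values:

    study.integrate_chrom(
        uids=[12, 57, 103],  # consensus_uid values to (re)integrate; default is all
        rt_tol=0.5,          # extra RT margin around the mean feature boundaries
    )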
- def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
-     # ensure rt and inty are float64
-     chrom.rt = chrom.rt.astype(np.float64)
-     chrom.inty = chrom.inty.astype(np.float64)
-     # find closest index to rt in chrom['rt']
-     idx = np.abs(chrom.rt - rt).argmin()
-     if dir == "left":
-         inty = np.inf
-         # iterate left from idx to the end of the peaks until we find a valley
-         for i in range(idx, 0, -1):
-             if chrom.inty[i] < inty * threshold:
-                 idx = i
-                 inty = chrom.inty[i]
-             else:
-                 break
-     if dir == "right":
-         inty = np.inf
-         # iterate right from idx to the end of the peaks until we find a valley
-         for i in range(idx, len(chrom.inty)):
-             if chrom.inty[i] < inty * threshold:
-                 idx = i
-                 inty = chrom.inty[i]
-             else:
-                 break
-     return chrom.rt[idx]
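The valley search above walks outward from the index nearest the requested RT and keeps stepping while intensity drops below threshold times the running minimum, stopping as soon as it recovers. A self-contained NumPy illustration of the same walk on synthetic data (not package code):

    import numpy as np

    rt = np.linspace(2.0, 8.0, 61)  # 0.1-wide RT grid

    def peak(mu):
        return np.exp(-0.5 * ((rt - mu) / 0.3) ** 2)

    inty = peak(4.0) + peak(6.0)  # two peaks with a valley near rt = 5

    def closest_valley(rt, inty, start_rt, direction="left", threshold=0.9):
        idx = int(np.abs(rt - start_rt).argmin())
        best = np.inf
        step = -1 if direction == "left" else 1
        stop = -1 if direction == "left" else len(inty)
        for i in range(idx, stop, step):
            if inty[i] < best * threshold:  # still descending fast enough: advance
                idx, best = i, inty[i]
            else:                           # intensity recovered: valley reached
                break
        return rt[idx]

    print(closest_valley(rt, inty, start_rt=5.5))  # -> 5.0, the valley floor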
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ import pyopenms as oms
8
+
9
+ from tqdm import tqdm
10
+
11
+ from masster.study.defaults import (
12
+ align_defaults,
13
+ find_ms2_defaults,
14
+ integrate_defaults,
15
+ merge_defaults,
16
+ )
17
+
18
+
19
+ def align(self, **kwargs):
20
+ """
21
+ Aligns feature maps using pose clustering and updates retention times in the features DataFrame.
22
+
23
+ Parameters:
24
+ **kwargs: Keyword arguments for alignment parameters. Can include:
25
+ - An align_defaults instance to set all parameters at once
26
+ - Individual parameter names and values (see align_defaults for details)
27
+
28
+ Key Parameters:
29
+ rt_max_diff (float): Maximum RT difference for alignment (default: 60.0).
30
+ mz_max_diff (float): Maximum m/z difference for alignment (default: 0.01).
31
+ rt_pair_distance_frac (float): RT pair distance fraction for superimposer (default: 0.2).
32
+ mz_pair_max_distance (float): Maximum m/z pair distance for superimposer (default: 0.01).
33
+ num_used_points (int): Number of points used for superimposer (default: 1000).
34
+ save_features (bool): Whether to save features after alignment (default: True).
35
+ skip_blanks (bool): Whether to skip blank samples during alignment (default: True).
36
+ """
37
+ # parameters initialization
38
+ params = align_defaults()
39
+ for key, value in kwargs.items():
40
+ if isinstance(value, align_defaults):
41
+ params = value
42
+ self.logger.debug("Using provided align_defaults parameters")
43
+ else:
44
+ if hasattr(params, key):
45
+ if params.set(key, value, validate=True):
46
+ self.logger.debug(f"Updated parameter {key} = {value}")
47
+ else:
48
+ self.logger.warning(
49
+ f"Failed to set parameter {key} = {value} (validation failed)",
50
+ )
51
+ else:
52
+ self.logger.debug(f"Unknown parameter {key} ignored")
53
+ # end of parameter initialization
54
+
55
+ # Store parameters in the Study object
56
+ self.store_history(["align"], params.to_dict())
57
+ self.logger.debug("Parameters stored to align")
58
+
59
+ if len(self.features_maps) < len(self.samples_df):
60
+ self.features_maps = []
61
+ self.load_features()
62
+
63
+ self.logger.debug("Starting alignment")
64
+
65
+ fmaps = self.features_maps
66
+ # set ref_index to feature map index with largest number of features
67
+ ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
68
+
69
+ self.logger.info(
70
+ f"Align on {self.samples_df.row(ref_index, named=True)['sample_name']}",
71
+ )
72
+
73
+ aligner = oms.MapAlignmentAlgorithmPoseClustering()
74
+
75
+ params_oms = oms.Param()
76
+ params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
77
+ params_oms.setValue("pairfinder:ignore_charge", "true")
78
+ params_oms.setValue("max_num_peaks_considered", 1000)
79
+ params_oms.setValue("pairfinder:distance_RT:max_difference", params.get("rt_max_diff"))
80
+ params_oms.setValue("pairfinder:distance_MZ:max_difference", params.get("mz_max_diff"))
81
+ params_oms.setValue("superimposer:rt_pair_distance_fraction", params.get("rt_pair_distance_frac"))
82
+ params_oms.setValue("superimposer:mz_pair_max_distance", params.get("mz_pair_max_distance"))
83
+ params_oms.setValue("superimposer:num_used_points", params.get("num_used_points"))
84
+ params_oms.setValue("pairfinder:distance_MZ:exponent", 3.0)
85
+ params_oms.setValue("pairfinder:distance_RT:exponent", 2.0)
86
+ aligner.setParameters(params_oms)
87
+ """
88
+ {b'max_num_peaks_considered': 1000,
89
+ b'superimposer:mz_pair_max_distance': 0.5,
90
+ b'superimposer:rt_pair_distance_fraction': 0.1,
91
+ b'superimposer:num_used_points': 2000,
92
+ b'superimposer:scaling_bucket_size': 0.005,
93
+ b'superimposer:shift_bucket_size': 3.0,
94
+ b'superimposer:max_shift': 1000.0,
95
+ b'superimposer:max_scaling': 2.0,
96
+ b'superimposer:dump_buckets': '',
97
+ b'superimposer:dump_pairs': '',
98
+ b'pairfinder:second_nearest_gap': 2.0,
99
+ b'pairfinder:use_identifications': 'false',
100
+ b'pairfinder:ignore_charge': 'false',
101
+ b'pairfinder:ignore_adduct': 'true',
102
+ b'pairfinder:distance_RT:max_difference': 100.0,
103
+ b'pairfinder:distance_RT:exponent': 1.0,
104
+ b'pairfinder:distance_RT:weight': 1.0,
105
+ b'pairfinder:distance_MZ:max_difference': 0.3,
106
+ b'pairfinder:distance_MZ:unit': 'Da',
107
+ b'pairfinder:distance_MZ:exponent': 2.0,
108
+ b'pairfinder:distance_MZ:weight': 1.0,
109
+ b'pairfinder:distance_intensity:exponent': 1.0,
110
+ b'pairfinder:distance_intensity:weight': 0.0,
111
+ b'pairfinder:distance_intensity:log_transform': 'disabled'} """
112
+
113
+ aligner.setReference(fmaps[ref_index])
114
+
115
+ self.logger.debug(f"Parameters for alignment: {params}")
116
+
117
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
118
+ # perform alignment and transformation of feature maps to the reference map (exclude reference map)
119
+ for index, fm in tqdm(
120
+ list(enumerate(fmaps)),
121
+ total=len(fmaps),
122
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Align feature maps",
123
+ disable=tdqm_disable,
124
+ ):
125
+ if index == ref_index:
126
+ continue
127
+ if params.get("skip_blanks") and self.samples_df.row(index, named=True)["sample_type"] == "blank":
128
+ continue
129
+ trafo = oms.TransformationDescription()
130
+ aligner.align(fm, trafo)
131
+ transformer = oms.MapAlignmentTransformer()
132
+ transformer.transformRetentionTimes(fm, trafo, True)
133
+
134
+ self.alignment_ref_index = ref_index
135
+
136
+ # check if rt_original exists in features_df, if not, add it after rt
137
+ if "rt_original" not in self.features_df.columns:
138
+ # add column 'rt_original' after 'rt'
139
+ rt_index = self.features_df.columns.get_loc("rt") + 1
140
+ self.features_df.insert(rt_index, "rt_original", 0)
141
+ self.features_df["rt_original"] = self.features_df["rt"]
142
+
143
+ # iterate through all feature_maps and add the transformed retention times to the features_df
144
+
145
+ # Build a fast lookup for (sample_uid, feature_uid) to index in features_df
146
+ feats = self.features_df
147
+
148
+ # Pre-build sample_uid lookup for faster access
149
+ self.logger.debug("Build sample_uid lookup for fast access...")
150
+ sample_uid_lookup = {
151
+ idx: row_dict["sample_uid"] for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
152
+ }
153
+
154
+ # Build the main lookup using feature_uid (not feature_id)
155
+ if "feature_id" in feats.columns:
156
+ # Create lookup mapping (sample_uid, feature_uid) to DataFrame index using Polars
157
+ # Since we need a pandas-style index lookup, we'll create a simple dict
158
+ sample_uids = feats.get_column("sample_uid").to_list()
159
+
160
+ # Handle feature_id column - it might be Object type due to conversion
161
+ feature_id_col = feats.get_column("feature_id")
162
+ if feature_id_col.dtype == pl.Object:
163
+ # If it's Object type, convert to list and let Python handle the conversion
164
+ feature_ids = feature_id_col.to_list()
165
+ # Convert to strings if they're not already
166
+ feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
167
+ else:
168
+ # Safe to cast normally
169
+ feature_ids = feature_id_col.cast(pl.Utf8).to_list()
170
+
171
+ lookup = {
172
+ (sample_uid, feature_id): idx
173
+ for idx, (sample_uid, feature_id) in enumerate(
174
+ zip(sample_uids, feature_ids, strict=True),
175
+ )
176
+ }
177
+ else:
178
+ # fallback: skip if feature_uid column missing
179
+ lookup = {}
180
+ self.logger.warning("feature_id column not found in features_df")
181
+
182
+ # Pre-allocate update lists for better performance
183
+ all_update_idx = []
184
+ all_update_rt = []
185
+ all_update_rt_original = []
186
+
187
+ tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
188
+
189
+ for index, fm in tqdm(
190
+ list(enumerate(fmaps)),
191
+ total=len(fmaps),
192
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract RTs",
193
+ disable=tdqm_disable,
194
+ ):
195
+ sample_uid = sample_uid_lookup.get(index)
196
+ if sample_uid is None:
197
+ continue
198
+
199
+ # Collect all updates for this feature map
200
+ for f in fm:
201
+ feature_uid = str(f.getUniqueId())
202
+ idx = lookup.get((sample_uid, feature_uid))
203
+ if idx is not None:
204
+ rt = round(f.getRT(), 3)
205
+ # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
206
+ all_update_idx.append(idx)
207
+ all_update_rt.append(rt)
208
+ # all_update_rt_original.append(rt_or)
209
+
210
+ # Single batch update for all features at once
211
+ if all_update_idx:
212
+ # Update "rt" column for specified indices using Polars
213
+ self.features_df = self.features_df.with_columns(
214
+ pl.when(pl.int_range(0, self.features_df.height).is_in(all_update_idx))
215
+ .then(pl.Series("rt", all_update_rt))
216
+ .otherwise(pl.col("rt"))
217
+ .alias("rt"),
218
+ )
219
+ # self.features_df.loc[all_update_idx, "rt_original"] = all_update_rt_original
220
+
221
+ self.logger.debug("Alignment completed successfully.")
222
+
223
+ if params.get("save_features"):
224
+ self.save_samples()
225
+
226
+
227
+ def merge(self, **kwargs):
228
+ """
229
+ Groups features across samples into consensus features using the specified algorithm.
230
+
231
+ Parameters:
232
+ **kwargs: Keyword arguments for consensus parameters. Can include:
233
+ - A merge_defaults instance to set all parameters at once
234
+ - Individual parameter names and values (see merge_defaults for details)
235
+
236
+ Key Parameters:
237
+ algorithm (str): Feature grouping algorithm ('kd', 'unlabeled', 'sequential', or default 'qt').
238
+ min_samples (int): Minimum number of samples for a consensus feature.
239
+ link_ms2 (bool): Whether to link MS2 spectra to consensus features.
240
+ mz_tol (float): m/z tolerance for grouping (default: 0.01).
241
+ rt_tol (float): RT tolerance for grouping (default: 1.0).
242
+ """
243
+ # Reset consensus-related DataFrames at the start
244
+ self.consensus_df = pl.DataFrame()
245
+ self.consensus_ms2 = pl.DataFrame()
246
+ self.consensus_mapping_df = pl.DataFrame()
247
+
248
+ self.logger.info('Merging...')
249
+ # parameters initialization
250
+ params = merge_defaults()
251
+ for key, value in kwargs.items():
252
+ if isinstance(value, merge_defaults):
253
+ params = value
254
+ self.logger.debug("Using provided merge_defaults parameters")
255
+ else:
256
+ if hasattr(params, key):
257
+ if params.set(key, value, validate=True):
258
+ self.logger.debug(f"Updated parameter {key} = {value}")
259
+ else:
260
+ self.logger.warning(
261
+ f"Failed to set parameter {key} = {value} (validation failed)",
262
+ )
263
+ else:
264
+ self.logger.debug(f"Unknown parameter {key} ignored")
265
+ # end of parameter initialization
266
+
267
+ # Store parameters in the Study object
268
+ self.store_history(["merge"], params.to_dict())
269
+ self.logger.debug("Parameters stored to merge")
270
+
271
+ # Get parameter values for use in the method
272
+ algorithm = params.get("algorithm")
273
+ min_samples = params.get("min_samples")
274
+ link_ms2 = params.get("link_ms2")
275
+ mz_tol = kwargs.get("mz_tol", 0.01) # Default values for parameters not in defaults class
276
+ rt_tol = kwargs.get("rt_tol", 1.0)
277
+
278
+ if len(self.samples_df) > 200 and algorithm == "qt":
279
+ self.logger.warning(
280
+ "Using QT for large datasets is NOT recommended [O(n²)], consider using KDTree instead [O(n log n)].",
281
+ )
282
+
283
+ # check that features_maps is not empty
284
+ if not self.features_maps or len(self.features_maps) == 0:
285
+ self.load_features()
286
+ params_oms = oms.Param()
287
+ ## TODO expose these
288
+
289
+ feature_grouper: object # Use generic type for different OpenMS algorithms
290
+ match algorithm.lower():
291
+ case "kd":
292
+ feature_grouper = oms.FeatureGroupingAlgorithmKD()
293
+ self.logger.debug("Merging features with KDTree...")
294
+ params_oms.setValue("mz_unit", "Da")
295
+ params_oms.setValue("nr_partitions", len(self.samples_df))
296
+
297
+ params_oms.setValue("warp:enabled", "true")
298
+ params_oms.setValue("warp:rt_tol", rt_tol)
299
+ params_oms.setValue("warp:mz_tol", mz_tol)
300
+
301
+ params_oms.setValue("link:rt_tol", rt_tol)
302
+ params_oms.setValue("link:mz_tol", mz_tol)
303
+ case "unlabeled":
304
+ feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
305
+ self.logger.debug("Merging features with Unlabelled algorithm...")
306
+ params_oms.setValue("second_nearest_gap", 2.0)
307
+ params_oms.setValue("ignore_charge", "true")
308
+ params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
309
+ params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
310
+ params_oms.setValue("distance_MZ:unit", "Da")
311
+ case "sequential":
312
+ self.logger.debug(
313
+ "Merging features sequentially with Unlabelled algorithm...",
314
+ )
315
+ params_oms.setValue("second_nearest_gap", 2.0)
316
+ params_oms.setValue("ignore_charge", "true")
317
+ params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
318
+ params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
319
+ params_oms.setValue("distance_MZ:unit", "Da")
320
+ case "qt":
321
+ feature_grouper = oms.FeatureGroupingAlgorithmQT()
322
+ self.logger.debug("Grouping features with QT...")
323
+ params_oms.setValue("nr_partitions", len(self.samples_df))
324
+ params_oms.setValue("ignore_charge", "true")
325
+ params_oms.setValue("distance_RT:max_difference", rt_tol * 3)
326
+ params_oms.setValue("distance_MZ:max_difference", mz_tol * 3)
327
+ params_oms.setValue("distance_MZ:unit", "Da")
328
+ self.logger.debug(f"Parameters for feature grouping: {params_oms}")
329
+ consensus_map = oms.ConsensusMap()
330
+ file_descriptions = consensus_map.getColumnHeaders() # type: ignore
331
+ feature_maps = self.features_maps
332
+ for i, feature_map in enumerate(feature_maps):
333
+ file_description = file_descriptions.get(i, oms.ColumnHeader())
334
+ file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
335
+ file_description.size = feature_map.size()
336
+ file_description.unique_id = feature_map.getUniqueId()
337
+ file_descriptions[i] = file_description
338
+
339
+ consensus_map.setColumnHeaders(file_descriptions) # type: ignore
340
+
341
+ # group the feature maps into the consensus map, either sequentially or in one batch
342
+ match algorithm.lower():
343
+ case "sequential":
344
+ # set the reference map to self.alignment_ref_index
345
+ if self.alignment_ref_index is None:
346
+ # pick the feature map with the most features as reference
347
+ self.alignment_ref_index = max(
348
+ range(len(self.features_maps)),
349
+ key=lambda i: self.features_maps[i].size(),
350
+ )
351
+ feature_grouper = oms.FeatureGroupingAlgorithmUnlabeled()
352
+ feature_grouper.setParameters(params_oms)
353
+ feature_grouper.setReference(
354
+ self.alignment_ref_index,
355
+ self.features_maps[self.alignment_ref_index],
356
+ )
357
+ self.logger.info(
358
+ f"Using feature map {self.samples_df.row(self.alignment_ref_index, named=True)['sample_name']} as reference.",
359
+ )
360
+
361
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
362
+ for i, feature_map in tqdm(
363
+ enumerate(self.features_maps),
364
+ total=len(self.features_maps),
365
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add samples",
366
+ disable=tqdm_disable,
367
+ ):
368
+ if i == self.alignment_ref_index:
369
+ continue
370
+ feature_grouper.addToGroup(i, feature_map)
371
+ self.logger.debug("Grouping features.")
372
+ consensus_map = feature_grouper.getResultMap()
373
+ if hasattr(consensus_map, "setUniqueIds"):
374
+ consensus_map.setUniqueIds()
375
+ case _:
376
+ feature_grouper.setParameters(params_oms) # type: ignore
377
+ # add all feature maps and group in one batch
378
+ self.logger.debug("Grouping features in one batch...")
379
+ feature_grouper.group(feature_maps, consensus_map) # type: ignore
380
+ if hasattr(consensus_map, "setUniqueIds"):
381
+ consensus_map.setUniqueIds()
382
+
383
+ # create a dict mapping the OpenMS feature_id to the masster feature_uid using self.features_df
384
+ feature_uid_map = {row["feature_id"]: row["feature_uid"] for row in self.features_df.iter_rows(named=True)}
385
+ imax = consensus_map.size()
386
+
387
+ # Pre-build fast lookup tables for features_df data
388
+ features_lookup = {}
389
+ feature_columns = [
390
+ "rt",
391
+ "mz",
392
+ "rt_start",
393
+ "rt_end",
394
+ "rt_delta",
395
+ "mz_start",
396
+ "mz_end",
397
+ "inty",
398
+ "chrom_coherence",
399
+ "chrom_prominence",
400
+ "chrom_prominence_scaled",
401
+ "chrom_height_scaled",
402
+ "iso",
403
+ "charge",
404
+ "ms2_scans",
405
+ "adduct",
406
+ "adduct_mass",
407
+ ]
408
+
409
+ for row in self.features_df.iter_rows(named=True):
410
+ feature_uid = row["feature_uid"]
411
+ features_lookup[feature_uid] = {col: row[col] for col in feature_columns if col in self.features_df.columns}
412
+
413
+ # create a list to store the consensus mapping
414
+ consensus_mapping = []
415
+ metadata_list = []
416
+
417
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
418
+
419
+ for i, feature in enumerate(
420
+ tqdm(
421
+ consensus_map,
422
+ total=imax,
423
+ disable=tqdm_disable,
424
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
425
+ ),
426
+ ):
427
+ # get all per-sample features that make up this consensus feature
428
+ features_list = feature.getFeatureList()
429
+ uids = []
430
+ feature_data_list = []
431
+
432
+ for _j, f in enumerate(features_list):
433
+ fuid = str(f.getUniqueId())
434
+ if fuid not in feature_uid_map:
435
+ # this is a feature that was removed but is still in the feature maps
436
+ continue
437
+ fuid = feature_uid_map[fuid]
438
+ consensus_mapping.append({
439
+ "consensus_uid": i,
440
+ "sample_uid": f.getMapIndex() + 1,
441
+ "feature_uid": fuid,
442
+ })
443
+ uids.append(fuid)
444
+
445
+ # Get feature data from lookup instead of DataFrame filtering
446
+ feature_data = features_lookup.get(fuid)
447
+ if feature_data:
448
+ feature_data_list.append(feature_data)
449
+
450
+ if not feature_data_list:
451
+ # Skip this consensus feature if no valid features found
452
+ continue
453
+
454
+ # Compute statistics using vectorized operations on collected data
455
+ # Convert to numpy arrays for faster computation
456
+ rt_values = np.array([fd.get("rt", 0) for fd in feature_data_list if fd.get("rt") is not None])
457
+ mz_values = np.array([fd.get("mz", 0) for fd in feature_data_list if fd.get("mz") is not None])
458
+ rt_start_values = np.array([
459
+ fd.get("rt_start", 0) for fd in feature_data_list if fd.get("rt_start") is not None
460
+ ])
461
+ rt_end_values = np.array([fd.get("rt_end", 0) for fd in feature_data_list if fd.get("rt_end") is not None])
462
+ rt_delta_values = np.array([
463
+ fd.get("rt_delta", 0) for fd in feature_data_list if fd.get("rt_delta") is not None
464
+ ])
465
+ mz_start_values = np.array([
466
+ fd.get("mz_start", 0) for fd in feature_data_list if fd.get("mz_start") is not None
467
+ ])
468
+ mz_end_values = np.array([fd.get("mz_end", 0) for fd in feature_data_list if fd.get("mz_end") is not None])
469
+ inty_values = np.array([fd.get("inty", 0) for fd in feature_data_list if fd.get("inty") is not None])
470
+ coherence_values = np.array([
471
+ fd.get("chrom_coherence", 0) for fd in feature_data_list if fd.get("chrom_coherence") is not None
472
+ ])
473
+ prominence_values = np.array([
474
+ fd.get("chrom_prominence", 0) for fd in feature_data_list if fd.get("chrom_prominence") is not None
475
+ ])
476
+ prominence_scaled_values = np.array([
477
+ fd.get("chrom_prominence_scaled", 0)
478
+ for fd in feature_data_list
479
+ if fd.get("chrom_prominence_scaled") is not None
480
+ ])
481
+ height_scaled_values = np.array([
482
+ fd.get("chrom_height_scaled", 0) for fd in feature_data_list if fd.get("chrom_height_scaled") is not None
483
+ ])
484
+ iso_values = np.array([fd.get("iso", 0) for fd in feature_data_list if fd.get("iso") is not None])
485
+ charge_values = np.array([fd.get("charge", 0) for fd in feature_data_list if fd.get("charge") is not None])
486
+
487
+ # adduct_values
488
+ # Collect all adducts from feature_data_list to create consensus adduct information
489
+ all_adducts = []
490
+ adduct_masses = {}
491
+
492
+ for fd in feature_data_list:
493
+ # Get individual adduct and mass from each feature data (fd)
494
+ adduct = fd.get("adduct")
495
+ adduct_mass = fd.get("adduct_mass")
496
+
497
+ if adduct is not None:
498
+ all_adducts.append(adduct)
499
+ if adduct_mass is not None:
500
+ adduct_masses[adduct] = adduct_mass
501
+
502
+ # Calculate adduct_values for the consensus feature
503
+ adduct_values = []
504
+ if all_adducts:
505
+ adduct_counts = {adduct: all_adducts.count(adduct) for adduct in set(all_adducts)}
506
+ total_count = sum(adduct_counts.values())
507
+ for adduct, count in adduct_counts.items():
508
+ percentage = (count / total_count) * 100 if total_count > 0 else 0
509
+ mass = adduct_masses.get(adduct, None)
510
+ # Store as dict instead of tuple to avoid type confusion
511
+ adduct_values.append({
512
+ "adduct": str(adduct),
513
+ "count": int(count),
514
+ "percentage": float(round(percentage, 2)),
515
+ "mass": float(mass) if mass is not None else None
516
+ })
517
+
518
+ # Sort adduct_values by count in descending order
519
+ adduct_values.sort(key=lambda x: x["count"], reverse=True) # type: ignore[arg-type,return-value]
520
+ # Store adduct_values for use in metadata
521
+ consensus_adduct_values = adduct_values
522
+
523
+ # Calculate number of MS2 spectra
524
+ ms2_count = 0
525
+ for fd in feature_data_list:
526
+ ms2_scans = fd.get("ms2_scans")
527
+ if ms2_scans is not None:
528
+ ms2_count += len(ms2_scans)
529
+
530
+ metadata_list.append({
531
+ "consensus_uid": int(i), # "consensus_id": i,
532
+ "consensus_id": str(feature.getUniqueId()),
533
+ "quality": round(float(feature.getQuality()), 3),
534
+ "number_samples": len(feature_data_list),
535
+ # "number_ext": int(len(features_list)),
536
+ "rt": round(float(np.mean(rt_values)), 4) if len(rt_values) > 0 else 0.0,
537
+ "mz": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
538
+ "rt_min": round(float(np.min(rt_values)), 3) if len(rt_values) > 0 else 0.0,
539
+ "rt_max": round(float(np.max(rt_values)), 3) if len(rt_values) > 0 else 0.0,
540
+ "rt_mean": round(float(np.mean(rt_values)), 3) if len(rt_values) > 0 else 0.0,
541
+ "rt_start_mean": round(float(np.mean(rt_start_values)), 3) if len(rt_start_values) > 0 else 0.0,
542
+ "rt_end_mean": round(float(np.mean(rt_end_values)), 3) if len(rt_end_values) > 0 else 0.0,
543
+ "rt_delta_mean": round(float(np.ptp(rt_delta_values)), 3) if len(rt_delta_values) > 0 else 0.0,
544
+ "mz_min": round(float(np.min(mz_values)), 4) if len(mz_values) > 0 else 0.0,
545
+ "mz_max": round(float(np.max(mz_values)), 4) if len(mz_values) > 0 else 0.0,
546
+ "mz_mean": round(float(np.mean(mz_values)), 4) if len(mz_values) > 0 else 0.0,
547
+ "mz_start_mean": round(float(np.mean(mz_start_values)), 4) if len(mz_start_values) > 0 else 0.0,
548
+ "mz_end_mean": round(float(np.mean(mz_end_values)), 4) if len(mz_end_values) > 0 else 0.0,
549
+ "inty_mean": round(float(np.mean(inty_values)), 0) if len(inty_values) > 0 else 0.0,
550
+ "bl": -1.0,
551
+ "chrom_coherence_mean": round(float(np.mean(coherence_values)), 3) if len(coherence_values) > 0 else 0.0,
552
+ "chrom_prominence_mean": round(float(np.mean(prominence_values)), 0) if len(prominence_values) > 0 else 0.0,
553
+ "chrom_prominence_scaled_mean": round(
554
+ float(np.mean(prominence_scaled_values)),
555
+ 3,
556
+ )
557
+ if len(prominence_scaled_values) > 0
558
+ else 0.0,
559
+ "chrom_height_scaled_mean": round(float(np.mean(height_scaled_values)), 3)
560
+ if len(height_scaled_values) > 0
561
+ else 0.0,
562
+ "iso_mean": round(float(np.mean(iso_values)), 2) if len(iso_values) > 0 else 0.0,
563
+ "charge_mean": round(float(np.mean(charge_values)), 2) if len(charge_values) > 0 else 0.0,
564
+ "number_ms2": int(ms2_count),
565
+ "adducts": consensus_adduct_values if consensus_adduct_values else [], # Ensure it's always a list
566
+ })
567
+
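For illustration, the resulting `adducts` cell of a single consensus feature is a count-sorted list of dicts shaped like the following (values are hypothetical):

    adducts = [
        {"adduct": "[M+H]+", "count": 8, "percentage": 80.0, "mass": 1.00728},
        {"adduct": "[M+Na]+", "count": 2, "percentage": 20.0, "mass": 22.98922},
    ]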
568
+ consensus_mapping_df = pl.DataFrame(consensus_mapping)
569
+ # remove all rows in consensus_mapping_df where feature_uid is not in self.features_df['feature_uid']
570
+ l1 = len(consensus_mapping_df)
571
+ consensus_mapping_df = consensus_mapping_df.filter(
572
+ pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
573
+ )
574
+ self.logger.debug(
575
+ f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
576
+ )
577
+ self.consensus_mapping_df = consensus_mapping_df
578
+ self.consensus_df = pl.DataFrame(metadata_list, strict=False)
579
+
580
+ if min_samples is None:
581
+ min_samples = 1
582
+ if min_samples < 1:
583
+ min_samples = int(min_samples * len(self.samples_df))
584
+ # filter out consensus features present in fewer than min_samples samples
585
+ l1 = len(self.consensus_df)
586
+ self.consensus_df = self.consensus_df.filter(
587
+ pl.col("number_samples") >= min_samples,
588
+ )
589
+ self.logger.debug(
590
+ f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
591
+ )
592
+ # keep only mapping rows whose consensus feature survived the min_samples filter
593
+ self.consensus_mapping_df = self.consensus_mapping_df.filter(
594
+ pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
595
+ )
596
+
597
+ self.consensus_map = consensus_map
598
+ # calculate the completeness of the consensus map
599
+ c = len(self.consensus_mapping_df) / len(self.consensus_df) / len(self.samples_df)
600
+ self.logger.info(
601
+ f"Merging completed. Consensus features: {len(self.consensus_df)}. Completeness: {c:.2f}.",
602
+ )
603
+ if link_ms2:
604
+ self.find_ms2()
605
+
606
+
607
+ # Backward compatibility alias
608
+ find_consensus = merge
609
+
610
+
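A minimal usage sketch for the merge step (values are hypothetical; `study` is assumed to be a Study whose samples already have detected features):

    # group per-sample features into consensus features with the KD-tree linker
    study.merge(algorithm="kd", mz_tol=0.01, rt_tol=1.0, min_samples=2, link_ms2=True)

    # consensus_df holds one row per consensus feature; consensus_mapping_df maps
    # each consensus_uid to its contributing (sample_uid, feature_uid) pairs
    print(study.consensus_df.shape, study.consensus_mapping_df.shape)

For large studies (above roughly 200 samples) "kd" scales as O(n log n), whereas "qt" is O(n²), matching the warning emitted above.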
611
+ def find_ms2(self, **kwargs):
612
+ """
613
+ Links MS2 spectra to consensus features and stores the result in self.consensus_ms2.
614
+
615
+ Parameters:
616
+ **kwargs: Keyword arguments for MS2 linking parameters. Can include:
617
+ - A find_ms2_defaults instance to set all parameters at once
618
+ - Individual parameter names and values (see find_ms2_defaults for details)
619
+ """
620
+ # Reset consensus_ms2 DataFrame at the start
621
+ self.consensus_ms2 = pl.DataFrame()
622
+
623
+ # parameters initialization
624
+ params = find_ms2_defaults()
625
+ for key, value in kwargs.items():
626
+ if isinstance(value, find_ms2_defaults):
627
+ params = value
628
+ self.logger.debug("Using provided find_ms2_defaults parameters")
629
+ else:
630
+ if hasattr(params, key):
631
+ if params.set(key, value, validate=True):
632
+ self.logger.debug(f"Updated parameter {key} = {value}")
633
+ else:
634
+ self.logger.warning(
635
+ f"Failed to set parameter {key} = {value} (validation failed)",
636
+ )
637
+ else:
638
+ self.logger.debug(f"Unknown parameter {key} ignored")
639
+ # end of parameter initialization
640
+
641
+ # Store parameters in the Study object
642
+ self.store_history(["find_ms2"], params.to_dict())
643
+ self.logger.debug("Parameters stored to find_ms2")
644
+
645
+ data = []
646
+ if self.consensus_mapping_df.is_empty():
647
+ self.logger.error(
648
+ "No consensus mapping found. Please run merge() first.",
649
+ )
650
+ return
651
+ self.logger.info("Linking MS2 spectra to consensus features...")
652
+
653
+ # Build fast lookup for feature_uid to features_df row data
654
+ feats = self.features_df
655
+ feature_lookup = {}
656
+ relevant_cols = [
657
+ "ms2_specs",
658
+ "ms2_scans",
659
+ "inty",
660
+ "chrom_coherence",
661
+ "chrom_prominence_scaled",
662
+ ]
663
+ for row in feats.iter_rows(named=True):
664
+ feature_uid = row["feature_uid"]
665
+ feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
666
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
667
+
668
+ # Process consensus mapping in batch
669
+ for mapping_row in tqdm(
670
+ self.consensus_mapping_df.iter_rows(named=True),
671
+ total=self.consensus_mapping_df.shape[0],
672
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}MS2 spectra",
673
+ disable=tqdm_disable,
674
+ ):
675
+ feature_uid = mapping_row["feature_uid"]
676
+ feature_data = feature_lookup.get(feature_uid)
677
+ if feature_data is None or feature_data.get("ms2_specs") is None:
678
+ continue
679
+ ms2_specs = feature_data["ms2_specs"]
680
+ ms2_scans = feature_data["ms2_scans"]
681
+ inty = feature_data.get("inty")
682
+ chrom_coherence = feature_data.get("chrom_coherence")
683
+ chrom_prominence_scaled = feature_data.get("chrom_prominence_scaled")
684
+ for j in range(len(ms2_specs)):
685
+ spec = ms2_specs[j]
686
+ scanid = ms2_scans[j]
687
+ data.append({
688
+ "consensus_uid": int(mapping_row["consensus_uid"]),
689
+ "feature_uid": int(mapping_row["feature_uid"]),
690
+ "sample_uid": int(mapping_row["sample_uid"]),
691
+ "scan_id": int(scanid),
692
+ "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
693
+ "prec_inty": round(inty, 0) if inty is not None else None,
694
+ "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
695
+ "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
696
+ if chrom_prominence_scaled is not None
697
+ else None,
698
+ "number_frags": len(spec.mz),
699
+ "spec": spec,
700
+ })
701
+ self.consensus_ms2 = pl.DataFrame(data)
702
+ if not self.consensus_ms2.is_empty():
703
+ unique_consensus_features = self.consensus_ms2["consensus_uid"].n_unique()
704
+ else:
705
+ unique_consensus_features = 0
706
+ self.logger.info(
707
+ f"Linking completed. {len(self.consensus_ms2)} MS2 spectra associated to {unique_consensus_features} consensus features.",
708
+ )
709
+
710
+
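Each row of `consensus_ms2` links one MS2 spectrum to a consensus feature. A small inspection sketch using the columns built above (thresholds are hypothetical):

    # keep well-fragmented spectra from coherent precursors
    good_ms2 = study.consensus_ms2.filter(
        (pl.col("number_frags") >= 5) & (pl.col("prec_coherence") > 0.8),
    )
    print(good_ms2.select(["consensus_uid", "scan_id", "energy"]).head())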
711
+ ## TODO these are not modelled the same way as other ranges, harmonize for tuples
712
+ def filter_consensus(
713
+ self,
714
+ inplace=True,
715
+ number_samples=None,
716
+ quality=None,
717
+ coherence=None,
718
+ ):
719
+ if self.consensus_df is None:
720
+ self.logger.error("No consensus found.")
721
+ return
722
+ cons = self.consensus_df if inplace else self.consensus_df.clone()
723
+ l = l2 = l3 = len(cons)  # pre-seed counts so skipped filters don't break the log messages
724
+ self.logger.info(f"Filtering consensus features with {l} entries...")
725
+ if coherence is not None:
726
+ if "chrom_coherence" not in cons.columns:
727
+ self.logger.warning("No coherence data found in features.")
728
+ else:
729
+ if isinstance(coherence, tuple) and len(coherence) == 2:
730
+ min_coherence, max_coherence = coherence
731
+ cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
732
+ else:
733
+ cons = cons[cons["chrom_coherence"] >= coherence]
734
+ l2 = len(cons)
735
+ self.logger.info(
736
+ f"Filtered {l - l2} entries based on coherence. Remaining {l2} entries.",
737
+ )
738
+
739
+ if quality is not None:
740
+ if isinstance(quality, tuple) and len(quality) == 2:
741
+ min_quality, max_quality = quality
742
+ cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
743
+ else:
744
+ cons = cons[cons["quality"] >= quality]
745
+ l3 = len(cons)
746
+ self.logger.info(
747
+ f"Filtered {l2 - l3} entries based on quality. Remaining {l3} entries.",
748
+ )
749
+
750
+ if number_samples is not None:
751
+ if isinstance(number_samples, tuple) and len(number_samples) == 2:
752
+ min_number, max_number = number_samples
753
+ cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
754
+ else:
755
+ cons = cons[cons["number_samples"] >= number_samples]
756
+ l4 = len(cons)
757
+ self.logger.info(
758
+ f"Filtered {l3 - l4} entries based on number of samples. Remaining {l4} entries.",
759
+ )
760
+
761
+ self.logger.info(f"Filtering completed. {len(cons)} entries remaining.")
762
+
763
+ if inplace:
764
+ self.consensus_df = cons
765
+ else:
766
+ return cons
767
+
768
+
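filter_consensus accepts either a scalar lower bound or a (min, max) tuple per criterion. A usage sketch (thresholds are hypothetical):

    # keep consensus features seen in 3..10 samples with quality >= 0.5
    study.filter_consensus(number_samples=(3, 10), quality=0.5)

    # or obtain a filtered copy without modifying study.consensus_df
    subset = study.filter_consensus(inplace=False, quality=(0.5, 1.0))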
769
+ ## TODO is uid supposed to be a list? rt_tol 0?
770
+ def _integrate_chrom_impl(self, **kwargs):
771
+ """
772
+ Integrate the chromatograms of all features belonging to the selected consensus features.
773
+
774
+ Parameters:
775
+ **kwargs: Keyword arguments for integration parameters. Can include:
776
+ - An integrate_defaults instance to set all parameters at once
777
+ - Individual parameter names and values (see integrate_defaults for details)
778
+
779
+ Key Parameters:
780
+ uids: List of consensus UIDs to integrate (default: all consensus features).
781
+ rt_tol: RT tolerance for integration boundaries.
782
+ """
783
+ # parameters initialization
784
+ params = integrate_defaults()
785
+ for key, value in kwargs.items():
786
+ if isinstance(value, integrate_defaults):
787
+ params = value
788
+ self.logger.debug("Using provided integrate_chrom_defaults parameters")
789
+ else:
790
+ if hasattr(params, key):
791
+ if params.set(key, value, validate=True):
792
+ self.logger.debug(f"Updated parameter {key} = {value}")
793
+ else:
794
+ self.logger.warning(
795
+ f"Failed to set parameter {key} = {value} (validation failed)",
796
+ )
797
+ else:
798
+ self.logger.debug(f"Unknown parameter {key} ignored")
799
+ # end of parameter initialization
800
+
801
+ # Store parameters in the Study object
802
+ self.store_history(["integrate_chrom"], params.to_dict())
803
+ self.logger.debug("Parameters stored to integrate_chrom")
804
+
805
+ # Get parameter values for use in the method
806
+ uids = params.get("uids")
807
+ rt_tol = params.get("rt_tol")
808
+
809
+ if self.consensus_map is None:
810
+ self.logger.error("No consensus map found.")
811
+ return
812
+ if uids is None:
813
+ # get all consensus_id from consensus_df
814
+ ids = self.consensus_df["consensus_uid"].to_list()
815
+ else:
816
+ # keep only id that are in consensus_df
817
+ ids = [i for i in uids if i in self.consensus_df["consensus_uid"].to_list()]
818
+
819
+ # Ensure chrom_area column is Float64 to avoid dtype conflicts
820
+ if "chrom_area" in self.features_df.columns:
821
+ self.features_df = self.features_df.with_columns(
822
+ pl.col("chrom_area").cast(pl.Float64, strict=False),
823
+ )
824
+
825
+ # Merge consensus_mapping with consensus_df to get rt_start_mean and rt_end_mean
826
+ # Use Polars join operation instead of pandas merge
827
+ consensus_subset = self.consensus_df.select([
828
+ "consensus_uid",
829
+ "rt_start_mean",
830
+ "rt_end_mean",
831
+ ])
832
+ df1 = self.consensus_mapping_df.join(
833
+ consensus_subset,
834
+ on="consensus_uid",
835
+ how="left",
836
+ )
837
+ df1 = df1.filter(pl.col("consensus_uid").is_in(ids))
838
+
839
+ # Build a fast lookup for feature_uid to row index in features_df
840
+ # Since Polars doesn't have index-based access like pandas, we'll use row position
841
+ feature_uid_to_row = {}
842
+ for i, row_dict in enumerate(self.features_df.iter_rows(named=True)):
843
+ if "feature_uid" in row_dict:
844
+ feature_uid_to_row[row_dict["feature_uid"]] = i
845
+ elif "uid" in row_dict: # fallback column name
846
+ feature_uid_to_row[row_dict["uid"]] = i
847
+
848
+ # Prepare lists for batch update
849
+ update_rows = []
850
+ chroms: list = []
851
+ rt_starts: list[float] = []
852
+ rt_ends: list[float] = []
853
+ rt_deltas: list[float] = []
854
+ chrom_areas = []
855
+
856
+ self.logger.debug(f"Integrating {df1.shape[0]} features using consensus...")
857
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
858
+ for row in tqdm(
859
+ df1.iter_rows(named=True),
860
+ total=df1.shape[0],
861
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Integrate EICs by consensus",
862
+ disable=tqdm_disable,
863
+ ):
864
+ feature_uid = row["feature_uid"]
865
+ row_idx = feature_uid_to_row.get(feature_uid)
866
+ if row_idx is None:
867
+ continue
868
+
869
+ # Get the feature row from Polars DataFrame
870
+ feature_row = self.features_df.row(row_idx, named=True)
871
+ # get chromatogram for the feature
872
+ chrom = feature_row["chrom"]
873
+ if chrom is None or len(chrom) == 0:
874
+ update_rows.append(row_idx)
875
+ chroms.append(None)
876
+ rt_starts.append(None)
877
+ rt_ends.append(None)
878
+ rt_deltas.append(None)
879
+ chrom_areas.append(-1.0)
880
+ continue
881
+ ## TODO expose parameters
882
+ rt_start = _find_closest_valley(
883
+ chrom,
884
+ row["rt_start_mean"] - rt_tol,
885
+ dir="left",
886
+ threshold=0.9,
887
+ )
888
+ rt_end = _find_closest_valley(
889
+ chrom,
890
+ row["rt_end_mean"] + rt_tol,
891
+ dir="right",
892
+ threshold=0.9,
893
+ )
894
+ chrom.feature_start = rt_start
895
+ chrom.feature_end = rt_end
896
+ chrom.integrate()
897
+ update_rows.append(row_idx)
898
+ chroms.append(chrom)
899
+ rt_starts.append(rt_start)
900
+ rt_ends.append(rt_end)
901
+ rt_deltas.append(rt_end - rt_start)
902
+ chrom_areas.append(float(chrom.feature_area))
903
+
904
+ # Batch update DataFrame - Polars style
905
+ if update_rows:
906
+ # Create mapping from row index to new values
907
+ row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
908
+ row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
909
+ row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
910
+ row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
911
+ row_to_chrom_area = {
912
+ update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
913
+ for i in range(len(update_rows))
914
+ }
915
+
916
+ # Use with_row_index to create a temporary row index column
917
+ df_with_index = self.features_df.with_row_index("__row_idx")
918
+
919
+ # Create update masks and values
920
+ update_mask = pl.col("__row_idx").is_in(update_rows)
921
+
922
+ # Update columns conditionally
923
+ try:
924
+ self.features_df = df_with_index.with_columns([
925
+ # Update chrom column - use when() to update only specific rows
926
+ pl.when(update_mask)
927
+ .then(
928
+ pl.col("__row_idx").map_elements(
929
+ lambda x: row_to_chrom.get(x, None),
930
+ return_dtype=pl.Object,
931
+ ),
932
+ )
933
+ .otherwise(pl.col("chrom"))
934
+ .alias("chrom"),
935
+ # Update rt_start column
936
+ pl.when(update_mask)
937
+ .then(
938
+ pl.col("__row_idx").map_elements(
939
+ lambda x: row_to_rt_start.get(x, None),
940
+ return_dtype=pl.Float64,
941
+ ),
942
+ )
943
+ .otherwise(pl.col("rt_start"))
944
+ .alias("rt_start"),
945
+ # Update rt_end column
946
+ pl.when(update_mask)
947
+ .then(
948
+ pl.col("__row_idx").map_elements(
949
+ lambda x: row_to_rt_end.get(x, None),
950
+ return_dtype=pl.Float64,
951
+ ),
952
+ )
953
+ .otherwise(pl.col("rt_end"))
954
+ .alias("rt_end"),
955
+ # Update rt_delta column
956
+ pl.when(update_mask)
957
+ .then(
958
+ pl.col("__row_idx").map_elements(
959
+ lambda x: row_to_rt_delta.get(x, None),
960
+ return_dtype=pl.Float64,
961
+ ),
962
+ )
963
+ .otherwise(pl.col("rt_delta"))
964
+ .alias("rt_delta"),
965
+ # Update chrom_area column
966
+ pl.when(update_mask)
967
+ .then(
968
+ pl.col("__row_idx").map_elements(
969
+ lambda x: row_to_chrom_area.get(x, 0),
970
+ return_dtype=pl.Float64,
971
+ ),
972
+ )
973
+ .otherwise(pl.col("chrom_area"))
974
+ .alias("chrom_area"),
975
+ ]).drop("__row_idx") # Remove the temporary row index column
976
+
977
+ self.logger.debug(
978
+ f"Integration completed. Updated {len(update_rows)} features with chromatogram data.",
979
+ )
980
+ except Exception as e:
981
+ self.logger.error(f"Failed to update features DataFrame: {e}")
982
+ else:
983
+ self.logger.debug("No features were updated during integration.")
984
+
985
+
986
+ def integrate(self, **kwargs):
987
+ """
988
+ Integrate chromatograms across consensus features.
989
+
990
+ Parameters:
991
+ **kwargs: Keyword arguments for integration parameters. Can include:
992
+ - An integrate_defaults instance to set all parameters at once
993
+ - Individual parameter names and values (see integrate_defaults for details)
994
+
995
+ Key Parameters:
996
+ uids (Optional[list]): List of consensus UIDs to integrate (None for all).
997
+ rt_tol (float): RT tolerance for integration boundaries (default: 0.0).
998
+ """
999
+ # parameters initialization
1000
+ params = integrate_defaults()
1001
+ for key, value in kwargs.items():
1002
+ if isinstance(value, integrate_defaults):
1003
+ params = value
1004
+ self.logger.debug("Using provided integrate_defaults parameters")
1005
+ else:
1006
+ if hasattr(params, key):
1007
+ if params.set(key, value, validate=True):
1008
+ self.logger.debug(f"Updated parameter {key} = {value}")
1009
+ else:
1010
+ self.logger.warning(
1011
+ f"Failed to set parameter {key} = {value} (validation failed)",
1012
+ )
1013
+ else:
1014
+ self.logger.debug(f"Unknown parameter {key} ignored")
1015
+ # end of parameter initialization
1016
+
1017
+ # Store parameters in the Study object
1018
+ self.store_history(["integrate"], params.to_dict())
1019
+ self.logger.debug("Parameters stored to integrate")
1020
+
1021
+ # Call the original integrate_chrom function with extracted parameters
1022
+ return _integrate_chrom_impl(
1023
+ self,
1024
+ uids=params.get("uids"),
1025
+ rt_tol=params.get("rt_tol"),
1026
+ )
1027
+
1028
+
1029
+ # Backward compatibility alias
1030
+ integrate_chrom = integrate
1031
+
1032
+
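A usage sketch for re-integration (values are hypothetical; leaving uids as None processes every consensus feature):

    # re-integrate three consensus features, widening the RT window by 0.1
    study.integrate(uids=[12, 57, 103], rt_tol=0.1)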
1033
+ def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
1034
+ # ensure rt and inty are float64 before doing index math
1035
+ chrom.rt = chrom.rt.astype(np.float64)
1036
+ chrom.inty = chrom.inty.astype(np.float64)
1037
+ idx = np.abs(chrom.rt - rt).argmin()  # index of the point closest to the requested rt
1038
+ # walk from idx in the given direction while the intensity keeps decreasing (a valley)
1039
+ if dir == "left":
1040
+ inty = np.inf
1041
+ # iterate left from idx toward the start of the peak until we find a valley
1042
+ for i in range(idx, -1, -1):  # include index 0 so the first point can be the valley
1043
+ if chrom.inty[i] < inty * threshold:
1044
+ idx = i
1045
+ inty = chrom.inty[i]
1046
+ else:
1047
+ break
1048
+ if dir == "right":
1049
+ inty = np.inf
1050
+ # iterate right from idx to the end of the peak until we find a valley
1051
+ for i in range(idx, len(chrom.inty)):
1052
+ if chrom.inty[i] < inty * threshold:
1053
+ idx = i
1054
+ inty = chrom.inty[i]
1055
+ else:
1056
+ break
1057
+ return chrom.rt[idx]
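A self-contained sketch of the valley walk on a synthetic trace; SimpleNamespace stands in for masster's chromatogram object, which is an assumption for illustration only:

    from types import SimpleNamespace
    import numpy as np

    chrom = SimpleNamespace(
        rt=np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
        inty=np.array([5.0, 2.0, 10.0, 50.0, 9.0, 8.5]),
    )
    # start near rt=4 and walk right: 9.0 -> 8.5 is less than a 10% drop,
    # so the walk stops at once and rt=4.0 is returned
    print(_find_closest_valley(chrom, 4.0, dir="right", threshold=0.9))  # 4.0
    # walking left from rt=2.0 descends 10.0 -> 2.0 and returns rt=1.0
    print(_find_closest_valley(chrom, 2.0, dir="left", threshold=0.9))  # 1.0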